From cbfd6528ff286c10b8354fc9e43337c496f5f2b2 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 27 Sep 2023 10:21:25 -0400 Subject: [PATCH 001/198] . --- include/flexflow/batch_config.h | 3 +- include/flexflow/config.h | 9 +- include/flexflow/layer.h | 2 +- include/flexflow/model.h | 1 + include/flexflow/op_meta.h | 3 +- include/flexflow/operator.h | 10 +- include/flexflow/ops/experts.h | 2 +- include/flexflow/ops/kernels/linear_kernels.h | 21 ++++ include/flexflow/ops/linear.h | 9 ++ .../ops/tree_inc_multihead_self_attention.h | 2 +- include/flexflow/utils/memory_allocator.h | 5 + src/ops/arg_topk.cc | 2 +- src/ops/argmax.cc | 6 +- src/ops/beam_topk.cc | 2 +- src/ops/conv_2d.cc | 13 +- src/ops/element_binary.cc | 10 +- src/ops/experts.cc | 4 +- src/ops/experts.cpp | 2 +- src/ops/experts.cu | 8 +- src/ops/fused.cpp | 2 +- src/ops/fused.cu | 2 +- src/ops/inc_multihead_self_attention.cc | 46 +++---- src/ops/inc_multihead_self_attention.cpp | 10 +- src/ops/inc_multihead_self_attention.cu | 10 +- src/ops/kernels/linear_kernels.cpp | 97 +++++++++++++++ src/ops/kernels/linear_kernels.cu | 117 ++++++++++++++++++ src/ops/linear.cc | 107 ++++++++++++++-- src/ops/sampling.cc | 2 +- src/ops/spec_inc_multihead_self_attention.cpp | 4 +- src/ops/spec_inc_multihead_self_attention.cu | 4 +- src/ops/tree_inc_multihead_self_attention.cpp | 28 ++--- src/ops/tree_inc_multihead_self_attention.cu | 28 ++--- src/runtime/batch_config.cc | 15 ++- src/runtime/inference_manager.cc | 6 +- src/runtime/model.cc | 27 +++- 35 files changed, 500 insertions(+), 119 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index ce331d3e41..179e28c246 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -42,7 +42,8 @@ class BatchConfig { using TokenId = int; BatchConfig(); int num_active_requests() const; - int num_active_tokens() const; + int num_active_infr_tokens() const; + int num_active_peft_tokens() const; void print() const; virtual InferenceMode get_mode() const; static BatchConfig const *from_future(BatchConfigFuture const &future); diff --git a/include/flexflow/config.h b/include/flexflow/config.h index 9716060173..e670bd72fb 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -15,7 +15,7 @@ #ifndef _FLEXFLOW_CONFIG_H_ #define _FLEXFLOW_CONFIG_H_ -#include "ffconst.h" +#include "flexflow/ffconst.h" #include "legion.h" #include #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) @@ -64,6 +64,7 @@ constexpr ParameterSyncType CHOSEN_SYNC_TYPE = ParameterSyncType::PS; #endif class FFConfig; +class MemoryAllocator; struct FFHandler { #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) @@ -77,6 +78,11 @@ struct FFHandler { size_t workSpaceSize; void *offload_reserve_space; size_t offload_reserve_space_size; + // PEFT related fields + void *peft_activation_reserve_space; + size_t peft_activation_reserve_space_size; + MemoryAllocator* peft_activation_allocator; + // Quantization fields DataType quantization_type; bool allowTensorOpMathConversion; #ifdef FF_USE_NCCL @@ -87,6 +93,7 @@ struct FFHandler { struct FFInitInfo { size_t workSpaceSize; size_t offload_reserve_space_size; + size_t peft_activation_reserve_space_size; DataType quantization_type; bool allowTensorOpMathConversion; // int myRank, allRanks; diff --git a/include/flexflow/layer.h b/include/flexflow/layer.h index 0c1d7a6092..68d292dfe0 100644 --- a/include/flexflow/layer.h +++ b/include/flexflow/layer.h @@ -49,7 +49,7 @@ class Layer { Tensor outputs[MAX_NUM_OUTPUTS]; 
Tensor inputs[MAX_NUM_INPUTS]; Tensor weights[MAX_NUM_WEIGHTS]; - bool trainableInputs[MAX_NUM_INPUTS]; + //bool trainable_inputs[MAX_NUM_INPUTS]; int numInputs, numWeights, numOutputs; bool profiling; diff --git a/include/flexflow/model.h b/include/flexflow/model.h index f88f96cd5a..763610e4cf 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -113,6 +113,7 @@ enum TaskIDs { LINEAR_INIT_TASK_ID, LINEAR_INIT_PARA_TASK_ID, LINEAR_INF_TASK_ID, + LINEAR_PEFT_BWD_TASK_ID, LINEAR_FWD_TASK_ID, LINEAR_BWD_TASK_ID, LINEAR_BWD2_TASK_ID, diff --git a/include/flexflow/op_meta.h b/include/flexflow/op_meta.h index 512844db92..3299201f43 100644 --- a/include/flexflow/op_meta.h +++ b/include/flexflow/op_meta.h @@ -15,7 +15,8 @@ class OpMeta { public: FFHandler handle; bool profiling; // Measure the run time of the task - bool trainableInputs[MAX_NUM_INPUTS]; + bool trainable_inputs[MAX_NUM_INPUTS]; + bool reset_input_grads[MAX_NUM_INPUTS]; DataType input_type[MAX_NUM_INPUTS]; DataType weight_type[MAX_NUM_WEIGHTS]; DataType output_type[MAX_NUM_OUTPUTS]; diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 1b2fc7bbfc..cce92a6bd8 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -224,6 +224,13 @@ class Op { MachineView const *mv = nullptr) { assert(false); }; + virtual Legion::FutureMap peft_bwd(FFModel const&, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) { + assert(false); + } virtual void print_layer(FFModel const &model) = 0; virtual bool measure_operator_cost(Simulator *sim, MachineView const &mv, @@ -311,7 +318,8 @@ class Op { ParallelTensor outputs[MAX_NUM_OUTPUTS]; ParallelTensor inputs[MAX_NUM_INPUTS]; ParallelParameter weights[MAX_NUM_WEIGHTS]; - bool trainableInputs[MAX_NUM_INPUTS]; + bool trainable_inputs[MAX_NUM_INPUTS]; + bool reset_input_grads[MAX_NUM_INPUTS]; OpMeta *meta[MAX_NUM_WORKERS]; std::map inference_meta; int numInputs, numWeights, numOutputs; diff --git a/include/flexflow/ops/experts.h b/include/flexflow/ops/experts.h index d68957d890..f132003d30 100644 --- a/include/flexflow/ops/experts.h +++ b/include/flexflow/ops/experts.h @@ -138,7 +138,7 @@ class Experts : public Op { float *output, float const *weights, float const *biases, - int num_active_tokens, + int num_active_infr_tokens, int chosen_experts, int batch_size, int out_dim); diff --git a/include/flexflow/ops/kernels/linear_kernels.h b/include/flexflow/ops/kernels/linear_kernels.h index bbebe3c79b..8f32cb2e83 100644 --- a/include/flexflow/ops/kernels/linear_kernels.h +++ b/include/flexflow/ops/kernels/linear_kernels.h @@ -36,6 +36,8 @@ class LinearMeta : public OpMeta { bool use_bias, add_bias_only_once; char op_name[MAX_OPNAME]; Realm::RegionInstance reserveInst; + // PEFT related fields + void *output_activation_buffer; }; namespace Kernels { @@ -49,6 +51,14 @@ void forward_kernel_wrapper(LinearMeta const *m, int in_dim, int out_dim, int batch_size); +void peft_bwd_kernel_wrapper(LinearMeta const *m, + void *input_grad_ptr, + void *output_grad_ptr, + void const *kernel_ptr, + int in_dim, + int out_dim, + int num_infr_tokens, + int num_peft_tokens); void backward_kernel_wrapper(LinearMeta const *m, void const *input_ptr, void *input_grad_ptr, @@ -74,6 +84,16 @@ void forward_kernel(LinearMeta const *m, int batch_size, ffStream_t stream); template +void peft_bwd_kernel(LinearMeta const *m, + void *input_grad_ptr, + void *output_grad_ptr, + void const *kernel_ptr, + int in_dim, + int out_dim, 
+ int num_infr_tokens, + int num_peft_tokens, + ffStream_t stream); +template void backward_kernel(LinearMeta const *m, void const *input_ptr, void *input_grad_ptr, @@ -86,6 +106,7 @@ void backward_kernel(LinearMeta const *m, int out_dim, int batch_size, ffStream_t stream); + template __global__ void build_one_ptr(DT *one_ptr, int batch_size); } // namespace Internal diff --git a/include/flexflow/ops/linear.h b/include/flexflow/ops/linear.h index 025674c7ba..9b926bec6c 100644 --- a/include/flexflow/ops/linear.h +++ b/include/flexflow/ops/linear.h @@ -52,6 +52,11 @@ class Linear : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override; bool get_int_parameter(PMParameter, int *) const override; static Op * @@ -66,6 +71,10 @@ class Linear : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void forward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h index 6e2da19ce9..a6a801d0ad 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h @@ -144,7 +144,7 @@ class TreeIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { ~TreeIncMultiHeadSelfAttentionMeta(void); public: - int num_active_tokens; + int num_active_infr_tokens; Realm::RegionInstance committed_token_reserve_inst; TreeVerifyBatchConfig::CommittedTokensInfo *committed_token_infos; }; diff --git a/include/flexflow/utils/memory_allocator.h b/include/flexflow/utils/memory_allocator.h index 8e50a4c3b3..888d172a96 100644 --- a/include/flexflow/utils/memory_allocator.h +++ b/include/flexflow/utils/memory_allocator.h @@ -54,6 +54,11 @@ class MemoryAllocator { return static_cast
(ptr); } + inline void free_all() { + reserved_allocated_size = 0; + instance_allocated_size = 0; + } + public: Legion::Memory memory; void *reserved_ptr; diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc index b877a9f96d..5aa34884f1 100644 --- a/src/ops/arg_topk.cc +++ b/src/ops/arg_topk.cc @@ -315,7 +315,7 @@ InferenceResult GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); - int batch_size = bc->num_active_tokens(); + int batch_size = bc->num_active_infr_tokens(); ArgTopK::forward_kernel_wrapper(m, input, indices, batch_size); InferenceResult ir; diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index 7863931c82..e8e2bd7609 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -345,7 +345,7 @@ BeamInferenceResult m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); - int batch_size = bc->num_active_tokens(); + int batch_size = bc->num_active_infr_tokens(); GenericTensorAccessorW parent = helperGetGenericTensorAccessorWO( DT_INT32, regions[2], task->regions[2], FID_DATA, ctx, runtime); ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size); @@ -378,7 +378,7 @@ InferenceResult GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorW parent; - int batch_size = bc->num_active_tokens(); + int batch_size = bc->num_active_infr_tokens(); ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size); InferenceResult ir; download_tensor( @@ -429,4 +429,4 @@ size_t hash::operator()( hash_combine(key, params.beam_search); return key; } -}; // namespace std \ No newline at end of file +}; // namespace std diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc index 93a6de5a8f..331f5c0d3d 100644 --- a/src/ops/beam_topk.cc +++ b/src/ops/beam_topk.cc @@ -389,7 +389,7 @@ BeamInferenceResult // total token nums // size_t tokens_per_request = in1_domain.hi()[1] - in1_domain.lo()[1] + 1; // size_t batch_size = in1_domain.get_volume() / length; - size_t batch_size = bc.num_active_tokens(); + size_t batch_size = bc.num_active_infr_tokens(); // std::vector beam_width; // std::unordered_map sub_requests = bc->sub_requests; // for (int i = 0; i < bc->MAX_NUM_REQUESTS; i++) { diff --git a/src/ops/conv_2d.cc b/src/ops/conv_2d.cc index ce7b6ebc01..db2819e83c 100644 --- a/src/ops/conv_2d.cc +++ b/src/ops/conv_2d.cc @@ -592,7 +592,8 @@ OpMeta *Conv2D::init_task(Task const *task, m->relu = conv->activation == AC_MODE_RELU; m->use_bias = conv->use_bias; m->profiling = conv->profiling; - m->trainableInputs[0] = conv->trainableInputs[0]; + m->trainable_inputs[0] = conv->trainable_inputs[0]; + m->reset_input_grads[0] = conv->trainable_inputs[0]; std::strcpy(m->op_name, conv->name); int input_w = acc_input.rect.hi[0] - acc_input.rect.lo[0] + 1; @@ -751,7 +752,7 @@ void Conv2D::backward(FFModel const &ff) { inputs[0]->region)); launcher.add_field(rid++, FID_DATA); // regions[1](I/O): input_grad - if (trainableInputs[0]) { + if (trainable_inputs[0]) { launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, 0 /*projection id*/, READ_WRITE, @@ -801,7 +802,7 @@ void Conv2D::backward(FFModel const &ff) { /* region(I): input - region(I/O): input_grad (if trainableInputs[0]) + region(I/O): input_grad (if trainable_inputs[0]) region(I): output 
region(I/O): output_grad region(I): filter @@ -814,17 +815,17 @@ void Conv2D::backward_task(Task const *task, Runtime *runtime) { // Conv2D* conv = (Conv2D*) task->args; Conv2DMeta const *m = *((Conv2DMeta **)task->local_args); - assert(regions.size() == (5 + static_cast(m->trainableInputs[0]) + + assert(regions.size() == (5 + static_cast(m->trainable_inputs[0]) + static_cast(m->use_bias))); assert(task->regions.size() == - (5 + static_cast(m->trainableInputs[0]) + + (5 + static_cast(m->trainable_inputs[0]) + static_cast(m->use_bias))); size_t rid = 0; TensorAccessorR acc_input( regions[rid], task->regions[rid], FID_DATA, ctx, runtime); rid++; float *acc_input_grad_ptr = NULL; - if (m->trainableInputs[0]) { + if (m->trainable_inputs[0]) { TensorAccessorW acc_input_grad( regions[rid], task->regions[rid], diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index 21edad11e3..4f4b55178e 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -416,7 +416,7 @@ OpMeta *ElementBinary::init_task(Task const *task, FFHandler handle = *((FFHandler *)task->local_args); ElementBinaryMeta *m = new ElementBinaryMeta(handle, eb); for (int i = 0; i < eb->numInputs; i++) { - m->trainableInputs[i] = eb->trainableInputs[i]; + m->trainable_inputs[i] = eb->trainable_inputs[i]; } m->op_type = eb->op_type; m->profiling = eb->profiling; @@ -871,7 +871,7 @@ void ElementBinary::backward(FFModel const &ff) { inputs[0]->region)); launcher.add_field(rid++, FID_DATA); // regions[2](I/O): input0_grad - if (trainableInputs[0]) { + if (trainable_inputs[0]) { launcher.add_region_requirement( RegionRequirement(inputs[0]->part_grad, 0 /*projection id*/, @@ -889,7 +889,7 @@ void ElementBinary::backward(FFModel const &ff) { inputs[1]->region)); launcher.add_field(rid++, FID_DATA); // regions[4](I/O): input1_grad - if (trainableInputs[1]) { + if (trainable_inputs[1]) { launcher.add_region_requirement( RegionRequirement(inputs[1]->part_grad, 0 /*projection id*/, @@ -959,7 +959,7 @@ void ElementBinary::backward_task(Task const *task, in0_ptr = helperGetTensorPointerRO( regions[rid], task->regions[rid], FID_DATA, ctx, runtime); rid++; - if (m->trainableInputs[0]) { + if (m->trainable_inputs[0]) { Domain in0_grad_domain = runtime->get_index_space_domain( ctx, task->regions[rid].region.get_index_space()); assert(in0_domain == in0_grad_domain); @@ -977,7 +977,7 @@ void ElementBinary::backward_task(Task const *task, in1_ptr = helperGetTensorPointerRO( regions[rid], task->regions[rid], FID_DATA, ctx, runtime); rid++; - if (m->trainableInputs[1]) { + if (m->trainable_inputs[1]) { Domain in1_grad_domain = runtime->get_index_space_domain( ctx, task->regions[rid].region.get_index_space()); // assert(out_grad_domain == in1_domain); diff --git a/src/ops/experts.cc b/src/ops/experts.cc index c8b0ec0f26..6ce5fe82d9 100644 --- a/src/ops/experts.cc +++ b/src/ops/experts.cc @@ -670,7 +670,7 @@ FutureMap Experts::inference(FFModel const &ff, size_t machine_view_hash = view->hash(); /* std::cout << "Experts op machine_view: " << *(MachineView const *)mv << std::endl; */ - // int num_active_tokens = bc->num_active_tokens(); + // int num_active_infr_tokens = bc->num_active_infr_tokens(); IndexLauncher launcher(EXPERTS_INF_TASK_ID, parallel_is, TaskArgument(nullptr, 0), @@ -1058,7 +1058,7 @@ void Experts::inference_task(Task const *task, output_ptr, weights_ptr, bias_ptr, - bc->num_active_tokens(), + bc->num_active_infr_tokens(), chosen_experts, batch_size, out_dim); diff --git a/src/ops/experts.cpp b/src/ops/experts.cpp 
index c06f02a647..48536defd9 100644 --- a/src/ops/experts.cpp +++ b/src/ops/experts.cpp @@ -27,7 +27,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, float *output, float const *weights, float const *biases, - int num_active_tokens, + int num_active_infr_tokens, int chosen_experts, int batch_size, int out_dim) { diff --git a/src/ops/experts.cu b/src/ops/experts.cu index ce15cdff55..4e3ef6f12c 100644 --- a/src/ops/experts.cu +++ b/src/ops/experts.cu @@ -515,7 +515,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, float *output, float const *weights, float const *biases, - int num_active_tokens, + int num_active_infr_tokens, int chosen_experts, int batch_size, int out_dim) { @@ -529,8 +529,8 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, cudaEventRecord(t_start, stream); } - assert(num_active_tokens > 0); - assert(num_active_tokens <= m->effective_batch_size); + assert(num_active_infr_tokens > 0); + assert(num_active_infr_tokens <= m->effective_batch_size); assert(m->effective_batch_size == batch_size); int num_experts_per_block = m->num_experts; @@ -540,7 +540,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, int data_dim = m->data_dim; int num_chosen_experts = m->num_chosen_experts; // int num_tokens = m->effective_batch_size; - int num_tokens = num_active_tokens; + int num_tokens = num_active_infr_tokens; int expert_capacity = m->expert_capacity; assert(chosen_experts == num_chosen_experts); diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index f865c6dd2a..357b063a34 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -654,7 +654,7 @@ __host__ void } assert(m->input_type[0] == my_input_accessor[0].data_type); assert(m->input_type[0] == my_output_accessor[0].data_type); - batch_size = bc->num_active_tokens(); + batch_size = bc->num_active_infr_tokens(); Kernels::Linear::forward_kernel_wrapper(m, my_input_accessor[0].ptr, my_output_accessor[0].ptr, diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 13927e8ee6..efe55f31ac 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -683,7 +683,7 @@ __host__ void } assert(m->input_type[0] == my_input_accessor[0].data_type); assert(m->input_type[0] == my_output_accessor[0].data_type); - batch_size = bc->num_active_tokens(); + batch_size = bc->num_active_infr_tokens(); Kernels::Linear::forward_kernel_wrapper(m, my_input_accessor[0].ptr, my_output_accessor[0].ptr, diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 7cb9867312..ea0ba9b88d 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -906,7 +906,7 @@ void IncMultiHeadSelfAttention::inference_task( size_t effective_batch_size = max_sequence_length * batch_size; float inputs_arr[data_dim][effective_batch_size] = {0}; - for (size_t i = 0; i < data_dim * bc->num_active_tokens(); i++) { + for (size_t i = 0; i < data_dim * bc->num_active_infr_tokens(); i++) { size_t data_index = i % data_dim; size_t token_index = i / data_dim; assert(data_index < data_dim); @@ -938,11 +938,11 @@ void IncMultiHeadSelfAttention::inference_task( // column-major order. 
// printf("m->kProjSize: %i, BatchConfig::MAX_NUM_TOKENS: %i, " - // "bc->num_active_tokens(): %i, num_q_heads: %lli, + // "bc->num_active_infr_tokens(): %i, num_q_heads: %lli, // BatchConfig::MAX_NUM_REQUESTS: %i, " "bc->num_active_requests(): %i\n", - // m->kProjSize, BatchConfig::MAX_NUM_TOKENS, bc->num_active_tokens(), + // m->kProjSize, BatchConfig::MAX_NUM_TOKENS, bc->num_active_infr_tokens(), // num_q_heads, BatchConfig::MAX_NUM_REQUESTS, bc->num_active_requests()); - // for (int t=0; t < bc->num_active_tokens(); t++) { + // for (int t=0; t < bc->num_active_infr_tokens(); t++) { // printf("token %i has request_index: %li and token_position: %li\n", // t, bc->token2ids.token_indexes[t].request_index, // bc->token2ids.token_indexes[t].token_position); @@ -1005,7 +1005,7 @@ void IncMultiHeadSelfAttention::inference_task( /* std::cout << "Torch projection weights size: " << torch_w_qkv.sizes() << std::endl; std::cout << "Torch input size: " << torch_input.sizes() << std::endl; - std::cout << "Number of active tokens: " << bc->num_active_tokens() + std::cout << "Number of active tokens: " << bc->num_active_infr_tokens() << std::endl; */ // std::cout << "torch_w_qkv:" << std::endl << torch_w_qkv << std::endl; @@ -1017,10 +1017,10 @@ void IncMultiHeadSelfAttention::inference_task( torch::Tensor qkv_projs = torch::einsum( "ijkl,im->jmkl", {torch_w_qkv, - torch_input.index({Slice(), Slice(0, bc->num_active_tokens())})}); + torch_input.index({Slice(), Slice(0, bc->num_active_infr_tokens())})}); // std::cout << "qkv_projs size: " << qkv_projs.sizes() << std::endl; assert(qkv_projs.sizes()[0] == m->qProjSize); - assert(qkv_projs.sizes()[1] == bc->num_active_tokens() && + assert(qkv_projs.sizes()[1] == bc->num_active_infr_tokens() && qkv_projs.sizes()[1] <= effective_batch_size); assert(qkv_projs.sizes()[2] == 3); assert(qkv_projs.sizes()[3] == num_q_heads); @@ -1033,25 +1033,25 @@ void IncMultiHeadSelfAttention::inference_task( assert(QKVProjArray_cpu != nullptr); std::vector QKVProjArray_converted_shape = { - m->qProjSize, bc->num_active_tokens(), 3, (int)num_q_heads}; + m->qProjSize, bc->num_active_infr_tokens(), 3, (int)num_q_heads}; float *QKVProjArray_converted = (float *)calloc( - m->qProjSize * bc->num_active_tokens() * 3 * num_q_heads, sizeof(float)); + m->qProjSize * bc->num_active_infr_tokens() * 3 * num_q_heads, sizeof(float)); // skip over padding at the end of QKVProjArray_cpu // convert from column order to 3D matrix because torch cannot automatically // import matrices flattened in column order - for (size_t i = 0; i < proj_sum * bc->num_active_tokens() * num_q_heads; + for (size_t i = 0; i < proj_sum * bc->num_active_infr_tokens() * num_q_heads; i++) { int proj_size_index = i % m->qProjSize; - int head_index = i / (proj_sum * bc->num_active_tokens()); + int head_index = i / (proj_sum * bc->num_active_infr_tokens()); int token_index = - ((i - head_index * proj_sum * bc->num_active_tokens()) / m->qProjSize) % - bc->num_active_tokens(); - int qkv_offset = (i - head_index * proj_sum * bc->num_active_tokens()) / - (m->qProjSize * bc->num_active_tokens()); + ((i - head_index * proj_sum * bc->num_active_infr_tokens()) / m->qProjSize) % + bc->num_active_infr_tokens(); + int qkv_offset = (i - head_index * proj_sum * bc->num_active_infr_tokens()) / + (m->qProjSize * bc->num_active_infr_tokens()); assert(proj_size_index < proj_sum); assert(head_index < num_q_heads); - assert(token_index < bc->num_active_tokens()); + assert(token_index < bc->num_active_infr_tokens()); assert(qkv_offset < 
3); set_value_row_major(QKVProjArray_converted, QKVProjArray_converted_shape, @@ -1060,7 +1060,7 @@ void IncMultiHeadSelfAttention::inference_task( } torch::Tensor QKVProjArray_torch = torch::from_blob(QKVProjArray_converted, - {m->qProjSize, bc->num_active_tokens(), 3, num_q_heads}, + {m->qProjSize, bc->num_active_infr_tokens(), 3, num_q_heads}, torch::kFloat32); // ----------------------- Comparing C++ & CUDA results --------------------- @@ -1087,7 +1087,7 @@ void IncMultiHeadSelfAttention::inference_task( // ----------------------- C++ operations & checks -------------------------- // Store projections into k/v cache arrays for (size_t h = 0; h < num_q_heads; h++) { - for (size_t t = 0; t < bc->num_active_tokens(); t++) { + for (size_t t = 0; t < bc->num_active_infr_tokens(); t++) { for (size_t d = 0; d < m->kProjSize; d++) { size_t kcache_idx = d * MAX_SEQ_LEN * m->num_q_heads * BatchConfig::MAX_NUM_REQUESTS + @@ -1124,7 +1124,7 @@ void IncMultiHeadSelfAttention::inference_task( std::vector req_idxs; std::vector r_first_idx; std::vector r_num_tokens; - for (size_t t = 0; t < bc->num_active_tokens(); t++) { + for (size_t t = 0; t < bc->num_active_infr_tokens(); t++) { size_t rid = bc->tokensInfo[t].request_index; if (req_idxs.size() == 0 || req_idxs[req_idxs.size() - 1] != rid) { req_idxs.push_back(rid); @@ -1140,7 +1140,7 @@ void IncMultiHeadSelfAttention::inference_task( assert(std::accumulate(r_num_tokens.begin(), r_num_tokens.end(), decltype(r_num_tokens)::value_type(0)) == - bc->num_active_tokens()); + bc->num_active_infr_tokens()); // ----------------------- Loading CUDA results for this step --------------- float *keyCache_cpu = @@ -1375,7 +1375,7 @@ void IncMultiHeadSelfAttention::inference_task( torch::Tensor attn_heads[bc->num_active_requests()]; torch::Tensor cpp_output = - torch::zeros({m->oProjSize, bc->num_active_tokens()}); + torch::zeros({m->oProjSize, bc->num_active_infr_tokens()}); // ----------------------- Loading CUDA results for this step --------------- float *qk_prods_cpu = download_tensor( @@ -1595,12 +1595,12 @@ void IncMultiHeadSelfAttention::inference_task( std::cout << "CUDA:" <oProjSize; i++) { std::cout << torch_out_cuda.index({i, Slice(0, - (int64_t)bc->num_active_tokens())}) << std::endl; + (int64_t)bc->num_active_infr_tokens())}) << std::endl; } */ assert(torch::allclose( torch_out_cuda.index( - {Slice(), Slice(0, (int64_t)bc->num_active_tokens())}), + {Slice(), Slice(0, (int64_t)bc->num_active_infr_tokens())}), cpp_output, 1e-05, 1e-05)); diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index 8fb635bace..98a101b723 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -305,7 +305,7 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, int m_k = m->kProjSize; int m_v = m->vProjSize; assert(m_q == m_k && m_k == m_v); // keep things simple for now - int n = bc->num_active_tokens(); + int n = bc->num_active_infr_tokens(); int k = m->qSize; int m_ = m_q; int lda = k, ldb = k, ldc = m_q; @@ -342,7 +342,7 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, // apply rotary emmmbedding for q and k // step1 change the k, v to complex tensor - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); int parallelism = m->kProjSize * num_tokens * m->num_q_heads; int q_block_size = m->qProjSize * num_tokens; int k_block_size = m->kProjSize * num_tokens; @@ -407,7 +407,7 @@ template void 
update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, hipStream_t stream) { - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); if (num_tokens > 0) { int parallelism = (m->kProjSize + m->vProjSize) * num_tokens * m->num_kv_heads; @@ -508,7 +508,7 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, } checkCUDA(hipMemcpyAsync(m->token_infos, &(bc->tokensInfo), - bc->num_active_tokens() * + bc->num_active_infr_tokens() * sizeof(BatchConfig::PerTokenInfo), hipMemcpyHostToDevice, stream)); @@ -573,7 +573,7 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, hipblasDatatype_t compute_type = hipblas_data_type; #endif // int num_requests = bc->num_active_requests(); - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); int tokens_previous_requests = 0; int q_block_size = m->qProjSize * num_tokens; int kt_block_size = m->kProjSize * BatchConfig::MAX_SEQ_LENGTH; diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index ec776f4cda..710d20240b 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -281,7 +281,7 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, int m_k = m->kProjSize; int m_v = m->vProjSize; assert(m_q == m_k && m_k == m_v); // keep things simple for now - int n = bc->num_active_tokens(); + int n = bc->num_active_infr_tokens(); int k = m->qSize; int m_ = m_q; int lda = k, ldb = k, ldc = m_q; @@ -317,7 +317,7 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); // apply rotary emmmbedding for q and k // step1 change the k, v to complex tensor - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); int parallelism = m->kProjSize * num_tokens * m->num_q_heads; int q_block_size = m->qProjSize * num_tokens; int k_block_size = m->kProjSize * num_tokens; @@ -376,7 +376,7 @@ template void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, cudaStream_t stream) { - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); if (num_tokens > 0) { int parallelism = (m->kProjSize + m->vProjSize) * num_tokens * m->num_kv_heads; @@ -475,7 +475,7 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, } cudaMemcpyAsync(m->token_infos, &(bc->tokensInfo), - bc->num_active_tokens() * sizeof(BatchConfig::PerTokenInfo), + bc->num_active_infr_tokens() * sizeof(BatchConfig::PerTokenInfo), cudaMemcpyHostToDevice, stream); // phase 1: Implement kernel to compute KQV for input tokens @@ -576,7 +576,7 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, cudaDataType_t compute_type = cublas_data_type; #endif // int num_requests = bc->num_active_requests(); - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); int tokens_previous_requests = 0; int q_block_size = m->qProjSize * num_tokens; int kt_block_size = m->kProjSize * BatchConfig::MAX_SEQ_LENGTH; diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp index 231ca0f3d7..5f756c8f5c 100644 --- a/src/ops/kernels/linear_kernels.cpp +++ b/src/ops/kernels/linear_kernels.cpp @@ -143,6 +143,40 @@ void forward_kernel_wrapper(LinearMeta const *m, } } +void peft_bwd_kernel_wrapper(LinearMeta const *m, + void *input_grad_ptr, + void *output_grad_ptr, + void const *weight_ptr, + int 
in_dim, + int out_dim, + int num_infr_tokens, + int num_peft_tokens) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + if (m->input_type[0] == DT_FLOAT) { + Internal::peft_bwd_kernel(m, + input_grad_ptr, + output_grad_ptr, + weight_ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::peft_bwd_kernel(m, + input_grad_ptr, + output_grad_ptr, + weight_ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens, + stream); + } +} + + void backward_kernel_wrapper(LinearMeta const *m, void const *input_ptr, void *input_grad_ptr, @@ -317,6 +351,69 @@ void forward_kernel(LinearMeta const *m, } } +template +void peft_bwd_kernel(LinearMeta const *m, + void *input_grad_ptr, + void *output_grad_ptr, + void const *kernel_ptr, + int in_dim, + int out_dim, + int num_infr_tokens, + int num_peft_tokens, + ffStream_t stream) { + checkCUDA(hipblasSetStream(m->handle.blas, stream)); + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + + DT alpha = 1.0f; + hipDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); + hipDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); + hipDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); + // update input_grad_ptr offset + input_grad_ptr = static_cast(input_grad_ptr) + num_infr_tokens; +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#else + hipblasDatatype_t compute_type = HIPBLAS_R_32F; +#endif + int output_size = out_dim * num_peft_tokens; + if (m->activation == AC_MODE_RELU) { + relu_backward_kernel( + m->output_type[0], output_grad_ptr, m->output_activation_buffer, output_size, stream); + } else if (m->activation == AC_MODE_SIGMOID) { + sigmoid_backward_kernel( + m->output_type[0], output_grad_ptr, m->output_activation_buffer, output_size, stream); + } else { + // TODO: only support relu and sigmoid for now + assert(m->activation == AC_MODE_NONE); + } + + // Compute data gradiant + // NOTE: we use alpha=1 for input_grad to accumulate gradients + if (input_grad_ptr != NULL) { + checkCUDA(hipblasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_N, + in_dim, + num_peft_tokens, + out_dim, + &alpha, + kernel_ptr, + weight_type, + in_dim, + output_grad_ptr, + output_type, + out_dim, + &alpha, + input_grad_ptr, + input_type, + in_dim, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } +} + + template void backward_kernel(LinearMeta const *m, void const *input_ptr, diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 8a93357dcf..4ac6bc253f 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -170,6 +170,61 @@ void forward_kernel_wrapper(LinearMeta const *m, } } +void peft_bwd_kernel_wrapper(LinearMeta const *m, + void *input_grad_ptr, + void *output_grad_ptr, + void const *weight_ptr, + int in_dim, + int out_dim, + int num_infr_tokens, + int num_peft_tokens) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + if (m->input_type[0] == DT_FLOAT) { + Internal::peft_bwd_kernel(m, + input_grad_ptr, + output_grad_ptr, + weight_ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::peft_bwd_kernel(m, + input_grad_ptr, + 
output_grad_ptr, + weight_ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens, + stream); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("%s [Linear] PEFT Bwd time = %.2lfms\n", m->op_name, elapsed); + // print_tensor((float*)input_ptr, in_dim * batch_size, + // "[Linear:forward:input]"); print_tensor((float*)weight_ptr, in_dim + // * out_dim, "[Linear:forward:kernel]"); + // print_tensor((float*)output_ptr, out_dim * batch_size, + // "[Linear:forward:output]"); + } +} + + void backward_kernel_wrapper(LinearMeta const *m, void const *input_ptr, void *input_grad_ptr, @@ -380,6 +435,68 @@ void forward_kernel(LinearMeta const *m, } } +template +void peft_bwd_kernel(LinearMeta const *m, + void *input_grad_ptr, + void *output_grad_ptr, + void const *kernel_ptr, + int in_dim, + int out_dim, + int num_infr_tokens, + int num_peft_tokens, + ffStream_t stream) { + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + + DT alpha = 1.0f; + cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); + cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); + cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); + // update input_grad_ptr offset + input_grad_ptr = static_cast(input_grad_ptr) + num_infr_tokens; +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#else + cudaDataType_t compute_type = CUDA_R_32F; +#endif + int output_size = out_dim * num_peft_tokens; + if (m->activation == AC_MODE_RELU) { + relu_backward_kernel( + m->output_type[0], output_grad_ptr, m->output_activation_buffer, output_size, stream); + } else if (m->activation == AC_MODE_SIGMOID) { + sigmoid_backward_kernel( + m->output_type[0], output_grad_ptr, m->output_activation_buffer, output_size, stream); + } else { + // TODO: only support relu and sigmoid for now + assert(m->activation == AC_MODE_NONE); + } + + // Compute data gradiant + // NOTE: we use alpha=1 for input_grad to accumulate gradients + if (input_grad_ptr != NULL) { + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_N, + in_dim, + num_peft_tokens, + out_dim, + &alpha, + kernel_ptr, + weight_type, + in_dim, + output_grad_ptr, + output_type, + out_dim, + &alpha, + input_grad_ptr, + input_type, + in_dim, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } +} + template void backward_kernel(LinearMeta const *m, void const *input_ptr, diff --git a/src/ops/linear.cc b/src/ops/linear.cc index a751ebcc57..f6de5186ad 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -504,7 +504,7 @@ OpMeta *Linear::init_task_with_dim(Task const *task, m->use_bias = linear->use_bias; m->add_bias_only_once = linear->add_bias_only_once; m->profiling = linear->profiling; - m->trainableInputs[0] = linear->trainableInputs[0]; + m->trainable_inputs[0] = linear->trainable_inputs[0]; m->weight_ptr_type = m->input_type[0]; m->quantization_type = linear->quantization_type; m->offload = linear->offload; @@ -638,7 +638,7 @@ void Linear::inference_task(Task const *task, int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; - int batch_size = bc->num_active_tokens(); + int batch_size = 
bc->num_active_infr_tokens(); GenericTensorAccessorR bias; if (m->use_bias && !(m->add_bias_only_once && task->index_point.point_data[0] != 0)) { @@ -660,6 +660,99 @@ void Linear::inference_task(Task const *task, batch_size); } +FutureMap Linear::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + /* std::cout << "Linear op machine_view: " << *(MachineView const *)mv + << std::endl; */ + IndexLauncher launcher(LINEAR_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region, + ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); + launcher.add_field(2, FID_DATA); + if (use_bias) { + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(3, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} + +void Linear::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + Domain input_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + LinearMeta const *m = *((LinearMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_tokens == 0) { + return; + } + assert(regions.size() == (3 + static_cast(m->use_bias))); + assert(task->regions.size() == (3 + static_cast(m->use_bias))); + if (m->quantization_type == DT_NONE) { + assert(m->input_type[0] == m->weight_type[0]); + } + assert(m->input_type[0] == m->output_type[0]); + + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output_grad = helperGetGenericTensorAccessorRW( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + + int num_infr_tokens = bc->num_active_infr_tokens(); + int num_peft_tokens = bc->num_active_peft_tokens(); + peft_bwd_kernel_wrapper(m, + input_grad.ptr, + output_grad.ptr, + weight.ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens); +} + void Linear::forward_task(Task const *task, std::vector const ®ions, Context ctx, @@ -775,7 +868,7 @@ void Linear::backward(FFModel const &ff) { 
launcher.add_field(rid++, FID_DATA); // regions[1](I/O): replica_grad assert(replica == NULL); - if (trainableInputs[0]) { + if (trainable_inputs[0]) { launcher.add_region_requirement( RegionRequirement(inputs[0]->part_grad, 0 /*projection id*/, @@ -871,17 +964,17 @@ void Linear::backward_task_with_dim(Task const *task, Runtime *runtime) { // Linear* linear = (Linear*) task->args; LinearMeta const *m = *((LinearMeta **)task->local_args); - assert(regions.size() == (5 + static_cast(m->trainableInputs[0]) + + assert(regions.size() == (5 + static_cast(m->trainable_inputs[0]) + static_cast(m->use_bias))); assert(task->regions.size() == - (5 + static_cast(m->trainableInputs[0]) + + (5 + static_cast(m->trainable_inputs[0]) + static_cast(m->use_bias))); DT *input_grad = nullptr; size_t rid = 0; TensorAccessorR acc_input( regions[rid], task->regions[rid], FID_DATA, ctx, runtime); rid++; - if (m->trainableInputs[0]) { + if (m->trainable_inputs[0]) { Domain domain = runtime->get_index_space_domain( ctx, task->regions[rid].region.get_index_space()); if (domain.get_dim() == NDIM + 1) { @@ -1157,7 +1250,7 @@ bool Linear::measure_operator_cost(Simulator *sim, }; if (sim->computationMode == COMP_MODE_TRAINING) { void *input_grad_ptr = NULL; - if (trainableInputs[0]) { + if (trainable_inputs[0]) { input_grad_ptr = sim->allocate(sub_input.get_volume(), inputs[0]->data_type); } else { diff --git a/src/ops/sampling.cc b/src/ops/sampling.cc index 6eb62b2933..f597b9b6b0 100644 --- a/src/ops/sampling.cc +++ b/src/ops/sampling.cc @@ -299,7 +299,7 @@ InferenceResult GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); - int batch_size = bc->num_active_tokens(); + int batch_size = bc->num_active_infr_tokens(); Sampling::forward_kernel_wrapper(m, input, indices, batch_size); InferenceResult ir; diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index f983238198..3b2b44401e 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -150,7 +150,7 @@ template void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, BeamSearchBatchConfig const *bc, hipStream_t stream) { - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); int curr_depth = bc->beamRequestsInfo[0].current_depth; // printf("curr depth: %d\n", curr_depth); // assert(curr_depth < 3); @@ -218,7 +218,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, hipblasDatatype_t compute_type = hipblas_data_type; #endif // int num_requests = bc->num_active_requests(); - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); int tokens_previous_requests = 0; int tokens_prev_requests_squares = 0; // int qkv_block_size = diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 6ef5145654..2e9a558d6f 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -168,7 +168,7 @@ template void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, BeamSearchBatchConfig const *bc, cudaStream_t stream) { - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); int curr_depth = bc->beamRequestsInfo[0].current_depth; // printf("curr depth: %d\n", curr_depth); // assert(curr_depth < 3); @@ -234,7 +234,7 @@ void 
compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, cudaDataType_t compute_type = cublas_data_type; #endif // int num_requests = bc->num_active_requests(); - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); int tokens_previous_requests = 0; int tokens_prev_requests_squares = 0; // int qkv_block_size = diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 0fa68bed08..755466a727 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -41,7 +41,7 @@ __global__ void commit_tokens_kernel( int kProjSize, int vProjSize, int num_tokens_to_commit, - int num_active_tokens_in_last_batch, + int num_active_infr_tokens_in_last_batch, int num_q_heads, int num_kv_heads, int max_seq_len) { @@ -58,16 +58,16 @@ __global__ void commit_tokens_kernel( int token_pos = (real_i - head_idx * (num_tokens_to_commit * proj_size)) / proj_size; int token_idx_in_last_batch = committedTokenInfos[token_pos].token_index; - assert(token_idx_in_last_batch < num_active_tokens_in_last_batch); + assert(token_idx_in_last_batch < num_active_infr_tokens_in_last_batch); int q_array_size = - qProjSize * num_active_tokens_in_last_batch * num_q_heads; + qProjSize * num_active_infr_tokens_in_last_batch * num_q_heads; int k_array_size = - kProjSize * num_active_tokens_in_last_batch * num_kv_heads; + kProjSize * num_active_infr_tokens_in_last_batch * num_kv_heads; DT val = devQKVProjArray[q_array_size + (k_cache ? 0 : k_array_size) + - head_idx * proj_size * num_active_tokens_in_last_batch + + head_idx * proj_size * num_active_infr_tokens_in_last_batch + token_idx_in_last_batch * proj_size + data_idx]; int const req_id = committedTokenInfos[token_pos].request_index; int const tok_id = committedTokenInfos[token_pos].token_depth; @@ -101,7 +101,7 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, m->kProjSize, m->vProjSize, num_tokens_to_commit, - m->num_active_tokens, // number of active tokens in previous batch + m->num_active_infr_tokens, // number of active tokens in previous batch m->num_q_heads, m->num_kv_heads, BatchConfig::MAX_SEQ_LENGTH); @@ -193,8 +193,8 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, // int num_requests = bc->num_active_requests(); int processed_tokens_in_batch = 0; // int qkv_block_size = - // (m->qProjSize + m->kProjSize + m->vProjSize) * bc->num_active_tokens(); - int q_block_size = m->qProjSize * bc->num_active_tokens(); + // (m->qProjSize + m->kProjSize + m->vProjSize) * bc->num_active_infr_tokens(); + int q_block_size = m->qProjSize * bc->num_active_infr_tokens(); int kt_block_size = m->kProjSize * BatchConfig::MAX_SEQ_LENGTH; int kt_req_block_size = kt_block_size * m->num_kv_heads; int vt_block_size = m->vProjSize * BatchConfig::MAX_SEQ_LENGTH; @@ -238,7 +238,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_new_tokens, // num_tokens_in_branch processed_tokens_in_batch, // num_processed_tokens_in_batch - m->num_active_tokens, // total_tokens_in_batch + m->num_active_infr_tokens, // total_tokens_in_batch m->num_q_heads, m->num_kv_heads, BatchConfig::MAX_SEQ_LENGTH); @@ -517,7 +517,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->oProjSize); } - assert(processed_tokens_in_batch == bc->num_active_tokens()); + assert(processed_tokens_in_batch == bc->num_active_infr_tokens()); } template @@ -546,7 +546,7 @@ void 
inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, } } // copy committed tokens info to GPU for the commit_tokens kernel - // Note that m->num_active_tokens stores the number of active + // Note that m->num_active_infr_tokens stores the number of active // tokens in the previous batch, which is needed for committing // keys/values to the key-value cache checkCUDA( @@ -558,9 +558,9 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, stream)); commit_tokens
(m, bc, stream); - // After commit we update m->num_active_tokens to be the number of active + // After commit we update m->num_active_infr_tokens to be the number of active // tokens for the current batch - m->num_active_tokens = bc->num_active_tokens(); + m->num_active_infr_tokens = bc->num_active_infr_tokens(); // here because we need postion info in infernece 1 if (m->offload && m->biasSize > 0) { @@ -707,7 +707,7 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( _num_kv_heads, attn->quantization_type, attn->offload), - num_active_tokens(0) { + num_active_infr_tokens(0) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(miopenSetStream(handler.dnn, stream)); diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 95ac93ad8a..30ed4e54eb 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -41,7 +41,7 @@ __global__ void commit_tokens_kernel( int kProjSize, int vProjSize, int num_tokens_to_commit, - int num_active_tokens_in_last_batch, + int num_active_infr_tokens_in_last_batch, int num_q_heads, int num_kv_heads, int max_seq_len) { @@ -58,16 +58,16 @@ __global__ void commit_tokens_kernel( int token_pos = (real_i - head_idx * (num_tokens_to_commit * proj_size)) / proj_size; int token_idx_in_last_batch = committedTokenInfos[token_pos].token_index; - assert(token_idx_in_last_batch < num_active_tokens_in_last_batch); + assert(token_idx_in_last_batch < num_active_infr_tokens_in_last_batch); int q_array_size = - qProjSize * num_active_tokens_in_last_batch * num_q_heads; + qProjSize * num_active_infr_tokens_in_last_batch * num_q_heads; int k_array_size = - kProjSize * num_active_tokens_in_last_batch * num_kv_heads; + kProjSize * num_active_infr_tokens_in_last_batch * num_kv_heads; DT val = devQKVProjArray[q_array_size + (k_cache ? 
0 : k_array_size) + - head_idx * proj_size * num_active_tokens_in_last_batch + + head_idx * proj_size * num_active_infr_tokens_in_last_batch + token_idx_in_last_batch * proj_size + data_idx]; int const req_id = committedTokenInfos[token_pos].request_index; int const tok_id = committedTokenInfos[token_pos].token_depth; @@ -99,7 +99,7 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, m->kProjSize, m->vProjSize, num_tokens_to_commit, - m->num_active_tokens, // number of active tokens in previous batch + m->num_active_infr_tokens, // number of active tokens in previous batch m->num_q_heads, m->num_kv_heads, BatchConfig::MAX_SEQ_LENGTH); @@ -191,8 +191,8 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, // int num_requests = bc->num_active_requests(); int processed_tokens_in_batch = 0; // int qkv_block_size = - // (m->qProjSize + m->kProjSize + m->vProjSize) * bc->num_active_tokens(); - int q_block_size = m->qProjSize * bc->num_active_tokens(); + // (m->qProjSize + m->kProjSize + m->vProjSize) * bc->num_active_infr_tokens(); + int q_block_size = m->qProjSize * bc->num_active_infr_tokens(); int kt_block_size = m->kProjSize * BatchConfig::MAX_SEQ_LENGTH; int kt_req_block_size = kt_block_size * m->num_kv_heads; int vt_block_size = m->vProjSize * BatchConfig::MAX_SEQ_LENGTH; @@ -234,7 +234,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_new_tokens, // num_tokens_in_branch processed_tokens_in_batch, // num_processed_tokens_in_batch - m->num_active_tokens, // total_tokens_in_batch + m->num_active_infr_tokens, // total_tokens_in_batch m->num_q_heads, m->num_kv_heads, BatchConfig::MAX_SEQ_LENGTH); @@ -515,7 +515,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->oProjSize); } - assert(processed_tokens_in_batch == bc->num_active_tokens()); + assert(processed_tokens_in_batch == bc->num_active_infr_tokens()); } template @@ -544,7 +544,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, } } // copy committed tokens info to GPU for the commit_tokens kernel - // Note that m->num_active_tokens stores the number of active + // Note that m->num_active_infr_tokens stores the number of active // tokens in the previous batch, which is needed for committing // keys/values to the key-value cache cudaMemcpyAsync(m->committed_token_infos, @@ -555,9 +555,9 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, stream); commit_tokens
(m, bc, stream); - // After commit we update m->num_active_tokens to be the number of active + // After commit we update m->num_active_infr_tokens to be the number of active // tokens for the current batch - m->num_active_tokens = bc->num_active_tokens(); + m->num_active_infr_tokens = bc->num_active_infr_tokens(); // here because we need postion info in infernece 1 if (m->offload && m->biasSize > 0) { @@ -704,7 +704,7 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( _num_kv_heads, attn->quantization_type, attn->offload), - num_active_tokens(0) { + num_active_infr_tokens(0) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(cudnnSetStream(handler.dnn, stream)); diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index d658b6590f..4da520ea97 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -24,7 +24,7 @@ LegionRuntime::Logger::Category log_bc("BatchConfig"); using Legion::Future; using Legion::Memory; -BatchConfig::BatchConfig() : num_tokens(0) { +BatchConfig::BatchConfig() : num_infr_tokens(0), num_peft_tokens(0) { for (int i = 0; i < MAX_NUM_REQUESTS; i++) { requestsInfo[i].token_start_offset = 0; requestsInfo[i].num_tokens_in_batch = 0; @@ -68,8 +68,12 @@ int BatchConfig::num_active_requests() const { return num_requests; } -int BatchConfig::num_active_tokens() const { - return num_tokens; +int BatchConfig::num_active_infr_tokens() const { + return num_infr_tokens; +} + +int BatchConfig::num_active_peft_tokens() const { + return num_peft_tokens; } void BatchConfig::print() const { @@ -77,7 +81,8 @@ void BatchConfig::print() const { << ") @@@@@@@@@@@@@@" << std::endl; std::cout << "Max number of requests: " << MAX_NUM_REQUESTS << std::endl; std::cout << "Max number of tokens: " << MAX_NUM_TOKENS << std::endl; - std::cout << "Number of tokens: " << num_tokens << std::endl; + std::cout << "Number of infr tokens: " << num_infr_tokens << std::endl; + std::cout << "Number of peft tokens: " << num_peft_tokens << std::endl; std::cout << "Number of requests: " << num_active_requests() << std::endl; // std::cout << "Cached results: " << cached_results << std::endl; @@ -98,7 +103,7 @@ void BatchConfig::print() const { } std::cout << "Per-token info:\n"; - for (int i = 0; i < num_tokens; i++) { + for (int i = 0; i < num_infr_tokens + num_peft_tokens; i++) { std::cout << " Token " << i << ":\n"; std::cout << " Absolute depth in request: " << tokensInfo[i].abs_depth_in_request << std::endl; diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index f36dcb2922..dc1a9f6611 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -292,11 +292,11 @@ FutureMap InferenceManager::inference(FFModel *model, FutureMap InferenceManager::inference(FFModel *model, int index, BatchConfigFuture const &bc) { - // log_inf_mgr.print("mode(%d) num_active_tokens(%d) num_active_requests(%d)", + // log_inf_mgr.print("mode(%d) num_active_infr_tokens(%d) num_active_requests(%d)", // bc.get_mode(), - // bc.num_active_tokens(), + // bc.num_active_infr_tokens(), // bc.num_active_requests()); - // assert(bc.num_active_tokens() > 0 && bc.num_active_requests() > 0); + // assert(bc.num_active_infr_tokens() > 0 && bc.num_active_requests() > 0); // We currently assume that the index-th batch will be placed // on the device_index-th device (except for the experts layers) int batch_index = index % model->config.data_parallelism_degree; diff --git a/src/runtime/model.cc 
b/src/runtime/model.cc index 0cb50733a3..48fe5c4fe8 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -144,8 +144,8 @@ Op::Op(FFModel &model, inputs[i] = tensors[i]; } for (int i = 0; i < numInputs; i++) { - trainableInputs[i] = true; - // resetInputGrads[i] = true; + trainable_inputs[i] = true; + reset_input_grads[i] = true; } for (int i = 0; i < MAX_NUM_OUTPUTS; i++) { outputs[i] = nullptr; @@ -188,8 +188,8 @@ Op::Op(FFModel &model, } } for (int i = 0; i < numInputs; i++) { - trainableInputs[i] = true; - // resetInputGrads[i] = true; + trainable_inputs[i] = true; + reset_input_grads[i] = true; } for (int i = 0; i < MAX_NUM_OUTPUTS; i++) { outputs[i] = NULL; @@ -1463,7 +1463,8 @@ bool Op::get_weight_parameter(TNParameter tnp, OpMeta::OpMeta(FFHandler _handle) : handle(_handle), profiling(false) { for (int i = 0; i < MAX_NUM_INPUTS; i++) { - trainableInputs[i] = true; + trainable_inputs[i] = true; + reset_input_grads[i] = true; } for (int i = 0; i < MAX_NUM_INPUTS; i++) { input_type[i] = DT_NONE; @@ -3447,7 +3448,7 @@ void FFModel::compile(LossType loss_type, for (int i = 0; i < op->numInputs; i++) { assert(op->inputs[i]->owner_op != nullptr); if (op->inputs[i]->owner_op->op_type == OP_INPUT) { - op->trainableInputs[i] = false; + op->trainable_inputs[i] = false; } } } @@ -5364,6 +5365,20 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(LINEAR_PEFT_BWD_TASK_ID, "Linear PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Linear PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } { TaskVariantRegistrar registrar(LINEAR_FWD_TASK_ID, "Linear Forward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); From 60702fc74309a9c446f7ab78abc50e112e16831a Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 27 Sep 2023 10:45:44 -0400 Subject: [PATCH 002/198] format --- include/flexflow/config.h | 2 +- include/flexflow/layer.h | 2 +- include/flexflow/operator.h | 2 +- src/ops/inc_multihead_self_attention.cc | 24 +++++---- src/ops/inc_multihead_self_attention.cu | 3 +- src/ops/kernels/linear_kernels.cpp | 54 ++++++++++--------- src/ops/kernels/linear_kernels.cu | 17 +++--- src/ops/tree_inc_multihead_self_attention.cpp | 13 ++--- src/ops/tree_inc_multihead_self_attention.cu | 13 ++--- src/runtime/batch_config.cc | 11 ++-- src/runtime/inference_manager.cc | 3 +- src/runtime/model.cc | 3 +- 12 files changed, 82 insertions(+), 65 deletions(-) diff --git a/include/flexflow/config.h b/include/flexflow/config.h index e670bd72fb..1d74a38468 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -81,7 +81,7 @@ struct FFHandler { // PEFT related fields void *peft_activation_reserve_space; size_t peft_activation_reserve_space_size; - MemoryAllocator* peft_activation_allocator; + MemoryAllocator *peft_activation_allocator; // Quantization fields DataType quantization_type; bool allowTensorOpMathConversion; diff --git a/include/flexflow/layer.h b/include/flexflow/layer.h index 68d292dfe0..9865501f5f 100644 --- a/include/flexflow/layer.h +++ b/include/flexflow/layer.h @@ -49,7 +49,7 @@ class Layer { Tensor outputs[MAX_NUM_OUTPUTS]; Tensor inputs[MAX_NUM_INPUTS]; Tensor weights[MAX_NUM_WEIGHTS]; - //bool 
trainable_inputs[MAX_NUM_INPUTS]; + // bool trainable_inputs[MAX_NUM_INPUTS]; int numInputs, numWeights, numOutputs; bool profiling; diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index cce92a6bd8..32e66e4e72 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -224,7 +224,7 @@ class Op { MachineView const *mv = nullptr) { assert(false); }; - virtual Legion::FutureMap peft_bwd(FFModel const&, + virtual Legion::FutureMap peft_bwd(FFModel const &, BatchConfigFuture const &, std::vector const &, std::vector const &, diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index ea0ba9b88d..1484c424bb 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -940,8 +940,9 @@ void IncMultiHeadSelfAttention::inference_task( // printf("m->kProjSize: %i, BatchConfig::MAX_NUM_TOKENS: %i, " // "bc->num_active_infr_tokens(): %i, num_q_heads: %lli, // BatchConfig::MAX_NUM_REQUESTS: %i, " "bc->num_active_requests(): %i\n", - // m->kProjSize, BatchConfig::MAX_NUM_TOKENS, bc->num_active_infr_tokens(), - // num_q_heads, BatchConfig::MAX_NUM_REQUESTS, bc->num_active_requests()); + // m->kProjSize, BatchConfig::MAX_NUM_TOKENS, + // bc->num_active_infr_tokens(), num_q_heads, + // BatchConfig::MAX_NUM_REQUESTS, bc->num_active_requests()); // for (int t=0; t < bc->num_active_infr_tokens(); t++) { // printf("token %i has request_index: %li and token_position: %li\n", // t, bc->token2ids.token_indexes[t].request_index, @@ -1035,7 +1036,8 @@ void IncMultiHeadSelfAttention::inference_task( std::vector QKVProjArray_converted_shape = { m->qProjSize, bc->num_active_infr_tokens(), 3, (int)num_q_heads}; float *QKVProjArray_converted = (float *)calloc( - m->qProjSize * bc->num_active_infr_tokens() * 3 * num_q_heads, sizeof(float)); + m->qProjSize * bc->num_active_infr_tokens() * 3 * num_q_heads, + sizeof(float)); // skip over padding at the end of QKVProjArray_cpu // convert from column order to 3D matrix because torch cannot automatically @@ -1045,10 +1047,12 @@ void IncMultiHeadSelfAttention::inference_task( int proj_size_index = i % m->qProjSize; int head_index = i / (proj_sum * bc->num_active_infr_tokens()); int token_index = - ((i - head_index * proj_sum * bc->num_active_infr_tokens()) / m->qProjSize) % + ((i - head_index * proj_sum * bc->num_active_infr_tokens()) / + m->qProjSize) % bc->num_active_infr_tokens(); - int qkv_offset = (i - head_index * proj_sum * bc->num_active_infr_tokens()) / - (m->qProjSize * bc->num_active_infr_tokens()); + int qkv_offset = + (i - head_index * proj_sum * bc->num_active_infr_tokens()) / + (m->qProjSize * bc->num_active_infr_tokens()); assert(proj_size_index < proj_sum); assert(head_index < num_q_heads); assert(token_index < bc->num_active_infr_tokens()); @@ -1058,10 +1062,10 @@ void IncMultiHeadSelfAttention::inference_task( {proj_size_index, token_index, qkv_offset, head_index}, QKVProjArray_cpu[i]); } - torch::Tensor QKVProjArray_torch = - torch::from_blob(QKVProjArray_converted, - {m->qProjSize, bc->num_active_infr_tokens(), 3, num_q_heads}, - torch::kFloat32); + torch::Tensor QKVProjArray_torch = torch::from_blob( + QKVProjArray_converted, + {m->qProjSize, bc->num_active_infr_tokens(), 3, num_q_heads}, + torch::kFloat32); // ----------------------- Comparing C++ & CUDA results --------------------- // std::cout << "QKVProjArray_torch" << std::endl; diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 
710d20240b..a3061c4c8e 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -475,7 +475,8 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, } cudaMemcpyAsync(m->token_infos, &(bc->tokensInfo), - bc->num_active_infr_tokens() * sizeof(BatchConfig::PerTokenInfo), + bc->num_active_infr_tokens() * + sizeof(BatchConfig::PerTokenInfo), cudaMemcpyHostToDevice, stream); // phase 1: Implement kernel to compute KQV for input tokens diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp index 5f756c8f5c..87b39126c5 100644 --- a/src/ops/kernels/linear_kernels.cpp +++ b/src/ops/kernels/linear_kernels.cpp @@ -176,7 +176,6 @@ void peft_bwd_kernel_wrapper(LinearMeta const *m, } } - void backward_kernel_wrapper(LinearMeta const *m, void const *input_ptr, void *input_grad_ptr, @@ -369,7 +368,7 @@ void peft_bwd_kernel(LinearMeta const *m, hipDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); hipDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); // update input_grad_ptr offset - input_grad_ptr = static_cast(input_grad_ptr) + num_infr_tokens; + input_grad_ptr = static_cast
(input_grad_ptr) + num_infr_tokens; #if CUDA_VERSION >= 11000 // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; @@ -378,11 +377,17 @@ void peft_bwd_kernel(LinearMeta const *m, #endif int output_size = out_dim * num_peft_tokens; if (m->activation == AC_MODE_RELU) { - relu_backward_kernel( - m->output_type[0], output_grad_ptr, m->output_activation_buffer, output_size, stream); + relu_backward_kernel(m->output_type[0], + output_grad_ptr, + m->output_activation_buffer, + output_size, + stream); } else if (m->activation == AC_MODE_SIGMOID) { - sigmoid_backward_kernel( - m->output_type[0], output_grad_ptr, m->output_activation_buffer, output_size, stream); + sigmoid_backward_kernel(m->output_type[0], + output_grad_ptr, + m->output_activation_buffer, + output_size, + stream); } else { // TODO: only support relu and sigmoid for now assert(m->activation == AC_MODE_NONE); @@ -392,28 +397,27 @@ void peft_bwd_kernel(LinearMeta const *m, // NOTE: we use alpha=1 for input_grad to accumulate gradients if (input_grad_ptr != NULL) { checkCUDA(hipblasGemmEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_N, - in_dim, - num_peft_tokens, - out_dim, - &alpha, - kernel_ptr, - weight_type, - in_dim, - output_grad_ptr, - output_type, - out_dim, - &alpha, - input_grad_ptr, - input_type, - in_dim, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + CUBLAS_OP_N, + CUBLAS_OP_N, + in_dim, + num_peft_tokens, + out_dim, + &alpha, + kernel_ptr, + weight_type, + in_dim, + output_grad_ptr, + output_type, + out_dim, + &alpha, + input_grad_ptr, + input_type, + in_dim, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } } - template void backward_kernel(LinearMeta const *m, void const *input_ptr, diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 4ac6bc253f..0f60bfe17b 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -224,7 +224,6 @@ void peft_bwd_kernel_wrapper(LinearMeta const *m, } } - void backward_kernel_wrapper(LinearMeta const *m, void const *input_ptr, void *input_grad_ptr, @@ -453,7 +452,7 @@ void peft_bwd_kernel(LinearMeta const *m, cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); // update input_grad_ptr offset - input_grad_ptr = static_cast(input_grad_ptr) + num_infr_tokens; + input_grad_ptr = static_cast
(input_grad_ptr) + num_infr_tokens; #if CUDA_VERSION >= 11000 // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; @@ -462,11 +461,17 @@ void peft_bwd_kernel(LinearMeta const *m, #endif int output_size = out_dim * num_peft_tokens; if (m->activation == AC_MODE_RELU) { - relu_backward_kernel( - m->output_type[0], output_grad_ptr, m->output_activation_buffer, output_size, stream); + relu_backward_kernel(m->output_type[0], + output_grad_ptr, + m->output_activation_buffer, + output_size, + stream); } else if (m->activation == AC_MODE_SIGMOID) { - sigmoid_backward_kernel( - m->output_type[0], output_grad_ptr, m->output_activation_buffer, output_size, stream); + sigmoid_backward_kernel(m->output_type[0], + output_grad_ptr, + m->output_activation_buffer, + output_size, + stream); } else { // TODO: only support relu and sigmoid for now assert(m->activation == AC_MODE_NONE); diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 755466a727..9866cc11d6 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -65,10 +65,10 @@ __global__ void commit_tokens_kernel( int k_array_size = kProjSize * num_active_infr_tokens_in_last_batch * num_kv_heads; - DT val = - devQKVProjArray[q_array_size + (k_cache ? 0 : k_array_size) + - head_idx * proj_size * num_active_infr_tokens_in_last_batch + - token_idx_in_last_batch * proj_size + data_idx]; + DT val = devQKVProjArray[q_array_size + (k_cache ? 0 : k_array_size) + + head_idx * proj_size * + num_active_infr_tokens_in_last_batch + + token_idx_in_last_batch * proj_size + data_idx]; int const req_id = committedTokenInfos[token_pos].request_index; int const tok_id = committedTokenInfos[token_pos].token_depth; @@ -193,7 +193,8 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, // int num_requests = bc->num_active_requests(); int processed_tokens_in_batch = 0; // int qkv_block_size = - // (m->qProjSize + m->kProjSize + m->vProjSize) * bc->num_active_infr_tokens(); + // (m->qProjSize + m->kProjSize + m->vProjSize) * + // bc->num_active_infr_tokens(); int q_block_size = m->qProjSize * bc->num_active_infr_tokens(); int kt_block_size = m->kProjSize * BatchConfig::MAX_SEQ_LENGTH; int kt_req_block_size = kt_block_size * m->num_kv_heads; @@ -238,7 +239,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_new_tokens, // num_tokens_in_branch processed_tokens_in_batch, // num_processed_tokens_in_batch - m->num_active_infr_tokens, // total_tokens_in_batch + m->num_active_infr_tokens, // total_tokens_in_batch m->num_q_heads, m->num_kv_heads, BatchConfig::MAX_SEQ_LENGTH); diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 30ed4e54eb..adff421e86 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -65,10 +65,10 @@ __global__ void commit_tokens_kernel( int k_array_size = kProjSize * num_active_infr_tokens_in_last_batch * num_kv_heads; - DT val = - devQKVProjArray[q_array_size + (k_cache ? 0 : k_array_size) + - head_idx * proj_size * num_active_infr_tokens_in_last_batch + - token_idx_in_last_batch * proj_size + data_idx]; + DT val = devQKVProjArray[q_array_size + (k_cache ? 
0 : k_array_size) + + head_idx * proj_size * + num_active_infr_tokens_in_last_batch + + token_idx_in_last_batch * proj_size + data_idx]; int const req_id = committedTokenInfos[token_pos].request_index; int const tok_id = committedTokenInfos[token_pos].token_depth; @@ -191,7 +191,8 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, // int num_requests = bc->num_active_requests(); int processed_tokens_in_batch = 0; // int qkv_block_size = - // (m->qProjSize + m->kProjSize + m->vProjSize) * bc->num_active_infr_tokens(); + // (m->qProjSize + m->kProjSize + m->vProjSize) * + // bc->num_active_infr_tokens(); int q_block_size = m->qProjSize * bc->num_active_infr_tokens(); int kt_block_size = m->kProjSize * BatchConfig::MAX_SEQ_LENGTH; int kt_req_block_size = kt_block_size * m->num_kv_heads; @@ -234,7 +235,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_new_tokens, // num_tokens_in_branch processed_tokens_in_batch, // num_processed_tokens_in_batch - m->num_active_infr_tokens, // total_tokens_in_batch + m->num_active_infr_tokens, // total_tokens_in_batch m->num_q_heads, m->num_kv_heads, BatchConfig::MAX_SEQ_LENGTH); diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index 4da520ea97..0015d958d5 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -24,7 +24,7 @@ LegionRuntime::Logger::Category log_bc("BatchConfig"); using Legion::Future; using Legion::Memory; -BatchConfig::BatchConfig() : num_infr_tokens(0), num_peft_tokens(0) { +BatchConfig::BatchConfig() : num_tokens(0) { for (int i = 0; i < MAX_NUM_REQUESTS; i++) { requestsInfo[i].token_start_offset = 0; requestsInfo[i].num_tokens_in_batch = 0; @@ -69,11 +69,11 @@ int BatchConfig::num_active_requests() const { } int BatchConfig::num_active_infr_tokens() const { - return num_infr_tokens; + return num_tokens; } int BatchConfig::num_active_peft_tokens() const { - return num_peft_tokens; + return 0; } void BatchConfig::print() const { @@ -81,8 +81,7 @@ void BatchConfig::print() const { << ") @@@@@@@@@@@@@@" << std::endl; std::cout << "Max number of requests: " << MAX_NUM_REQUESTS << std::endl; std::cout << "Max number of tokens: " << MAX_NUM_TOKENS << std::endl; - std::cout << "Number of infr tokens: " << num_infr_tokens << std::endl; - std::cout << "Number of peft tokens: " << num_peft_tokens << std::endl; + std::cout << "Number of infr tokens: " << num_tokens << std::endl; std::cout << "Number of requests: " << num_active_requests() << std::endl; // std::cout << "Cached results: " << cached_results << std::endl; @@ -103,7 +102,7 @@ void BatchConfig::print() const { } std::cout << "Per-token info:\n"; - for (int i = 0; i < num_infr_tokens + num_peft_tokens; i++) { + for (int i = 0; i < num_tokens; i++) { std::cout << " Token " << i << ":\n"; std::cout << " Absolute depth in request: " << tokensInfo[i].abs_depth_in_request << std::endl; diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index dc1a9f6611..584b8cab4c 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -292,7 +292,8 @@ FutureMap InferenceManager::inference(FFModel *model, FutureMap InferenceManager::inference(FFModel *model, int index, BatchConfigFuture const &bc) { - // log_inf_mgr.print("mode(%d) num_active_infr_tokens(%d) num_active_requests(%d)", + // log_inf_mgr.print("mode(%d) num_active_infr_tokens(%d) + // num_active_requests(%d)", // bc.get_mode(), // bc.num_active_infr_tokens(), // 
bc.num_active_requests()); diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 2f457cccf5..e1a40ca991 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -5406,7 +5406,8 @@ void register_flexflow_internal_tasks(Runtime *runtime, } } { - TaskVariantRegistrar registrar(LINEAR_PEFT_BWD_TASK_ID, "Linear PEFT Backward"); + TaskVariantRegistrar registrar(LINEAR_PEFT_BWD_TASK_ID, + "Linear PEFT Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { From da9ce1be7ef9ad2ae624d0988f094a8feee4713a Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Tue, 3 Oct 2023 17:57:38 -0400 Subject: [PATCH 003/198] implement LoraLinear --- include/flexflow/batch_config.h | 1 + include/flexflow/ffconst.h | 2 + include/flexflow/model.h | 17 + include/flexflow/operator_params.h | 2 + include/flexflow/ops/lora_linear.h | 112 +++++ src/ops/inc_multihead_self_attention.cc | 4 +- src/ops/kernels/linear_kernels.cu | 6 +- src/ops/kernels/lora_linear_kernels.cu | 373 +++++++++++++++ src/ops/lora_linear.cc | 599 ++++++++++++++++++++++++ src/runtime/batch_config.cc | 4 + src/runtime/model.cc | 49 ++ 11 files changed, 1166 insertions(+), 3 deletions(-) create mode 100644 include/flexflow/ops/lora_linear.h create mode 100644 src/ops/kernels/lora_linear_kernels.cu create mode 100644 src/ops/lora_linear.cc diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 427b2ec3ec..fc243fb365 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -42,6 +42,7 @@ class BatchConfig { using TokenId = int; BatchConfig(); int num_active_requests() const; + int num_active_tokens() const; int num_active_infr_tokens() const; int num_active_peft_tokens() const; static int max_requests_per_batch(); diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 124b46862a..37a178d952 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -172,6 +172,8 @@ enum OperatorType { OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION, OP_TREE_INC_MULTIHEAD_SELF_ATTENTION, OP_SAMPLING, + // PEFT Ops + OP_LORA_LINEAR, // Parallel Ops OP_REPARTITION, OP_COMBINE, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 526332340b..105c678ba9 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -120,6 +120,9 @@ enum TaskIDs { LINEAR_BWD_TASK_ID, LINEAR_BWD2_TASK_ID, LINEAR_UPD_TASK_ID, + LORA_LINEAR_INIT_TASK_ID, + LORA_LINEAR_INF_TASK_ID, + LORA_LINEAR_PEFT_BWD_TASK_ID, FLAT_INIT_TASK_ID, FLAT_FWD_TASK_ID, FLAT_BWD_TASK_ID, @@ -322,6 +325,7 @@ class ResidualLayerNorm; class AddBiasResidualLayerNorm; class SigmoidSiluMulti; class Linear; +class LoraLinear; class MultiHeadAttention; class IncMultiHeadSelfAttention; class TreeIncMultiHeadSelfAttention; @@ -801,6 +805,15 @@ class FFModel { bool position_bias = false, char const *name = NULL); // ======================================== + // PEFT Layers + // ======================================== + void lora_linear(Tensor const input, + Tensor const output, + int rank, + DataType data_type = DT_NONE, + Initializer *kernel_initializer = nullptr, + char const *name = nullptr); + // ======================================== // Inference APIs // ======================================== GenerationResult generate(std::vector &prompts, @@ -1179,6 +1192,10 @@ class FFModel { SigmoidSiluMulti *>, std::unordered_map, Linear *>, + std::unordered_map< + std::pair, + LoraLinearParams>, + LoraLinear *>, std::unordered_map, Pool2D *>, 
std::unordered_map; + + LoraLinear(FFModel &model, + LayerID const &layer_guid, + ParallelTensor const input, + ParallelTensor const output, + int rank, + DataType _data_type, + bool allocate_weights, + char const *name); + LoraLinear(FFModel &model, + LoraLinear const &other, + ParallelTensor const input, + ParallelTensor const output, + bool allocate_weights); + LoraLinear(FFModel &model, + Params const ¶ms, + Input const &inputs, + bool allocate_weights = false, + char const *name = nullptr); + + void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void forward(FFModel const &) override; + void backward(FFModel const &) override; + Legion::FutureMap inference(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void print_layer(FFModel const &model) override; + static Op * + create_operator_from_layer(FFModel &model, + Layer const *layer, + std::vector const &inputs); + static OpMeta *init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void forward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + bool measure_operator_cost(Simulator *sim, + MachineView const &pc, + CostMetrics &cost_metrics) const override; + void serialize(Legion::Serializer &) const override; + static PCG::Node deserialize(FFModel &ff, + Legion::Deserializer &d, + ParallelTensor inputs[], + int num_inputs); + + // size_t get_params_hash() const override; + LoraLinearParams get_params() const; + +private: + LoraLinear(int guid, + bool profiling, + ParallelTensor const input, + ParallelTensor const output, + int rank, + bool allocate_weights, + char const *name); + + void register_mappings(); + void register_output_mappings(); + void register_weight_mappings(); + +public: + int rank; +}; + +}; // namespace FlexFlow + +#endif // _FLEXLOW_LORA_LINEAR_FIRST_H diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index bb444ea0ab..1978497c14 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -365,7 +365,9 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( dims, quantization_type == DT_NONE ? this->data_type : quantization_type, nullptr /*owner_op*/, - true /*create_grad*/, + model.config.computationMode == COMP_MODE_INFERENCE + ? 
false + : true /*create_grad*/, initializer, CHOSEN_SYNC_TYPE); if (qkv_bias || final_bias) { diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 0f60bfe17b..edf3cdaf07 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -451,8 +451,10 @@ void peft_bwd_kernel(LinearMeta const *m, cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); - // update input_grad_ptr offset - input_grad_ptr = static_cast
<DT *>(input_grad_ptr) + num_infr_tokens; + // update input_grad_ptr and output_grad_ptr offset + input_grad_ptr = static_cast<DT *>
(input_grad_ptr) + num_infr_tokens * in_dim; + output_grad_ptr = + static_cast<DT *>
(output_grad_ptr) + num_infr_tokens * out_dim; #if CUDA_VERSION >= 11000 // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu new file mode 100644 index 0000000000..94b62bb399 --- /dev/null +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -0,0 +1,373 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ffconst_utils.h" +#include "flexflow/ops/kernels/decompress_kernels.h" +#include "flexflow/ops/kernels/lora_linear_kernels.h" +#include "flexflow/utils/cuda_helper.h" + +namespace FlexFlow { + +LoraLinearMeta::LoraLinearMeta(FFHandler handler, LoraLinear const *li) + : OpMeta(handler, li) {} + +LoraLinearMeta::~LoraLinearMeta(void) {} + +namespace Kernels { +namespace LoraLinear { + +void inference_kernel_wrapper(LoraLinearMeta *m, + void const *input_ptr, + void *output_ptr, + void const *weight_first_ptr, + void const *weight_second_ptr, + int in_dim, + int out_dim, + int rank, + int num_infr_tokens, + int num_peft_tokens) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + if (m->input_type[0] == DT_FLOAT) { + Internal::inference_kernel(m, + input_ptr, + output_ptr, + weight_first_ptr, + weight_second_ptr, + in_dim, + out_dim, + rank, + num_infr_tokens, + num_peft_tokens, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::inference_kernel(m, + input_ptr, + output_ptr, + weight_first_ptr, + weight_second_ptr, + in_dim, + out_dim, + rank, + num_infr_tokens, + num_peft_tokens, + stream); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("%s [LoraLinear] forward time = %.2lfms\n", m->op_name, elapsed); + // print_tensor((float*)input_ptr, in_dim * batch_size, + // "[LoraLinear:forward:input]"); print_tensor((float*)weight_ptr, + // in_dim + // * out_dim, "[LoraLinear:forward:kernel]"); + // print_tensor((float*)output_ptr, out_dim * batch_size, + // "[LoraLinear:forward:output]"); + } +} + +void peft_bwd_kernel_wrapper(LoraLinearMeta *m, + void *input_grad_ptr, + void const *output_grad_ptr, + void const *weight_first_ptr, + void const *weight_second_ptr, + void *weight_first_grad_ptr, + void *weight_second_grad_ptr, + int in_dim, + int out_dim, + int rank, + int num_infr_tokens, + int num_peft_tokens) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + if (m->input_type[0] == 
DT_FLOAT) { + Internal::peft_bwd_kernel(m, + input_grad_ptr, + output_grad_ptr, + weight_first_ptr, + weight_second_ptr, + weight_first_grad_ptr, + weight_second_grad_ptr, + in_dim, + out_dim, + rank, + num_infr_tokens, + num_peft_tokens, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::peft_bwd_kernel(m, + input_grad_ptr, + output_grad_ptr, + weight_first_ptr, + weight_second_ptr, + weight_first_grad_ptr, + weight_second_grad_ptr, + in_dim, + out_dim, + rank, + num_infr_tokens, + num_peft_tokens, + stream); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("%s [LoraLinear] PEFT Bwd time = %.2lfms\n", m->op_name, elapsed); + // print_tensor((float*)input_ptr, in_dim * batch_size, + // "[LoraLinear:forward:input]"); print_tensor((float*)weight_ptr, + // in_dim + // * out_dim, "[LoraLinear:forward:kernel]"); + // print_tensor((float*)output_ptr, out_dim * batch_size, + // "[LoraLinear:forward:output]"); + } +} + +namespace Internal { + +template +void inference_kernel(LoraLinearMeta *m, + void const *input_ptr, + void *output_ptr, + void const *weight_first_ptr, + void const *weight_second_ptr, + int in_dim, + int out_dim, + int rank, + int num_infr_tokens, + int num_peft_tokens, + ffStream_t stream) { + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + DT alpha = 1.0f, beta = 0.0f; + cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); + cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); + assert(m->weight_type[1] == weight_type); + cudaDataType_t output_type = ff_to_cuda_datatype(m->input_type[1]); + cudaDataType_t lr_actv_type = output_type; + assert(input_type == weight_type && weight_type == output_type); + // adjust input_ptr and output_ptr offset + // TODO: we currently assume that all inference tokens do not use LoRA + input_ptr = static_cast
<DT *>(input_ptr) + num_infr_tokens * in_dim; + output_ptr = static_cast<DT *>
(output_ptr) + num_infr_tokens * out_dim; + +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#else + cudaDataType_t compute_type = input_type; +#endif + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = allocator->allocate_instance_untyped( + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim); + m->low_rank_activation = allocator->allocate_instance_untyped( + data_type_size(m->input_type[1]) * num_peft_tokens * rank); + // copy input activation + checkCUDA(cudaMemcpyAsync(m->input_activation, + input_ptr, + data_type_size(m->input_type[0]) * num_peft_tokens * + in_dim, + cudaMemcpyDeviceToDevice, + stream)); + // buffer = weight_first * input + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + rank, + num_peft_tokens, + in_dim, + &alpha, + weight_first_ptr, + weight_type, + in_dim, + input_ptr, + input_type, + in_dim, + &beta, + m->low_rank_activation, + lr_actv_type, + rank, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // output = weight_second * buffer + // Note that we use alpha in both places since we do + // an in-place update for LoraLinear + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + out_dim, + num_peft_tokens, + rank, + &alpha, + weight_second_ptr, + weight_type, + rank, + m->low_rank_activation, + lr_actv_type, + rank, + &alpha, + output_ptr, + output_type, + out_dim, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); +} + +template +void peft_bwd_kernel(LoraLinearMeta *m, + void *input_grad_ptr, + void const *output_grad_ptr, + void const *weight_first_ptr, + void const *weight_second_ptr, + void *weight_first_grad_ptr, + void *weight_second_grad_ptr, + int in_dim, + int out_dim, + int rank, + int num_infr_tokens, + int num_peft_tokens, + ffStream_t stream) { + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + DT alpha = 1.0f; + cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); + cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); + assert(weight_type == ff_to_cuda_datatype(m->weight_type[1])); + cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); + cudaDataType_t lr_actv_type = output_type; + // update input_grad_ptr and output_grad_ptr offset + input_grad_ptr = static_cast
<DT *>(input_grad_ptr) + num_infr_tokens * in_dim; + output_grad_ptr = + static_cast<DT *>
(output_grad_ptr) + num_infr_tokens * out_dim; +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#else + cudaDataType_t compute_type = CUDA_R_32F; +#endif + // Compute weight_second gradiant + // NOTE: we use alpha=1 for weight_second_grad to accumulate gradients + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + rank, + out_dim, + num_peft_tokens, + &alpha, + m->low_rank_activation, + lr_actv_type, + rank, + output_grad_ptr, + output_type, + out_dim, + &alpha, + weight_second_grad_ptr, + weight_type, + rank, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // Compute gradiants w.r.t. low_rank activation + // and save the results to low_rank_activation + // NOTE: we use alpha=1 for input_grad to accumulate gradients + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_N, + rank, + num_peft_tokens, + out_dim, + &alpha, + weight_second_ptr, + weight_type, + rank, + output_grad_ptr, + output_type, + out_dim, + &alpha, + m->low_rank_activation, + lr_actv_type, + rank, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // Compute weight_first gradiant + // NOTE: we use alpha=1 for kernel_grad to accumulate gradients + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + in_dim, + rank, + num_peft_tokens, + &alpha, + m->input_activation, + input_type, + in_dim, + m->low_rank_activation, + lr_actv_type, + rank, + &alpha, + weight_first_grad_ptr, + weight_type, + in_dim, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // Compute input gradiant + // NOTE: we use alpha=1 for input_grad to accumulate gradients + if (input_grad_ptr != nullptr) { + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_N, + in_dim, + num_peft_tokens, + rank, + &alpha, + weight_first_ptr, + weight_type, + in_dim, + m->low_rank_activation, + lr_actv_type, + rank, + &alpha, + input_grad_ptr, + input_type, + in_dim, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } +} + +} // namespace Internal +} // namespace LoraLinear +} // namespace Kernels +} // namespace FlexFlow diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc new file mode 100644 index 0000000000..e9da54b04b --- /dev/null +++ b/src/ops/lora_linear.cc @@ -0,0 +1,599 @@ +#include "flexflow/ops/lora_linear.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/layer.h" +#include "flexflow/model.h" +#include "flexflow/ops/kernels/lora_linear_kernels.h" +#include "flexflow/utils/hash_utils.h" +#include "legion/legion_utilities.h" + +namespace FlexFlow { + +// declare Legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::Future; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::InlineLauncher; +using Legion::Machine; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; + +using namespace FlexFlow::Kernels::LoraLinear; + +void FFModel::lora_linear(Tensor const input, + Tensor const output, + int rank, + DataType data_type, + Initializer *kernel_initializer, + char const *name) { + if (data_type == DT_NONE) { + data_type = input->data_type; + } + Layer *li = nullptr; + if (data_type != input->data_type) { + Tensor casted_input = cast(input, data_type, "type cast for dense"); + li = new 
Layer(this, + OP_LORA_LINEAR, + data_type, + name, + 2 /*inputs*/, + 2 /*weights*/, + 0 /*outputs*/, + casted_input); + } else { + li = new Layer(this, + OP_LORA_LINEAR, + data_type, + name, + 2 /*inputs*/, + 2 /*weights*/, + 0 /*outputs*/, + input); + } + { + int dims[2] = {input->dims[0], rank}; + li->weights[0] = create_weight_legion_ordering(2, + dims, + data_type, + li, + true /*create_grad*/, + kernel_initializer, + CHOSEN_SYNC_TYPE); + } + { + int dims[2] = {rank, output->dims[0]}; + li->weights[1] = create_weight_legion_ordering(2, + dims, + data_type, + li, + true /*create_grad*/, + kernel_initializer, + CHOSEN_SYNC_TYPE); + } + li->add_int_property("rank", rank); + layers.push_back(li); +} + +Op *LoraLinear::create_operator_from_layer( + FFModel &model, + Layer const *layer, + std::vector const &inputs) { + long long value; + layer->get_int_property("rank", value); + int rank = (int)value; + return new LoraLinear(model, + layer->layer_guid, + inputs[0], + inputs[1], + rank, + layer->data_type, + false /*allocate_weights*/, + layer->name); +} + +LoraLinear::LoraLinear(FFModel &model, + LoraLinear const &other, + ParallelTensor const input, + ParallelTensor const output, + bool allocate_weights) + : LoraLinear(model, + other.layer_guid, + input, + output, + other.rank, + other.data_type, + allocate_weights, + other.name) {} + +LoraLinear::LoraLinear(FFModel &model, + Params const ¶ms, + Input const &inputs, + bool allocate_weights, + char const *name) + : LoraLinear(model, + params.layer_guid, + inputs.first, + inputs.second, + params.rank, + params.data_type, + allocate_weights, + name) {} + +LoraLinear::LoraLinear(FFModel &model, + LayerID const &_layer_guid, + ParallelTensor const _input, + ParallelTensor const _output, + int _rank, + DataType _data_type, + bool allocate_weights, + char const *name) + : Op(model, + OP_LORA_LINEAR, + _data_type, + name, + 2 /*inputs*/, + 2 /*weights*/, + allocate_weights, + 0 /*outputs*/, + _input, + _output), + rank(_rank) { + // overwrite layer_guid + layer_guid = _layer_guid; + data_type = _data_type; + + ParallelTensorShape input_shape = this->inputs[0]->get_shape(); + LoraLinearParams params = this->get_params(); + + if (allocate_weights) { + Initializer *kernel_initializer = new GlorotUniform(std::rand() /*seed*/); + // create weight first + { + ParallelDim dims[2]; + int num_dims = inputs[0]->num_dims; + dims[1] = inputs[0]->dims[num_dims - 1]; // data parallel + dims[1].size = dims[1].degree; + dims[1].is_replica_dim = true; + dims[0] = inputs[0]->dims[0]; + dims[0].size = inputs[0]->dims[0].size * rank; + weights[0] = + model.create_parallel_weight_legion_ordering(2, + dims, + this->data_type, + nullptr /*owner_op*/, + true /*create_grad*/, + kernel_initializer, + CHOSEN_SYNC_TYPE); + } + // create weight second + { + ParallelDim dims[2]; + int num_dims = inputs[0]->num_dims; + dims[1] = inputs[0]->dims[0]; + dims[1].size = dims[1].degree; + dims[1].is_replica_dim = true; + dims[0] = inputs[1]->dims[0]; + dims[0].size = inputs[1]->dims[0].size * rank; + weights[1] = + model.create_parallel_weight_legion_ordering(2, + dims, + this->data_type, + nullptr /*owner_op*/, + true /*create_grad*/, + kernel_initializer, + CHOSEN_SYNC_TYPE); + } + } + + // assert(check_output_input_weight_parallel_dims(allocate_weights)); +} + +void LoraLinear::init(FFModel const &ff) { + assert(false && "LoraLinear does not support normal init"); +} + +void LoraLinear::init_inference( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const 
&batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + assert(batch_inputs.size() == 2); + assert(batch_outputs.size() == 0); + // assert(check_output_input_weight_same_machine_view()); + // output is considered as an input to allow in-place optimization + ParallelTensor output_tensor = batch_inputs[1]; + parallel_is = output_tensor->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? mv : &output_tensor->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, output_tensor); + IndexLauncher launcher(LORA_LINEAR_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(LoraLinear)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(2, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(3, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, output_tensor); +} + +/* + regions[0](O): output + regions[1](I): kernel + regions[2](I): bias +*/ +OpMeta *LoraLinear::init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + LoraLinear const *lora = (LoraLinear *)task->args; + FFHandler handle = *((FFHandler const *)task->local_args); + GenericTensorAccessorR input = + helperGetGenericTensorAccessorRO(lora->inputs[0]->data_type, + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW output = + helperGetGenericTensorAccessorRW(lora->inputs[1]->data_type, + regions[1], + task->regions[1], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW weight_first = + helperGetGenericTensorAccessorRW(lora->weights[0]->data_type, + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW weight_second = + helperGetGenericTensorAccessorRW(lora->weights[1]->data_type, + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + int rank = lora->rank; + int batch_size = output.domain.get_volume() / out_dim; + assert(input.domain.get_volume() == in_dim * batch_size); + assert(weight_first.domain.get_volume() == in_dim * rank); + assert(weight_second.domain.get_volume() == out_dim * rank); + + LoraLinearMeta *m = new LoraLinearMeta(handle, lora); + m->trainable_inputs[0] = lora->trainable_inputs[0]; + std::strcpy(m->op_name, lora->name); + + return m; +} + +void LoraLinear::forward(FFModel const &ff) { + assert(false && "LoraLinear does not support normal init"); +} + +FutureMap + LoraLinear::inference(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const 
&batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + assert(batch_inputs.size() == 2); + assert(batch_outputs.size() == 0); + // assert(check_output_input_weight_same_machine_view()); + // output is considered as an input to allow in-place optimization + ParallelTensor output_tensor = batch_inputs[1]; + parallel_is = output_tensor->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? mv : &output_tensor->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_inference(ff, argmap, output_tensor); + IndexLauncher launcher(LORA_LINEAR_INF_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(2, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(3, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +void LoraLinear::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + LoraLinearMeta *m = *((LoraLinearMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + assert(regions.size() == 4); + assert(task->regions.size() == regions.size()); + assert(m->input_type[0] == m->output_type[0]); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorRW( + m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight_first = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight_second = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + int rank = weight_first.domain.get_volume() / in_dim; + assert(in_dim * rank == weight_first.domain.get_volume()); + assert(out_dim * rank == weight_second.domain.get_volume()); + + int num_infr_tokens = bc->num_active_infr_tokens(); + int num_peft_tokens = bc->num_active_peft_tokens(); + inference_kernel_wrapper(m, + input.ptr, + output.ptr, + weight_first.ptr, + weight_second.ptr, + in_dim, + out_dim, + rank, + num_infr_tokens, + num_peft_tokens); +} + +FutureMap LoraLinear::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; 
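// For reference, a hedged sketch of what the launcher set up below wires together
// (names follow lora_linear_kernels.cu in this patch; A/B and lr_grad are shorthand
// for weight_first (in_dim x rank), weight_second (rank x out_dim), and the gradient
// of the rank-r activation; column-major cuBLAS layout assumed). Six regions are
// bound: the input gradient (RW), the output gradient read from the shared
// input[1]/output tensor, the two LoRA weights (RO), and their gradients (RW).
// peft_bwd_kernel first offsets the two gradient pointers past the
// num_infr_tokens inference tokens, then accumulates with alpha = 1:
//   grad_B     += low_rank_activation * output_grad^T
//   lr_grad     = B * output_grad            (accumulated into low_rank_activation)
//   grad_A     += input_activation * lr_grad^T
//   input_grad += A * lr_grad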
+ ParallelTensor output_tensor = batch_inputs[1]; + parallel_is = output_tensor->parallel_is; + MachineView const *view = mv ? mv : &output_tensor->machine_view; + set_argumentmap_for_inference(ff, argmap, output_tensor); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(LORA_LINEAR_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(2, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(3, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[0]->region_grad)); + launcher.add_field(4, FID_DATA); + launcher.add_region_requirement(RegionRequirement(weights[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[1]->region_grad)); + launcher.add_field(5, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +void LoraLinear::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + Domain input_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + LoraLinearMeta *m = *((LoraLinearMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_tokens == 0) { + return; + } + assert(regions.size() == 6); + assert(task->regions.size() == regions.size()); + assert(m->input_type[0] == m->output_type[0]); + + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output_grad = helperGetGenericTensorAccessorRW( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight_first = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight_second = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); + GenericTensorAccessorW weight_first_grad = helperGetGenericTensorAccessorRW( + m->weight_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime); + GenericTensorAccessorW weight_second_grad = helperGetGenericTensorAccessorRW( + m->weight_type[0], regions[5], task->regions[5], FID_DATA, ctx, runtime); + + int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + int rank = weight_first.domain.get_volume() / in_dim; + assert(in_dim * rank == weight_first.domain.get_volume()); + assert(out_dim * rank == weight_second.domain.get_volume()); + assert(weight_first.domain == weight_first_grad.domain); + assert(weight_second.domain == weight_second_grad.domain); + + int 
num_infr_tokens = bc->num_active_infr_tokens(); + int num_peft_tokens = bc->num_active_peft_tokens(); + peft_bwd_kernel_wrapper(m, + input_grad.ptr, + output_grad.ptr, + weight_first.ptr, + weight_second.ptr, + weight_first_grad.ptr, + weight_second_grad.ptr, + in_dim, + out_dim, + rank, + num_infr_tokens, + num_peft_tokens); +} + +void LoraLinear::backward(FFModel const &ff) { + assert(false && "LoraLinear does not support normal backward"); +} + +void LoraLinear::print_layer(FFModel const &ff) {} + +bool LoraLinear::measure_operator_cost(Simulator *sim, + MachineView const &mv, + CostMetrics &cost_metrics) const { + return false; +} + +bool operator==(LoraLinearParams const &lhs, LoraLinearParams const &rhs) { + return lhs.layer_guid == rhs.layer_guid && lhs.rank == rhs.rank && + lhs.data_type == rhs.data_type; +} + +void LoraLinear::serialize(Legion::Serializer &sez) const { + sez.serialize(this->layer_guid.id); + sez.serialize(this->layer_guid.transformer_layer_id); + sez.serialize(this->rank); + sez.serialize(this->data_type); +} + +/* static */ +using PCG::Node; +Node LoraLinear::deserialize(FFModel &ff, + Legion::Deserializer &dez, + ParallelTensor inputs[], + int num_inputs) { + assert(num_inputs == 2); + int rank; + DataType data_type; + size_t id, transformer_layer_id; + dez.deserialize(id); + dez.deserialize(transformer_layer_id); + LayerID layer_guid(id, transformer_layer_id); + dez.deserialize(rank); + dez.deserialize(data_type); + + LoraLinearParams params; + params.rank = rank; + params.data_type = data_type; + params.layer_guid = layer_guid; + return ff.get_or_create_node({inputs[0], inputs[1]}, params); +} + +LoraLinearParams LoraLinear::get_params() const { + LoraLinearParams params; + params.layer_guid = this->layer_guid; + params.rank = this->rank; + params.data_type = this->data_type; + return params; +} + +bool LoraLinearParams::is_valid( + std::pair const &input_shape) + const { + return true; +} + +}; // namespace FlexFlow + +namespace std { +size_t hash::operator()( + FlexFlow::LoraLinearParams const ¶ms) const { + size_t key = 0; + hash_combine(key, params.layer_guid.id); + hash_combine(key, params.rank); + hash_combine(key, params.data_type); + return key; +} +}; // namespace std diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index 6eb2c163ce..33567832f5 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -69,6 +69,10 @@ int BatchConfig::num_active_requests() const { return num_requests; } +int BatchConfig::num_active_tokens() const { + return num_tokens; +} + int BatchConfig::num_active_infr_tokens() const { return num_tokens; } diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 263405f8ab..c77c4d2432 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -47,6 +47,7 @@ #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" +#include "flexflow/ops/lora_linear.h" #include "flexflow/ops/noop.h" #include "flexflow/ops/pool_2d.h" #include "flexflow/ops/reduce.h" @@ -6211,6 +6212,54 @@ void register_flexflow_internal_tasks(Runtime *runtime, TreeIncMultiHeadSelfAttention::inference_task>(registrar); } } + // PEFT tasks + // LoraLinear tasks + { + TaskVariantRegistrar registrar(LORA_LINEAR_INIT_TASK_ID, "LoraLinear Init"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "LoraLinear Init Task"); + } else { + if 
(enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + { + TaskVariantRegistrar registrar(LORA_LINEAR_INF_TASK_ID, + "LoraLinear Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "LoraLinear Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(LORA_LINEAR_PEFT_BWD_TASK_ID, + "LoraLinear PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "LoraLinear PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + // NoOp { TaskVariantRegistrar registrar(NOOP_INIT_TASK_ID, "Weight NCCL Init"); From 66230bd1d6d50f9094d97ea31892df6f4ffa6ca8 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Tue, 3 Oct 2023 17:58:03 -0400 Subject: [PATCH 004/198] add missing files --- .../ops/kernels/lora_linear_kernels.h | 80 +++++++++++++++++++ include/flexflow/ops/lora_linear_params.h | 32 ++++++++ 2 files changed, 112 insertions(+) create mode 100644 include/flexflow/ops/kernels/lora_linear_kernels.h create mode 100644 include/flexflow/ops/lora_linear_params.h diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h new file mode 100644 index 0000000000..520030ece5 --- /dev/null +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -0,0 +1,80 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_LINEAR_KERNELS_H +#define _FLEXFLOW_OPS_KERNELS_LINEAR_KERNELS_H + +#include "flexflow/device.h" +#include "flexflow/fftype.h" +#include "flexflow/op_meta.h" +#include "flexflow/ops/lora_linear.h" + +namespace FlexFlow { + +class LoraLinearMeta : public OpMeta { +public: + LoraLinearMeta(FFHandler handle, + LoraLinear const *li); + ~LoraLinearMeta(void); + char op_name[MAX_OPNAME]; + // PEFT related fields + void *low_rank_activation; + void *input_activation; +}; + +namespace Kernels { +namespace LoraLinear { +void inference_kernel_wrapper(LoraLinearMeta *m, + void const *input_ptr, + void *output_ptr, + void const *weight_first_ptr, + void const *weight_second_ptr, + int in_dim, + int out_dim, + int rank, + int num_infr_tokens, + int num_peft_tokens); +void peft_bwd_kernel_wrapper(LoraLinearMeta *m, + void *input_grad_ptr, + void const *output_grad_ptr, + void const *weight_first_ptr, + void const *weight_second_ptr, + void *weight_first_grad_ptr, + void *weight_second_grad_ptr, + int in_dim, + int out_dim, + int rank, + int num_infr_tokens, + int num_peft_tokens); +bool use_activation(ActiMode mode); + +namespace Internal { +template +void inference_kernel(LoraLinearMeta *m, + void const *input_ptr, + void *output_ptr, + void const *weight_first_ptr, + void const *weight_second_ptr, + int in_dim, + int out_dim, + int rank, + int num_infr_tokens, + int num_peft_tokens, + ffStream_t stream); +template +void peft_bwd_kernel(LoraLinearMeta *m, + void *input_grad_ptr, + void const *output_grad_ptr, + void const *weight_first_ptr, + void const *weight_second_ptr, + void *weight_first_grad_ptr, + void *weight_second_grad_ptr, + int in_dim, + int out_dim, + int rank, + int num_infr_tokens, + int 
num_peft_tokens, + ffStream_t stream); +} // namespace Internal +} // namespace LoraLinear +} // namespace Kernels +} // namespace FlexFlow + +#endif // _FLEXFLOW_OPS_KERNELS_LINEAR_KERNELS_H diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h new file mode 100644 index 0000000000..545b39d8de --- /dev/null +++ b/include/flexflow/ops/lora_linear_params.h @@ -0,0 +1,32 @@ +#ifndef _FLEXFLOW_LORA_LINEAR_PARAMS_H +#define _FLEXFLOW_LORA_LINEAR_PARAMS_H + +#include "flexflow/ffconst.h" +#include "flexflow/fftype.h" +#include "flexflow/op_meta.h" +#include "flexflow/operator.h" +#include "flexflow/parallel_tensor.h" + +namespace FlexFlow { + +class LoraLinearParams { +public: + LayerID layer_guid; + int rank; + DataType data_type; + + bool is_valid( + std::pair const &input_shape) const; + friend bool operator==(LoraLinearParams const &lhs, LoraLinearParams const &rhs); +}; + +} // namespace FlexFlow + +namespace std { +template <> +struct hash { + size_t operator()(FlexFlow::LoraLinearParams const &) const; +}; +} // namespace std + +#endif // _FLEXFLOW_LORA_LINEAR_PARAMS_H From f0d1155a6334b4a6babb5dbe9c8d65a208c10978 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Tue, 3 Oct 2023 18:02:38 -0400 Subject: [PATCH 005/198] format --- include/flexflow/ops/kernels/lora_linear_kernels.h | 3 +-- include/flexflow/ops/lora_linear_params.h | 7 ++++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h index 520030ece5..753167c9c4 100644 --- a/include/flexflow/ops/kernels/lora_linear_kernels.h +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -10,8 +10,7 @@ namespace FlexFlow { class LoraLinearMeta : public OpMeta { public: - LoraLinearMeta(FFHandler handle, - LoraLinear const *li); + LoraLinearMeta(FFHandler handle, LoraLinear const *li); ~LoraLinearMeta(void); char op_name[MAX_OPNAME]; // PEFT related fields diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h index 545b39d8de..a19a2ff298 100644 --- a/include/flexflow/ops/lora_linear_params.h +++ b/include/flexflow/ops/lora_linear_params.h @@ -15,9 +15,10 @@ class LoraLinearParams { int rank; DataType data_type; - bool is_valid( - std::pair const &input_shape) const; - friend bool operator==(LoraLinearParams const &lhs, LoraLinearParams const &rhs); + bool is_valid(std::pair const + &input_shape) const; + friend bool operator==(LoraLinearParams const &lhs, + LoraLinearParams const &rhs); }; } // namespace FlexFlow From fb203cced48365db63226444c2b3270e5d70a4c2 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Tue, 3 Oct 2023 20:55:56 -0400 Subject: [PATCH 006/198] LoraLinear now takes two inputs and generates one output --- include/flexflow/ops/lora_linear.h | 5 +- inference/file_loader.cc | 4 ++ inference/models/llama.cc | 2 + src/ops/lora_linear.cc | 93 ++++++++++++++++++++---------- src/runtime/ffconst_utils.cc | 3 + src/runtime/graph.cc | 6 ++ src/runtime/model.cc | 6 ++ 7 files changed, 89 insertions(+), 30 deletions(-) diff --git a/include/flexflow/ops/lora_linear.h b/include/flexflow/ops/lora_linear.h index f60ee4c17b..fff3927ff1 100644 --- a/include/flexflow/ops/lora_linear.h +++ b/include/flexflow/ops/lora_linear.h @@ -54,6 +54,7 @@ class LoraLinear : public Op { std::vector const &, MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override; + void map_output_tensors(FFModel &model) override; static 
Op * create_operator_from_layer(FFModel &model, Layer const *layer, @@ -86,7 +87,9 @@ class LoraLinear : public Op { Legion::Deserializer &d, ParallelTensor inputs[], int num_inputs); - + Op *materialize(FFModel &ff, + ParallelTensor inputs[], + int num_inputs) const override; // size_t get_params_hash() const override; LoraLinearParams get_params() const; diff --git a/inference/file_loader.cc b/inference/file_loader.cc index dc724319d2..f11df920e3 100644 --- a/inference/file_loader.cc +++ b/inference/file_loader.cc @@ -764,6 +764,10 @@ void FileDataLoader::load_weights(FFModel *ff, bool use_full_precision) { if (weight == NULL) { continue; } + // TODO: currently skip Lora layers + if (l->op_type == OP_LORA_LINEAR) { + continue; + } switch (weight->data_type) { case DT_HALF: load_single_weight_tensor(ff, l, i); diff --git a/inference/models/llama.cc b/inference/models/llama.cc index b8fe70526d..da8fc4ee63 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -219,6 +219,8 @@ void LLAMA::create_llama_model(FFModel &ff, 0.0f, std::string("layers_" + std::to_string(i) + "_feed_forward_w2") .c_str()); + // Low-Rank Adapter (LoRA) for the second linear layer + ff.lora_linear(multi, w2, 16 /*rank*/); } // final normalization and linear Tensor final_rms_norm_output[2] = {nullptr, nullptr}; diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index e9da54b04b..bbfa120886 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -39,26 +39,26 @@ void FFModel::lora_linear(Tensor const input, if (data_type == DT_NONE) { data_type = input->data_type; } + assert(data_type == input->data_type); + assert(data_type == output->data_type); Layer *li = nullptr; - if (data_type != input->data_type) { - Tensor casted_input = cast(input, data_type, "type cast for dense"); - li = new Layer(this, - OP_LORA_LINEAR, - data_type, - name, - 2 /*inputs*/, - 2 /*weights*/, - 0 /*outputs*/, - casted_input); - } else { - li = new Layer(this, - OP_LORA_LINEAR, - data_type, - name, - 2 /*inputs*/, - 2 /*weights*/, - 0 /*outputs*/, - input); + li = new Layer(this, + OP_LORA_LINEAR, + data_type, + name, + 2 /*inputs*/, + 2 /*weights*/, + 1 /*outputs*/, + input, + output); + { + int numdims = output->num_dims; + int dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdims; i++) { + dims[i] = output->dims[i]; + } + li->outputs[0] = create_tensor_legion_ordering( + numdims, dims, data_type, li, 0, true /*create_grad*/); } { int dims[2] = {input->dims[0], rank}; @@ -144,7 +144,7 @@ LoraLinear::LoraLinear(FFModel &model, 2 /*inputs*/, 2 /*weights*/, allocate_weights, - 0 /*outputs*/, + 1 /*outputs*/, _input, _output), rank(_rank) { @@ -194,7 +194,16 @@ LoraLinear::LoraLinear(FFModel &model, CHOSEN_SYNC_TYPE); } } - + // Create output tensor + { + int numdim = inputs[1]->num_dims; + ParallelDim dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdim; i++) { + dims[i] = inputs[1]->dims[i]; + } + outputs[0] = model.create_parallel_tensor_legion_ordering( + numdim, dims, inputs[1]->data_type, this); + } // assert(check_output_input_weight_parallel_dims(allocate_weights)); } @@ -209,10 +218,12 @@ void LoraLinear::init_inference( MachineView const *mv) { assert(check_output_input_weight_same_parallel_is()); assert(batch_inputs.size() == 2); - assert(batch_outputs.size() == 0); + assert(batch_outputs.size() == 1); + // Assert that the output is the same as the second input + assert(batch_outputs[0] == batch_inputs[1]); // assert(check_output_input_weight_same_machine_view()); // output is considered as an 
input to allow in-place optimization - ParallelTensor output_tensor = batch_inputs[1]; + ParallelTensor output_tensor = batch_outputs[0]; parallel_is = output_tensor->parallel_is; ArgumentMap argmap; Context ctx = ff.config.lg_ctx; @@ -276,7 +287,7 @@ OpMeta *LoraLinear::init_task(Task const *task, ctx, runtime); GenericTensorAccessorW output = - helperGetGenericTensorAccessorRW(lora->inputs[1]->data_type, + helperGetGenericTensorAccessorRW(lora->outputs[0]->data_type, regions[1], task->regions[1], FID_DATA, @@ -323,10 +334,12 @@ FutureMap MachineView const *mv) { assert(check_output_input_weight_same_parallel_is()); assert(batch_inputs.size() == 2); - assert(batch_outputs.size() == 0); + assert(batch_outputs.size() == 1); + // Assert that the output is the same as the second input + assert(batch_outputs[0] == batch_inputs[1]); // assert(check_output_input_weight_same_machine_view()); // output is considered as an input to allow in-place optimization - ParallelTensor output_tensor = batch_inputs[1]; + ParallelTensor output_tensor = batch_outputs[0]; parallel_is = output_tensor->parallel_is; ArgumentMap argmap; Context ctx = ff.config.lg_ctx; @@ -416,10 +429,14 @@ FutureMap LoraLinear::peft_bwd(FFModel const &ff, std::vector const &batch_inputs, std::vector const &batch_outputs, MachineView const *mv) { + assert(batch_inputs.size() == 2); + assert(batch_outputs.size() == 1); + // Assert that the output is the same as the second input + assert(batch_outputs[0] == batch_inputs[1]); ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; - ParallelTensor output_tensor = batch_inputs[1]; + ParallelTensor output_tensor = batch_outputs[0]; parallel_is = output_tensor->parallel_is; MachineView const *view = mv ? mv : &output_tensor->machine_view; set_argumentmap_for_inference(ff, argmap, output_tensor); @@ -494,11 +511,11 @@ void LoraLinear::peft_bwd_task(Task const *task, GenericTensorAccessorR weight_first = helperGetGenericTensorAccessorRO( m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); GenericTensorAccessorR weight_second = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); + m->weight_type[1], regions[3], task->regions[3], FID_DATA, ctx, runtime); GenericTensorAccessorW weight_first_grad = helperGetGenericTensorAccessorRW( m->weight_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime); GenericTensorAccessorW weight_second_grad = helperGetGenericTensorAccessorRW( - m->weight_type[0], regions[5], task->regions[5], FID_DATA, ctx, runtime); + m->weight_type[1], regions[5], task->regions[5], FID_DATA, ctx, runtime); int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; @@ -530,6 +547,17 @@ void LoraLinear::backward(FFModel const &ff) { void LoraLinear::print_layer(FFModel const &ff) {} +void LoraLinear::map_output_tensors(FFModel &ff) { + assert(numOutputs == 1); + assert(numInputs == 2); + assert(outputs[0]->get_volume() == inputs[1]->get_volume()); + outputs[0]->parallel_is = inputs[1]->parallel_is; + outputs[0]->region = inputs[1]->region; + outputs[0]->part = inputs[1]->part; + outputs[0]->region_grad = inputs[1]->region_grad; + outputs[0]->part_grad = inputs[1]->part_grad; +} + bool LoraLinear::measure_operator_cost(Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const { @@ -571,6 +599,13 @@ Node LoraLinear::deserialize(FFModel &ff, return 
ff.get_or_create_node({inputs[0], inputs[1]}, params); } +Op *LoraLinear::materialize(FFModel &ff, + ParallelTensor inputs[], + int num_inputs) const { + LoraLinearParams params = get_params(); + return new LoraLinear(ff, params, {inputs[0], inputs[1]}, this->name); +} + LoraLinearParams LoraLinear::get_params() const { LoraLinearParams params; params.layer_guid = this->layer_guid; diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index c7b6e1257a..47abcacd6a 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -188,6 +188,9 @@ std::string get_operator_type_name(OperatorType type) { return "Sampling"; case OP_ARGMAX: return "ArgMax"; + // PEFT Ops + case OP_LORA_LINEAR: + return "LoraLinear"; // Parallel Ops case OP_REPARTITION: return "Repartition"; diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 408de57c54..2ed57cd21e 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -36,6 +36,7 @@ #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" +#include "flexflow/ops/lora_linear.h" #include "flexflow/ops/noop.h" #include "flexflow/ops/pool_2d.h" #include "flexflow/ops/reduce.h" @@ -1995,6 +1996,7 @@ std::pair, std::unordered_map> mv.device_type = MachineView::GPU; mv.ndims = 1; int total_parallel_degree = 1; + assert(op->numOutputs > 0); for (int i = 0; i < op->outputs[0]->num_dims; i++) { total_parallel_degree *= op->outputs[0]->dims[i].degree; } @@ -2722,6 +2724,10 @@ void FFModel::deserialize_graph_optimal_view( node = Linear::deserialize(*this, dez, inputs, num_inputs); break; } + case OP_LORA_LINEAR: { + node = LoraLinear::deserialize(*this, dez, inputs, num_inputs); + break; + } case OP_MULTIHEAD_ATTENTION: { assert(num_inputs == 3); int embed_dim, num_heads, k_dim, v_dim; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index c77c4d2432..2735513af2 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3215,6 +3215,12 @@ Op *FFModel::create_operator_from_layer( operators.push_back(op); return op; } + // PEFT layers + case OP_LORA_LINEAR: { + Op *op = LoraLinear::create_operator_from_layer(*this, layer, inputs); + operators.push_back(op); + return op; + } default: assert(false); } From c3d9c3801fcd6dcfce73b151e70da8cb31378f6a Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Tue, 3 Oct 2023 23:56:53 -0400 Subject: [PATCH 007/198] LoRA forward pass works now --- .../ops/kernels/lora_linear_kernels.h | 6 +-- src/ops/fused.cc | 3 +- src/ops/fused.cu | 41 ++++++++++++++++++- src/ops/kernels/lora_linear_kernels.cu | 2 +- src/ops/lora_linear.cc | 22 ++++++---- src/runtime/inference_manager.cc | 23 ++++++++++- 6 files changed, 82 insertions(+), 15 deletions(-) diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h index 753167c9c4..1ba7347f5e 100644 --- a/include/flexflow/ops/kernels/lora_linear_kernels.h +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_OPS_KERNELS_LINEAR_KERNELS_H -#define _FLEXFLOW_OPS_KERNELS_LINEAR_KERNELS_H +#ifndef _FLEXFLOW_OPS_KERNELS_LORA_LINEAR_KERNELS_H +#define _FLEXFLOW_OPS_KERNELS_LORA_LINEAR_KERNELS_H #include "flexflow/device.h" #include "flexflow/fftype.h" @@ -76,4 +76,4 @@ void peft_bwd_kernel(LoraLinearMeta *m, } // namespace Kernels } // namespace FlexFlow -#endif // _FLEXFLOW_OPS_KERNELS_LINEAR_KERNELS_H +#endif // _FLEXFLOW_OPS_KERNELS_LORA_LINEAR_KERNELS_H diff --git a/src/ops/fused.cc 
b/src/ops/fused.cc index 1d5db2f461..70650aef0d 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -361,8 +361,9 @@ void FusedOp::init_inference(FFModel const &ff, } } for (int i = 0; i < op_num_outputs[op]; i++) { + int my_off = op_output_idx[i + ooff]; assert(op_output_source[i + ooff] == SOURCE_OUTPUT); - my_batch_outputs.push_back(batch_outputs[i + ooff]); + my_batch_outputs.push_back(batch_outputs[my_off]); } ioff += op_num_inputs[op]; ooff += op_num_outputs[op]; diff --git a/src/ops/fused.cu b/src/ops/fused.cu index f291ecfd67..ef9dc5d5c6 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -30,6 +30,7 @@ #include "flexflow/ops/kernels/embedding_kernels.h" #include "flexflow/ops/kernels/flat_kernels.h" #include "flexflow/ops/kernels/linear_kernels.h" +#include "flexflow/ops/kernels/lora_linear_kernels.h" #include "flexflow/ops/kernels/pool_2d_kernels.h" #include "flexflow/ops/kernels/reshape_kernels.h" #include "flexflow/ops/kernels/residual_rms_norm_kernels.h" @@ -634,10 +635,11 @@ __host__ void my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; } for (int i = 0; i < fused->op_num_outputs[op]; i++) { + int my_off = fused->op_output_idx[i + ooff]; assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; - my_output_accessor[i] = output_accessor[i + ooff]; + my_output_accessor[i] = output_accessor[my_off]; } switch (fused->op_op_type[op]) { case OP_CONCAT: { @@ -700,6 +702,43 @@ __host__ void batch_size); break; } + case OP_LORA_LINEAR: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + Domain input_domain = my_input_accessor[0].domain; + Domain output_domain = my_output_accessor[0].domain; + Domain weight_first_domain = my_weight_accessor[0].domain; + Domain weight_second_domain = my_weight_accessor[1].domain; + int in_dim = input_domain.hi()[0] - input_domain.lo()[0] + 1; + int out_dim = output_domain.hi()[0] - output_domain.lo()[0] + 1; + int rank = weight_first_domain.get_volume() / in_dim; + assert(in_dim * rank == weight_first_domain.get_volume()); + assert(out_dim * rank == weight_second_domain.get_volume()); + int batch_size = my_input_accessor[0].domain.get_volume() / in_dim; + assert(my_output_accessor[0].domain.get_volume() == + out_dim * batch_size); + assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); + LoraLinearMeta *m = (LoraLinearMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == 2); + assert(m->input_type[0] == my_input_accessor[0].data_type); + assert(m->output_type[0] == my_output_accessor[0].data_type); + int num_infr_tokens = bc->num_active_infr_tokens(); + int num_peft_tokens = bc->num_active_peft_tokens(); + // Assert that the output and the second input are at the same place + // since we ``inplace'' the output for LoRA + assert(my_input_accessor[1].ptr == my_output_accessor[0].ptr); + Kernels::LoraLinear::inference_kernel_wrapper(m, + my_input_accessor[0].ptr, + my_output_accessor[0].ptr, + my_weight_accessor[0].ptr, + my_weight_accessor[1].ptr, + in_dim, + out_dim, + rank, + num_infr_tokens, + num_peft_tokens); + break; + } case OP_BATCHMATMUL: { assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_weights[op] == 0); diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 94b62bb399..282134817e 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ 
-175,7 +175,7 @@ void inference_kernel(LoraLinearMeta *m, DT alpha = 1.0f, beta = 0.0f; cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); - assert(m->weight_type[1] == weight_type); + assert(m->weight_type[1] == m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->input_type[1]); cudaDataType_t lr_actv_type = output_type; assert(input_type == weight_type && weight_type == output_type); diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index bbfa120886..43d1b4cef1 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -219,8 +219,10 @@ void LoraLinear::init_inference( assert(check_output_input_weight_same_parallel_is()); assert(batch_inputs.size() == 2); assert(batch_outputs.size() == 1); - // Assert that the output is the same as the second input - assert(batch_outputs[0] == batch_inputs[1]); + // Assert that the output and the second input are mapped to the same + // region/part + assert(batch_outputs[0]->region == batch_inputs[1]->region); + assert(batch_outputs[0]->part == batch_inputs[1]->part); // assert(check_output_input_weight_same_machine_view()); // output is considered as an input to allow in-place optimization ParallelTensor output_tensor = batch_outputs[0]; @@ -253,13 +255,13 @@ void LoraLinear::init_inference( launcher.add_field(1, FID_DATA); launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, - READ_ONLY, + WRITE_ONLY, EXCLUSIVE, weights[0]->region)); launcher.add_field(2, FID_DATA); launcher.add_region_requirement(RegionRequirement(weights[1]->part, 0 /*projection id*/, - READ_ONLY, + WRITE_ONLY, EXCLUSIVE, weights[1]->region)); launcher.add_field(3, FID_DATA); @@ -335,8 +337,10 @@ FutureMap assert(check_output_input_weight_same_parallel_is()); assert(batch_inputs.size() == 2); assert(batch_outputs.size() == 1); - // Assert that the output is the same as the second input - assert(batch_outputs[0] == batch_inputs[1]); + // Assert that the output and the second input are mapped to the same + // region/part + assert(batch_outputs[0]->region == batch_inputs[1]->region); + assert(batch_outputs[0]->part == batch_inputs[1]->part); // assert(check_output_input_weight_same_machine_view()); // output is considered as an input to allow in-place optimization ParallelTensor output_tensor = batch_outputs[0]; @@ -431,8 +435,10 @@ FutureMap LoraLinear::peft_bwd(FFModel const &ff, MachineView const *mv) { assert(batch_inputs.size() == 2); assert(batch_outputs.size() == 1); - // Assert that the output is the same as the second input - assert(batch_outputs[0] == batch_inputs[1]); + // Assert that the output and the second input are mapped to the same + // region/part + assert(batch_outputs[0]->region == batch_inputs[1]->region); + assert(batch_outputs[0]->part == batch_inputs[1]->part); ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 461873d798..199b94c72c 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -142,7 +142,28 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { for (int i = 0; i < op->numOutputs; i++) { ParallelTensor pt_base = op->outputs[i]; assert(tensor_buffer.find(pt_base) == tensor_buffer.end()); - + // no need to map inplace tensor + // A tensor is inplace if it shares the same region as another tensor + { + bool inplace = 
false; + for (int j = 0; j < op->numInputs; j++) { + if (op->inputs[j]->region == op->outputs[i]->region) { + assert(tensor_buffer.find(op->inputs[j]) != tensor_buffer.end()); + tensor_buffer[pt_base] = tensor_buffer[op->inputs[j]]; + inplace = true; + } + } + for (int j = 0; j < i; j++) { + if (op->outputs[j]->region == op->outputs[i]->region) { + assert(tensor_buffer.find(op->outputs[j]) != tensor_buffer.end()); + tensor_buffer[pt_base] = tensor_buffer[op->outputs[j]]; + inplace = true; + } + } + if (inplace) { + continue; + } + } if (op->op_type == OP_REPLICATE) { assert(op->numInputs == 1 && op->numOutputs == 1); } From c4cfcc37f1d8f7da462076889c8f24749fbca43e Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sat, 7 Oct 2023 16:02:03 -0400 Subject: [PATCH 008/198] [LoraLinear] update to allocate weight through per-GPU PEFTWeightAllocator --- include/flexflow/batch_config.h | 12 + include/flexflow/config.h | 2 + include/flexflow/ffconst.h | 2 + include/flexflow/fftype.h | 22 + include/flexflow/model.h | 4 +- .../ops/kernels/lora_linear_kernels.h | 58 +-- include/flexflow/ops/lora_linear.h | 36 +- include/flexflow/ops/lora_linear_params.h | 2 - .../flexflow/utils/peft_weight_allocator.h | 92 ++++ inference/models/llama.cc | 2 +- src/ops/fused.cu | 12 +- src/ops/inc_multihead_self_attention.cu | 2 +- src/ops/kernels/lora_linear_kernels.cu | 418 +++++++++--------- src/ops/lora_linear.cc | 384 ++++++---------- src/runtime/fftype.cc | 16 + src/runtime/model.cc | 16 + 16 files changed, 547 insertions(+), 533 deletions(-) create mode 100644 include/flexflow/utils/peft_weight_allocator.h diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index fc243fb365..b26b9ef823 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -16,6 +16,7 @@ #pragma once #include "flexflow/ffconst.h" +#include "flexflow/fftype.h" #include "legion.h" #include #include @@ -62,10 +63,21 @@ class BatchConfig { bool loading_prompt = false; struct PerRequestInfo { + PerRequestInfo() { + token_start_offset = 0; + num_tokens_in_batch = 0; + max_sequence_length = 0; + request_guid = 0; + peft_model_id = PEFTModelID::NO_ID; + peft_bwd = false; + } int token_start_offset; int num_tokens_in_batch; int max_sequence_length; RequestGuid request_guid; + // PEFT fields + PEFTModelID peft_model_id; + bool peft_bwd; }; struct PerTokenInfo { int abs_depth_in_request; diff --git a/include/flexflow/config.h b/include/flexflow/config.h index 1d74a38468..60d1cb17d2 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -65,6 +65,7 @@ constexpr ParameterSyncType CHOSEN_SYNC_TYPE = ParameterSyncType::PS; class FFConfig; class MemoryAllocator; +class PEFTWeightAllocator; struct FFHandler { #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) @@ -82,6 +83,7 @@ struct FFHandler { void *peft_activation_reserve_space; size_t peft_activation_reserve_space_size; MemoryAllocator *peft_activation_allocator; + PEFTWeightAllocator *peft_weight_allocator; // Quantization fields DataType quantization_type; bool allowTensorOpMathConversion; diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 37a178d952..efc37ce78d 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -271,5 +271,7 @@ enum { TENSOR_GUID_LAST_VALID = 3999999, PARALLEL_TENSOR_GUID_FIRST_VALID = 4000000, NODE_GUID_FIRST_VALID = 5000000, + PEFT_MODEL_ID_FIRST_VALID = 6000000, + PEFT_MODEL_ID_LAST_VALID = 6999999 }; #endif // _FLEXFLOW_CONST_H_ diff --git 
a/include/flexflow/fftype.h b/include/flexflow/fftype.h index 18ed6b8100..665de43c59 100644 --- a/include/flexflow/fftype.h +++ b/include/flexflow/fftype.h @@ -3,6 +3,7 @@ #include "flexflow/ffconst.h" #include +#include namespace FlexFlow { @@ -18,6 +19,27 @@ class LayerID { size_t id, transformer_layer_id; }; +class PEFTModelID { +public: + static const PEFTModelID NO_ID; + PEFTModelID(); + PEFTModelID(size_t id); + bool is_valid_id() const; + friend bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs); + +public: + size_t id; +}; + }; // namespace FlexFlow +namespace std { +template <> +struct hash { + size_t operator()(FlexFlow::PEFTModelID const &n) const { + return n.id; + } +}; +} // namespace std + #endif // _FF_TYPE_H diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 105c678ba9..cc8d2267cf 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -121,6 +121,7 @@ enum TaskIDs { LINEAR_BWD2_TASK_ID, LINEAR_UPD_TASK_ID, LORA_LINEAR_INIT_TASK_ID, + LORA_LINEAR_REG_TASK_ID, LORA_LINEAR_INF_TASK_ID, LORA_LINEAR_PEFT_BWD_TASK_ID, FLAT_INIT_TASK_ID, @@ -809,9 +810,6 @@ class FFModel { // ======================================== void lora_linear(Tensor const input, Tensor const output, - int rank, - DataType data_type = DT_NONE, - Initializer *kernel_initializer = nullptr, char const *name = nullptr); // ======================================== // Inference APIs diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h index 1ba7347f5e..32a6832e2e 100644 --- a/include/flexflow/ops/kernels/lora_linear_kernels.h +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -1,6 +1,7 @@ #ifndef _FLEXFLOW_OPS_KERNELS_LORA_LINEAR_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_LORA_LINEAR_KERNELS_H +#include "flexflow/accessor.h" #include "flexflow/device.h" #include "flexflow/fftype.h" #include "flexflow/op_meta.h" @@ -8,6 +9,12 @@ namespace FlexFlow { +struct LoraLinearWeight { + void *w0_ptr, *w1_ptr, *w0_grad_ptr, *w1_grad_ptr; + void *w0_state_ptr, *w1_state_ptr; + int rank; +}; + class LoraLinearMeta : public OpMeta { public: LoraLinearMeta(FFHandler handle, LoraLinear const *li); @@ -16,64 +23,39 @@ class LoraLinearMeta : public OpMeta { // PEFT related fields void *low_rank_activation; void *input_activation; + std::unordered_map model_weights; }; namespace Kernels { namespace LoraLinear { void inference_kernel_wrapper(LoraLinearMeta *m, - void const *input_ptr, - void *output_ptr, - void const *weight_first_ptr, - void const *weight_second_ptr, - int in_dim, - int out_dim, - int rank, - int num_infr_tokens, - int num_peft_tokens); + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); void peft_bwd_kernel_wrapper(LoraLinearMeta *m, - void *input_grad_ptr, - void const *output_grad_ptr, - void const *weight_first_ptr, - void const *weight_second_ptr, - void *weight_first_grad_ptr, - void *weight_second_grad_ptr, - int in_dim, - int out_dim, - int rank, - int num_infr_tokens, - int num_peft_tokens); -bool use_activation(ActiMode mode); + BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad); namespace Internal { template void inference_kernel(LoraLinearMeta *m, - void const *input_ptr, - void *output_ptr, - void const *weight_first_ptr, - void const *weight_second_ptr, + BatchConfig const *bc, + DT const *input_ptr, + DT *output_ptr, int in_dim, int out_dim, - int rank, - int 
num_infr_tokens, - int num_peft_tokens, ffStream_t stream); template void peft_bwd_kernel(LoraLinearMeta *m, - void *input_grad_ptr, - void const *output_grad_ptr, - void const *weight_first_ptr, - void const *weight_second_ptr, - void *weight_first_grad_ptr, - void *weight_second_grad_ptr, + BatchConfig const *bc, + DT *input_grad_ptr, + DT const *output_grad_ptr, int in_dim, int out_dim, - int rank, - int num_infr_tokens, - int num_peft_tokens, ffStream_t stream); } // namespace Internal } // namespace LoraLinear } // namespace Kernels } // namespace FlexFlow - #endif // _FLEXFLOW_OPS_KERNELS_LORA_LINEAR_KERNELS_H diff --git a/include/flexflow/ops/lora_linear.h b/include/flexflow/ops/lora_linear.h index fff3927ff1..39d8925262 100644 --- a/include/flexflow/ops/lora_linear.h +++ b/include/flexflow/ops/lora_linear.h @@ -21,19 +21,14 @@ class LoraLinear : public Op { LayerID const &layer_guid, ParallelTensor const input, ParallelTensor const output, - int rank, - DataType _data_type, - bool allocate_weights, - char const *name); + char const *name = nullptr); LoraLinear(FFModel &model, LoraLinear const &other, ParallelTensor const input, - ParallelTensor const output, - bool allocate_weights); + ParallelTensor const output); LoraLinear(FFModel &model, Params const ¶ms, Input const &inputs, - bool allocate_weights = false, char const *name = nullptr); void init(FFModel const &) override; @@ -43,6 +38,12 @@ class LoraLinear : public Op { MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void backward(FFModel const &) override; + void register_peft_model(FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv, + PEFTModelID const &model_id, + int rank); Legion::FutureMap inference(FFModel const &, BatchConfigFuture const &, std::vector const &, @@ -63,6 +64,11 @@ class LoraLinear : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void + register_model_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void inference_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, @@ -92,22 +98,6 @@ class LoraLinear : public Op { int num_inputs) const override; // size_t get_params_hash() const override; LoraLinearParams get_params() const; - -private: - LoraLinear(int guid, - bool profiling, - ParallelTensor const input, - ParallelTensor const output, - int rank, - bool allocate_weights, - char const *name); - - void register_mappings(); - void register_output_mappings(); - void register_weight_mappings(); - -public: - int rank; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h index a19a2ff298..9eaee3000b 100644 --- a/include/flexflow/ops/lora_linear_params.h +++ b/include/flexflow/ops/lora_linear_params.h @@ -12,8 +12,6 @@ namespace FlexFlow { class LoraLinearParams { public: LayerID layer_guid; - int rank; - DataType data_type; bool is_valid(std::pair const &input_shape) const; diff --git a/include/flexflow/utils/peft_weight_allocator.h b/include/flexflow/utils/peft_weight_allocator.h new file mode 100644 index 0000000000..dae46a8af1 --- /dev/null +++ b/include/flexflow/utils/peft_weight_allocator.h @@ -0,0 +1,92 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in 
compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _FLEXFLOW_UTILS_PEFT_WEIGHT_ALLOCATOR_H_ +#define _FLEXFLOW_UTILS_PEFT_WEIGHT_ALLOCATOR_H_ + +#include "flexflow/config.h" +#include + +namespace FlexFlow { + +class PEFTWeightAllocator { +public: + PEFTWeightAllocator(void *_base_ptr, size_t _total_size) + : base_ptr(_base_ptr), total_size(_total_size), sync_offset(0), + local_offset(_total_size) {} + + inline void *allocate_sync_weights_untyped(PEFTModelID const &peft_model_id, + size_t datalen) { + const std::lock_guard lock(peft_weight_allocator_mutex); + void *ptr = static_cast(base_ptr) + sync_offset; + off_t model_sync_weights_offset = sync_offset; + size_t model_sync_weights_size = datalen; + if (sync_weights.find(peft_model_id) != sync_weights.end()) { + // Assert that sync weights for each PEFT model is consecutive + std::pair offset_and_size = sync_weights[peft_model_id]; + assert(sync_offset == offset_and_size.first + offset_and_size.second); + model_sync_weights_offset = offset_and_size.first; + model_sync_weights_size = offset_and_size.second + datalen; + } + sync_offset += datalen; + assert(sync_offset < local_offset); + sync_weights[peft_model_id] = + std::make_pair(model_sync_weights_offset, model_sync_weights_size); + return ptr; + } + + std::pair + get_sync_weights_ptr_and_size(PEFTModelID const &peft_model_id) { + const std::lock_guard lock(peft_weight_allocator_mutex); + assert(sync_weights.find(peft_model_id) != sync_weights.end()); + std::pair offset_and_size = sync_weights[peft_model_id]; + return std::make_pair(static_cast(base_ptr) + offset_and_size.first, + offset_and_size.second); + } + + inline void *allocate_local_weights_untyped(PEFTModelID const &peft_model_id, + size_t datalen) { + const std::lock_guard lock(peft_weight_allocator_mutex); + local_offset -= datalen; + assert(sync_offset < local_offset); + void *ptr = static_cast(base_ptr) + local_offset; + return ptr; + } + + template + inline DT *allocate_sync_weights(PEFTModelID const &peft_model_id, + size_t count) { + return static_cast
<DT *>( + allocate_sync_weights_untyped(peft_model_id, sizeof(DT) * count)); + } + + template <typename DT> + inline DT *allocate_local_weights(PEFTModelID const &peft_model_id, + size_t count) { + return static_cast<DT *>
( + allocate_local_weights_untyped(peft_model_id, sizeof(DT) * count)); + } + +public: + void *base_ptr; + size_t total_size; + off_t sync_offset, local_offset; + std::unordered_map> sync_weights; + std::mutex peft_weight_allocator_mutex; +}; + +}; // namespace FlexFlow + +#endif // _FLEXFLOW_UTILS_PEFT_WEIGHT_ALLOCATOR_H_ diff --git a/inference/models/llama.cc b/inference/models/llama.cc index da8fc4ee63..f90040170e 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -220,7 +220,7 @@ void LLAMA::create_llama_model(FFModel &ff, std::string("layers_" + std::to_string(i) + "_feed_forward_w2") .c_str()); // Low-Rank Adapter (LoRA) for the second linear layer - ff.lora_linear(multi, w2, 16 /*rank*/); + ff.lora_linear(multi, w2); } // final normalization and linear Tensor final_rms_norm_output[2] = {nullptr, nullptr}; diff --git a/src/ops/fused.cu b/src/ops/fused.cu index ef9dc5d5c6..f6d8365f1f 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -727,16 +727,8 @@ __host__ void // Assert that the output and the second input are at the same place // since we ``inplace'' the output for LoRA assert(my_input_accessor[1].ptr == my_output_accessor[0].ptr); - Kernels::LoraLinear::inference_kernel_wrapper(m, - my_input_accessor[0].ptr, - my_output_accessor[0].ptr, - my_weight_accessor[0].ptr, - my_weight_accessor[1].ptr, - in_dim, - out_dim, - rank, - num_infr_tokens, - num_peft_tokens); + Kernels::LoraLinear::inference_kernel_wrapper( + m, bc, my_input_accessor[0], my_output_accessor[0]); break; } case OP_BATCHMATMUL: { diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index e0a441ea50..19f3aabb90 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -577,7 +577,7 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, cudaDataType_t compute_type = cublas_data_type; #endif // int num_requests = bc->num_active_requests(); - int num_tokens = bc->num_active_infr_tokens(); + int num_tokens = bc->num_active_tokens(); int tokens_previous_requests = 0; int q_block_size = m->qProjSize * num_tokens; int kt_block_size = m->kProjSize * BatchConfig::max_sequence_length(); diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 282134817e..1e9069fa72 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -29,18 +29,15 @@ namespace Kernels { namespace LoraLinear { void inference_kernel_wrapper(LoraLinearMeta *m, - void const *input_ptr, - void *output_ptr, - void const *weight_first_ptr, - void const *weight_second_ptr, - int in_dim, - int out_dim, - int rank, - int num_infr_tokens, - int num_peft_tokens) { + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); cudaEvent_t t_start, t_end; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + if (m->profiling) { cudaEventCreate(&t_start); cudaEventCreate(&t_end); @@ -48,27 +45,19 @@ void inference_kernel_wrapper(LoraLinearMeta *m, } if (m->input_type[0] == DT_FLOAT) { Internal::inference_kernel(m, - input_ptr, - output_ptr, - weight_first_ptr, - weight_second_ptr, + bc, + input.get_float_ptr(), + output.get_float_ptr(), in_dim, out_dim, - rank, - num_infr_tokens, - num_peft_tokens, stream); } else if (m->input_type[0] == DT_HALF) { Internal::inference_kernel(m, 
- input_ptr, - output_ptr, - weight_first_ptr, - weight_second_ptr, + bc, + input.get_half_ptr(), + output.get_half_ptr(), in_dim, out_dim, - rank, - num_infr_tokens, - num_peft_tokens, stream); } @@ -90,17 +79,9 @@ void inference_kernel_wrapper(LoraLinearMeta *m, } void peft_bwd_kernel_wrapper(LoraLinearMeta *m, - void *input_grad_ptr, - void const *output_grad_ptr, - void const *weight_first_ptr, - void const *weight_second_ptr, - void *weight_first_grad_ptr, - void *weight_second_grad_ptr, - int in_dim, - int out_dim, - int rank, - int num_infr_tokens, - int num_peft_tokens) { + BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); cudaEvent_t t_start, t_end; @@ -109,33 +90,23 @@ void peft_bwd_kernel_wrapper(LoraLinearMeta *m, cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } + int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; if (m->input_type[0] == DT_FLOAT) { Internal::peft_bwd_kernel(m, - input_grad_ptr, - output_grad_ptr, - weight_first_ptr, - weight_second_ptr, - weight_first_grad_ptr, - weight_second_grad_ptr, + bc, + input_grad.get_float_ptr(), + output_grad.get_float_ptr(), in_dim, out_dim, - rank, - num_infr_tokens, - num_peft_tokens, stream); } else if (m->input_type[0] == DT_HALF) { Internal::peft_bwd_kernel(m, - input_grad_ptr, - output_grad_ptr, - weight_first_ptr, - weight_second_ptr, - weight_first_grad_ptr, - weight_second_grad_ptr, + bc, + input_grad.get_half_ptr(), + output_grad.get_half_ptr(), in_dim, out_dim, - rank, - num_infr_tokens, - num_peft_tokens, stream); } @@ -160,15 +131,11 @@ namespace Internal { template void inference_kernel(LoraLinearMeta *m, - void const *input_ptr, - void *output_ptr, - void const *weight_first_ptr, - void const *weight_second_ptr, + BatchConfig const *bc, + DT const *input_ptr, + DT *output_ptr, int in_dim, int out_dim, - int rank, - int num_infr_tokens, - int num_peft_tokens, ffStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); @@ -179,10 +146,6 @@ void inference_kernel(LoraLinearMeta *m, cudaDataType_t output_type = ff_to_cuda_datatype(m->input_type[1]); cudaDataType_t lr_actv_type = output_type; assert(input_type == weight_type && weight_type == output_type); - // adjust input_ptr and output_ptr offset - // TODO: we currently assume that all inference tokens do not use LoRA - input_ptr = static_cast
<DT const *>(input_ptr) + num_infr_tokens * in_dim; - output_ptr = static_cast<DT *>
(output_ptr) + num_infr_tokens * out_dim; #if CUDA_VERSION >= 11000 // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance @@ -190,75 +153,105 @@ void inference_kernel(LoraLinearMeta *m, #else cudaDataType_t compute_type = input_type; #endif - MemoryAllocator *allocator = m->handle.peft_activation_allocator; - m->input_activation = allocator->allocate_instance_untyped( - data_type_size(m->input_type[0]) * num_peft_tokens * in_dim); - m->low_rank_activation = allocator->allocate_instance_untyped( - data_type_size(m->input_type[1]) * num_peft_tokens * rank); - // copy input activation - checkCUDA(cudaMemcpyAsync(m->input_activation, - input_ptr, - data_type_size(m->input_type[0]) * num_peft_tokens * - in_dim, - cudaMemcpyDeviceToDevice, - stream)); - // buffer = weight_first * input - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - rank, - num_peft_tokens, - in_dim, - &alpha, - weight_first_ptr, - weight_type, - in_dim, - input_ptr, - input_type, - in_dim, - &beta, - m->low_rank_activation, - lr_actv_type, - rank, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // output = weight_second * buffer - // Note that we use alpha in both places since we do - // an in-place update for LoraLinear - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - out_dim, - num_peft_tokens, - rank, - &alpha, - weight_second_ptr, - weight_type, - rank, - m->low_rank_activation, - lr_actv_type, - rank, - &alpha, - output_ptr, - output_type, - out_dim, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + // Assert that we have at most one request that requires peft_bwd + assert(num_peft_requests <= 1); + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + // FIXME: use the new approach to computing token offset + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + assert(m->model_weights.find(bc->requestsInfo[i].peft_model_id) != + m->model_weights.end()); + LoraLinearWeight weight = + m->model_weights[bc->requestsInfo[i].peft_model_id]; + int rank = weight.rank; + if (bc->requestsInfo[i].peft_bwd) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = allocator->allocate_instance_untyped( + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim); + m->low_rank_activation = allocator->allocate_instance_untyped( + data_type_size(m->input_type[1]) * num_peft_tokens * rank); + // copy input activation + checkCUDA(cudaMemcpyAsync(m->input_activation, + input_ptr + tokens_previous_requests * in_dim, + data_type_size(m->input_type[0]) * + num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } + // buffer = weight_first * input + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + rank, + num_peft_tokens, + in_dim, + &alpha, + weight.w0_ptr, + weight_type, + in_dim, + input_ptr + tokens_previous_requests * in_dim, + input_type, + in_dim, + &beta, + m->low_rank_activation, + lr_actv_type, + rank, + compute_type, + 
CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // output = weight_second * buffer + // Note that we use alpha in both places since we do + // an in-place update for LoraLinear + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + out_dim, + num_peft_tokens, + rank, + &alpha, + weight.w1_ptr, + weight_type, + rank, + m->low_rank_activation, + lr_actv_type, + rank, + &alpha, + output_ptr + tokens_previous_requests * out_dim, + output_type, + out_dim, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + tokens_previous_requests += num_peft_tokens; + } + assert(tokens_previous_requests == bc->num_active_tokens()); } template void peft_bwd_kernel(LoraLinearMeta *m, - void *input_grad_ptr, - void const *output_grad_ptr, - void const *weight_first_ptr, - void const *weight_second_ptr, - void *weight_first_grad_ptr, - void *weight_second_grad_ptr, + BatchConfig const *bc, + DT *input_grad_ptr, + DT const *output_grad_ptr, int in_dim, int out_dim, - int rank, - int num_infr_tokens, - int num_peft_tokens, ffStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); @@ -268,103 +261,124 @@ void peft_bwd_kernel(LoraLinearMeta *m, assert(weight_type == ff_to_cuda_datatype(m->weight_type[1])); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); cudaDataType_t lr_actv_type = output_type; - // update input_grad_ptr and output_grad_ptr offset - input_grad_ptr = static_cast
<DT *>(input_grad_ptr) + num_infr_tokens * in_dim; - output_grad_ptr = - static_cast<DT const *>
(output_grad_ptr) + num_infr_tokens * out_dim; #if CUDA_VERSION >= 11000 // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; #else cudaDataType_t compute_type = CUDA_R_32F; #endif - // Compute weight_second gradiant - // NOTE: we use alpha=1 for weight_second_grad to accumulate gradients - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, - rank, - out_dim, - num_peft_tokens, - &alpha, - m->low_rank_activation, - lr_actv_type, - rank, - output_grad_ptr, - output_type, - out_dim, - &alpha, - weight_second_grad_ptr, - weight_type, - rank, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // Compute gradiants w.r.t. low_rank activation - // and save the results to low_rank_activation - // NOTE: we use alpha=1 for input_grad to accumulate gradients - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_N, - rank, - num_peft_tokens, - out_dim, - &alpha, - weight_second_ptr, - weight_type, - rank, - output_grad_ptr, - output_type, - out_dim, - &alpha, - m->low_rank_activation, - lr_actv_type, - rank, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // Compute weight_first gradiant - // NOTE: we use alpha=1 for kernel_grad to accumulate gradients - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, - in_dim, - rank, - num_peft_tokens, - &alpha, - m->input_activation, - input_type, - in_dim, - m->low_rank_activation, - lr_actv_type, - rank, - &alpha, - weight_first_grad_ptr, - weight_type, - in_dim, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // Compute input gradiant - // NOTE: we use alpha=1 for input_grad to accumulate gradients - if (input_grad_ptr != nullptr) { + + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + // Skip PEFT forward-only requests + if (!bc->requestsInfo[i].peft_bwd) { + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + assert(m->model_weights.find(bc->requestsInfo[i].peft_model_id) != + m->model_weights.end()); + LoraLinearWeight weight = + m->model_weights[bc->requestsInfo[i].peft_model_id]; + int rank = weight.rank; + // Compute w1's gradiant + // NOTE: we use alpha=1 for w1_grad to accumulate gradients checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, - CUBLAS_OP_N, - in_dim, + CUBLAS_OP_T, + rank, + out_dim, num_peft_tokens, + &alpha, + m->low_rank_activation, + lr_actv_type, rank, + output_grad_ptr + tokens_previous_requests * out_dim, + output_type, + out_dim, &alpha, - weight_first_ptr, + weight.w1_grad_ptr, weight_type, - in_dim, + rank, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // Compute gradiants w.r.t. 
low_rank activation + // and save the results to low_rank_activation + // NOTE: we use alpha=1 for input_grad to accumulate gradients + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_N, + rank, + num_peft_tokens, + out_dim, + &alpha, + weight.w1_ptr, + weight_type, + rank, + output_grad_ptr + tokens_previous_requests * out_dim, + output_type, + out_dim, + &alpha, m->low_rank_activation, lr_actv_type, rank, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // Compute w0's gradiant + // NOTE: we use alpha=1 for kernel_grad to accumulate gradients + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + in_dim, + rank, + num_peft_tokens, &alpha, - input_grad_ptr, + m->input_activation, input_type, in_dim, + m->low_rank_activation, + lr_actv_type, + rank, + &alpha, + weight.w0_grad_ptr, + weight_type, + in_dim, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // Compute input gradiant + // NOTE: we use alpha=1 for input_grad to accumulate gradients + if (input_grad_ptr != nullptr) { + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_N, + in_dim, + num_peft_tokens, + rank, + &alpha, + weight.w0_ptr, + weight_type, + in_dim, + m->low_rank_activation, + lr_actv_type, + rank, + &alpha, + input_grad_ptr + tokens_previous_requests * in_dim, + input_type, + in_dim, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + tokens_previous_requests += num_peft_tokens; } + assert(tokens_previous_requests == bc->num_active_tokens()); } } // namespace Internal diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 43d1b4cef1..665c5cb4c5 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -4,6 +4,7 @@ #include "flexflow/model.h" #include "flexflow/ops/kernels/lora_linear_kernels.h" #include "flexflow/utils/hash_utils.h" +#include "flexflow/utils/peft_weight_allocator.h" #include "legion/legion_utilities.h" namespace FlexFlow { @@ -32,168 +33,73 @@ using namespace FlexFlow::Kernels::LoraLinear; void FFModel::lora_linear(Tensor const input, Tensor const output, - int rank, - DataType data_type, - Initializer *kernel_initializer, char const *name) { - if (data_type == DT_NONE) { - data_type = input->data_type; - } - assert(data_type == input->data_type); - assert(data_type == output->data_type); - Layer *li = nullptr; - li = new Layer(this, - OP_LORA_LINEAR, - data_type, - name, - 2 /*inputs*/, - 2 /*weights*/, - 1 /*outputs*/, - input, - output); + assert(input->data_type == output->data_type); + Layer *lora = nullptr; + lora = new Layer(this, + OP_LORA_LINEAR, + output->data_type, + name, + 2 /*inputs*/, + 0 /*weights*/, + 1 /*outputs*/, + input, + output); { int numdims = output->num_dims; int dims[MAX_TENSOR_DIM]; for (int i = 0; i < numdims; i++) { dims[i] = output->dims[i]; } - li->outputs[0] = create_tensor_legion_ordering( - numdims, dims, data_type, li, 0, true /*create_grad*/); - } - { - int dims[2] = {input->dims[0], rank}; - li->weights[0] = create_weight_legion_ordering(2, - dims, - data_type, - li, - true /*create_grad*/, - kernel_initializer, - CHOSEN_SYNC_TYPE); - } - { - int dims[2] = {rank, output->dims[0]}; - li->weights[1] = create_weight_legion_ordering(2, - dims, - data_type, - li, - true /*create_grad*/, - kernel_initializer, - CHOSEN_SYNC_TYPE); + lora->outputs[0] = create_tensor_legion_ordering( + numdims, dims, output->data_type, lora, 0, true /*create_grad*/); } - li->add_int_property("rank", rank); - layers.push_back(li); + layers.push_back(lora); } Op *LoraLinear::create_operator_from_layer( FFModel 
&model, Layer const *layer, std::vector const &inputs) { - long long value; - layer->get_int_property("rank", value); - int rank = (int)value; - return new LoraLinear(model, - layer->layer_guid, - inputs[0], - inputs[1], - rank, - layer->data_type, - false /*allocate_weights*/, - layer->name); + return new LoraLinear( + model, layer->layer_guid, inputs[0], inputs[1], layer->name); } LoraLinear::LoraLinear(FFModel &model, LoraLinear const &other, ParallelTensor const input, - ParallelTensor const output, - bool allocate_weights) - : LoraLinear(model, - other.layer_guid, - input, - output, - other.rank, - other.data_type, - allocate_weights, - other.name) {} + ParallelTensor const output) + : LoraLinear(model, other.layer_guid, input, output, other.name) {} LoraLinear::LoraLinear(FFModel &model, Params const ¶ms, Input const &inputs, - bool allocate_weights, char const *name) - : LoraLinear(model, - params.layer_guid, - inputs.first, - inputs.second, - params.rank, - params.data_type, - allocate_weights, - name) {} + : LoraLinear(model, params.layer_guid, inputs.first, inputs.second, name) {} LoraLinear::LoraLinear(FFModel &model, LayerID const &_layer_guid, ParallelTensor const _input, ParallelTensor const _output, - int _rank, - DataType _data_type, - bool allocate_weights, char const *name) : Op(model, OP_LORA_LINEAR, - _data_type, + _output->data_type, name, 2 /*inputs*/, - 2 /*weights*/, - allocate_weights, + 0 /*weights*/, + false, 1 /*outputs*/, _input, - _output), - rank(_rank) { + _output) { + assert(_input->data_type == _output->data_type); // overwrite layer_guid layer_guid = _layer_guid; - data_type = _data_type; + data_type = _output->data_type; ParallelTensorShape input_shape = this->inputs[0]->get_shape(); LoraLinearParams params = this->get_params(); - if (allocate_weights) { - Initializer *kernel_initializer = new GlorotUniform(std::rand() /*seed*/); - // create weight first - { - ParallelDim dims[2]; - int num_dims = inputs[0]->num_dims; - dims[1] = inputs[0]->dims[num_dims - 1]; // data parallel - dims[1].size = dims[1].degree; - dims[1].is_replica_dim = true; - dims[0] = inputs[0]->dims[0]; - dims[0].size = inputs[0]->dims[0].size * rank; - weights[0] = - model.create_parallel_weight_legion_ordering(2, - dims, - this->data_type, - nullptr /*owner_op*/, - true /*create_grad*/, - kernel_initializer, - CHOSEN_SYNC_TYPE); - } - // create weight second - { - ParallelDim dims[2]; - int num_dims = inputs[0]->num_dims; - dims[1] = inputs[0]->dims[0]; - dims[1].size = dims[1].degree; - dims[1].is_replica_dim = true; - dims[0] = inputs[1]->dims[0]; - dims[0].size = inputs[1]->dims[0].size * rank; - weights[1] = - model.create_parallel_weight_legion_ordering(2, - dims, - this->data_type, - nullptr /*owner_op*/, - true /*create_grad*/, - kernel_initializer, - CHOSEN_SYNC_TYPE); - } - } // Create output tensor { int numdim = inputs[1]->num_dims; @@ -253,18 +159,6 @@ void LoraLinear::init_inference( EXCLUSIVE, batch_inputs[1]->region)); launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - weights[0]->region)); - launcher.add_field(2, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[1]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - weights[1]->region)); - launcher.add_field(3, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap_inference(ff, fm, output_tensor); @@ -295,27 +189,11 @@ 
OpMeta *LoraLinear::init_task(Task const *task, FID_DATA, ctx, runtime); - GenericTensorAccessorW weight_first = - helperGetGenericTensorAccessorRW(lora->weights[0]->data_type, - regions[2], - task->regions[2], - FID_DATA, - ctx, - runtime); - GenericTensorAccessorW weight_second = - helperGetGenericTensorAccessorRW(lora->weights[1]->data_type, - regions[3], - task->regions[3], - FID_DATA, - ctx, - runtime); int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; - int rank = lora->rank; int batch_size = output.domain.get_volume() / out_dim; assert(input.domain.get_volume() == in_dim * batch_size); - assert(weight_first.domain.get_volume() == in_dim * rank); - assert(weight_second.domain.get_volume() == out_dim * rank); + assert(output.domain.get_volume() == out_dim * batch_size); LoraLinearMeta *m = new LoraLinearMeta(handle, lora); m->trainable_inputs[0] = lora->trainable_inputs[0]; @@ -324,6 +202,96 @@ OpMeta *LoraLinear::init_task(Task const *task, return m; } +struct LoraLinearRegisterInfo { + LoraLinear const *lora; + PEFTModelID model_id; + int rank; +}; + +void LoraLinear::register_peft_model( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv, + PEFTModelID const &model_id, + int rank) { + assert(check_output_input_weight_same_parallel_is()); + assert(batch_inputs.size() == 2); + assert(batch_outputs.size() == 1); + // Assert that the output and the second input are mapped to the same + // region/part + assert(batch_outputs[0]->region == batch_inputs[1]->region); + assert(batch_outputs[0]->part == batch_inputs[1]->part); + // assert(check_output_input_weight_same_machine_view()); + // output is considered as an input to allow in-place optimization + ParallelTensor output_tensor = batch_outputs[0]; + parallel_is = output_tensor->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? 
mv : &output_tensor->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_inference(ff, argmap, output_tensor); + LoraLinearRegisterInfo info; + info.lora = this; + info.model_id = model_id; + info.rank = rank; + IndexLauncher launcher(LORA_LINEAR_REG_TASK_ID, + parallel_is, + TaskArgument(&info, sizeof(LoraLinearRegisterInfo)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); +} + +void LoraLinear::register_model_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + LoraLinearRegisterInfo const *info = + static_cast(task->args); + LoraLinearMeta *m = *((LoraLinearMeta **)task->local_args); + LoraLinear const *lora = info->lora; + int rank = info->rank; + int num_dims = lora->inputs[0]->num_dims; + int in_dim = lora->inputs[0]->dims[0].size / lora->inputs[0]->dims[0].degree; + int out_dim = lora->inputs[1]->dims[0].size / lora->inputs[1]->dims[0].degree; + DataType dt = m->input_type[0]; + assert(dt == m->input_type[1]); + assert(dt == m->output_type[1]); + assert(dt == lora->inputs[0]->data_type); + assert(m->model_weights.find(info->model_id) == m->model_weights.end()); + LoraLinearWeight weight; + PEFTWeightAllocator *allocator = m->handle.peft_weight_allocator; + weight.w0_ptr = allocator->allocate_local_weights_untyped( + info->model_id, rank * in_dim * data_type_size(dt)); + weight.w1_ptr = allocator->allocate_local_weights_untyped( + info->model_id, rank * out_dim * data_type_size(dt)); + weight.rank = rank; + if (lora->inputs[0]->dims[num_dims - 1].degree == 1) { + // Input is partitioned (no replication) + // w0_grad is local weight gradients + weight.w0_grad_ptr = allocator->allocate_local_weights_untyped( + info->model_id, rank * in_dim * data_type_size(dt)); + // w1_grad is sync weight gradients + weight.w1_grad_ptr = allocator->allocate_sync_weights_untyped( + info->model_id, rank * out_dim * data_type_size(dt)); + } else { + // Input is replicated + // w0_grad is sync weight gradients + weight.w0_grad_ptr = allocator->allocate_sync_weights_untyped( + info->model_id, rank * in_dim * data_type_size(dt)); + // w1_grad is local weight gradients + weight.w1_grad_ptr = allocator->allocate_local_weights_untyped( + info->model_id, rank * out_dim * data_type_size(dt)); + } + m->model_weights[info->model_id] = weight; +} + void LoraLinear::forward(FFModel const &ff) { assert(false && "LoraLinear does not support normal init"); } @@ -372,18 +340,6 @@ FutureMap EXCLUSIVE, batch_inputs[1]->region)); launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); - launcher.add_field(2, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[1]->region)); - launcher.add_field(3, FID_DATA); return runtime->execute_index_space(ctx, launcher); } @@ -404,28 +360,12 @@ void LoraLinear::inference_task(Task const *task, m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorRW( m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorR weight_first = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - GenericTensorAccessorR weight_second = 
helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); - int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; - int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; - int rank = weight_first.domain.get_volume() / in_dim; - assert(in_dim * rank == weight_first.domain.get_volume()); - assert(out_dim * rank == weight_second.domain.get_volume()); - - int num_infr_tokens = bc->num_active_infr_tokens(); - int num_peft_tokens = bc->num_active_peft_tokens(); - inference_kernel_wrapper(m, - input.ptr, - output.ptr, - weight_first.ptr, - weight_second.ptr, - in_dim, - out_dim, - rank, - num_infr_tokens, - num_peft_tokens); + // int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + // int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + + // int num_infr_tokens = bc->num_active_infr_tokens(); + // int num_peft_tokens = bc->num_active_peft_tokens(); + inference_kernel_wrapper(m, bc, input, output); } FutureMap LoraLinear::peft_bwd(FFModel const &ff, @@ -468,30 +408,6 @@ FutureMap LoraLinear::peft_bwd(FFModel const &ff, EXCLUSIVE, batch_inputs[1]->region)); launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); - launcher.add_field(2, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[1]->region)); - launcher.add_field(3, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - weights[0]->region_grad)); - launcher.add_field(4, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[1]->part_grad, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - weights[1]->region_grad)); - launcher.add_field(5, FID_DATA); return runtime->execute_index_space(ctx, launcher); } @@ -512,39 +428,14 @@ void LoraLinear::peft_bwd_task(Task const *task, GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorW output_grad = helperGetGenericTensorAccessorRW( + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorR weight_first = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - GenericTensorAccessorR weight_second = helperGetGenericTensorAccessorRO( - m->weight_type[1], regions[3], task->regions[3], FID_DATA, ctx, runtime); - GenericTensorAccessorW weight_first_grad = helperGetGenericTensorAccessorRW( - m->weight_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime); - GenericTensorAccessorW weight_second_grad = helperGetGenericTensorAccessorRW( - m->weight_type[1], regions[5], task->regions[5], FID_DATA, ctx, runtime); - - int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; - int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; - int rank = weight_first.domain.get_volume() / in_dim; - assert(in_dim * rank == weight_first.domain.get_volume()); - assert(out_dim * rank == weight_second.domain.get_volume()); - assert(weight_first.domain == weight_first_grad.domain); - assert(weight_second.domain == weight_second_grad.domain); - - int num_infr_tokens = bc->num_active_infr_tokens(); - int 
num_peft_tokens = bc->num_active_peft_tokens(); - peft_bwd_kernel_wrapper(m, - input_grad.ptr, - output_grad.ptr, - weight_first.ptr, - weight_second.ptr, - weight_first_grad.ptr, - weight_second_grad.ptr, - in_dim, - out_dim, - rank, - num_infr_tokens, - num_peft_tokens); + + // int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + // int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + // int num_infr_tokens = bc->num_active_infr_tokens(); + // int num_peft_tokens = bc->num_active_peft_tokens(); + peft_bwd_kernel_wrapper(m, bc, input_grad, output_grad); } void LoraLinear::backward(FFModel const &ff) { @@ -571,15 +462,12 @@ bool LoraLinear::measure_operator_cost(Simulator *sim, } bool operator==(LoraLinearParams const &lhs, LoraLinearParams const &rhs) { - return lhs.layer_guid == rhs.layer_guid && lhs.rank == rhs.rank && - lhs.data_type == rhs.data_type; + return lhs.layer_guid == rhs.layer_guid; } void LoraLinear::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); - sez.serialize(this->rank); - sez.serialize(this->data_type); } /* static */ @@ -589,18 +477,12 @@ Node LoraLinear::deserialize(FFModel &ff, ParallelTensor inputs[], int num_inputs) { assert(num_inputs == 2); - int rank; - DataType data_type; size_t id, transformer_layer_id; dez.deserialize(id); dez.deserialize(transformer_layer_id); LayerID layer_guid(id, transformer_layer_id); - dez.deserialize(rank); - dez.deserialize(data_type); LoraLinearParams params; - params.rank = rank; - params.data_type = data_type; params.layer_guid = layer_guid; return ff.get_or_create_node({inputs[0], inputs[1]}, params); } @@ -615,8 +497,6 @@ Op *LoraLinear::materialize(FFModel &ff, LoraLinearParams LoraLinear::get_params() const { LoraLinearParams params; params.layer_guid = this->layer_guid; - params.rank = this->rank; - params.data_type = this->data_type; return params; } @@ -633,8 +513,6 @@ size_t hash::operator()( FlexFlow::LoraLinearParams const ¶ms) const { size_t key = 0; hash_combine(key, params.layer_guid.id); - hash_combine(key, params.rank); - hash_combine(key, params.data_type); return key; } }; // namespace std diff --git a/src/runtime/fftype.cc b/src/runtime/fftype.cc index 2b94f07999..4c24af85cf 100644 --- a/src/runtime/fftype.cc +++ b/src/runtime/fftype.cc @@ -25,4 +25,20 @@ bool operator==(LayerID const &lhs, LayerID const &rhs) { return lhs.id == rhs.id; } +const PEFTModelID PEFTModelID::NO_ID = PEFTModelID(); + +PEFTModelID::PEFTModelID() : id(0) {} + +PEFTModelID::PEFTModelID(size_t _id) : id(_id) { + assert(is_valid_id()); +} + +bool PEFTModelID::is_valid_id() const { + return (id >= PEFT_MODEL_ID_FIRST_VALID && id <= PEFT_MODEL_ID_LAST_VALID); +} + +bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs) { + return lhs.id == rhs.id; +} + }; // namespace FlexFlow diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 2735513af2..50b9f5e402 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -6235,6 +6235,22 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(LORA_LINEAR_REG_TASK_ID, + "LoraLinear Model Registration"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "LoraLinear Model Registration Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + 
runtime->register_task_variant( + registrar); + } + } { TaskVariantRegistrar registrar(LORA_LINEAR_INF_TASK_ID, "LoraLinear Inference"); From ea8920b02af693b364f2a7986a5ce9e761ed4f11 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sat, 7 Oct 2023 23:38:28 -0400 Subject: [PATCH 009/198] add API for registering PEFT models --- include/flexflow/model.h | 7 ++- include/flexflow/ops/lora_linear.h | 1 - include/flexflow/request_manager.h | 13 +++-- inference/models/llama.cc | 2 +- src/ops/lora_linear.cc | 3 +- src/runtime/model.cc | 1 + src/runtime/request_manager.cc | 80 ++++++++++++++++++++++++++---- 7 files changed, 86 insertions(+), 21 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index cc8d2267cf..f98456a268 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -815,7 +815,10 @@ class FFModel { // Inference APIs // ======================================== GenerationResult generate(std::vector &prompts, - int max_seq_length); + int max_seq_length, + PEFTModelID peft_model_id = PEFTModelID::NO_ID); + + PEFTModelID register_peft_model(std::map config); Tensor create_tensor_legion_ordering(int num_dim, int const dims[], @@ -1112,7 +1115,7 @@ class FFModel { void clear_graph_search_cache(); public: - size_t op_global_guid, layer_global_guid; + size_t op_global_guid, layer_global_guid, peft_model_global_guid; size_t tensor_global_guid, parallel_tensor_global_guid, node_global_guid; size_t current_transformer_layer_id; // positional embedding start offset diff --git a/include/flexflow/ops/lora_linear.h b/include/flexflow/ops/lora_linear.h index 39d8925262..23dc8ec496 100644 --- a/include/flexflow/ops/lora_linear.h +++ b/include/flexflow/ops/lora_linear.h @@ -41,7 +41,6 @@ class LoraLinear : public Op { void register_peft_model(FFModel const &ff, std::vector const &batch_inputs, std::vector const &batch_outputs, - MachineView const *mv, PEFTModelID const &model_id, int rank); Legion::FutureMap inference(FFModel const &, diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 3081aaa1c2..da64ac58a2 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -57,6 +57,7 @@ struct Request { FINISHING = 104, // finishing request, but not yet verified }; BatchConfig::RequestGuid guid; + PEFTModelID peft_model_id; int max_sequence_length; int initial_len; int ssm_cache_size = 0; @@ -112,15 +113,19 @@ class RequestManager { GenerationResult generate_incr_decoding(FFModel *model, std::vector &prompts, - int max_seq_length); + int max_seq_length, + PEFTModelID peft_model_id); GenerationResult generate_spec_infer(FFModel *model, std::vector &prompts, - int max_seq_length); + int max_seq_length, + PEFTModelID peft_model_id); GenerationResult get_generation_result(RequestGuid const &guid); RequestGuid register_new_request(std::string const &prompt, - int max_sequence_length); + int max_sequence_length, + PEFTModelID peft_model_id); RequestGuid register_new_request(std::vector const &prompt, - int max_sequence_length); + int max_sequence_length, + PEFTModelID peft_model_id); bool is_request_completed(RequestGuid const &guid); BatchConfig prepare_next_batch(BatchConfig const &bc, InferenceResult const &result); diff --git a/inference/models/llama.cc b/inference/models/llama.cc index f90040170e..2fe5642507 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -220,7 +220,7 @@ void LLAMA::create_llama_model(FFModel &ff, std::string("layers_" + std::to_string(i) + 
"_feed_forward_w2") .c_str()); // Low-Rank Adapter (LoRA) for the second linear layer - ff.lora_linear(multi, w2); + ff.lora_linear(multi, w2, "lora_mlp_linear_second"); } // final normalization and linear Tensor final_rms_norm_output[2] = {nullptr, nullptr}; diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 665c5cb4c5..4c92d6cb6c 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -212,7 +212,6 @@ void LoraLinear::register_peft_model( FFModel const &ff, std::vector const &batch_inputs, std::vector const &batch_outputs, - MachineView const *mv, PEFTModelID const &model_id, int rank) { assert(check_output_input_weight_same_parallel_is()); @@ -229,7 +228,7 @@ void LoraLinear::register_peft_model( ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; - MachineView const *view = mv ? mv : &output_tensor->machine_view; + MachineView const *view = &output_tensor->machine_view; size_t machine_view_hash = view->hash(); set_argumentmap_for_inference(ff, argmap, output_tensor); LoraLinearRegisterInfo info; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 50b9f5e402..91361e0cc7 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1535,6 +1535,7 @@ FFRuntime *ffruntime_singleton = nullptr; FFModel::FFModel(FFConfig &_config, bool cpu_offload) : op_global_guid(OP_GUID_FIRST_VALID), layer_global_guid(LAYER_GUID_FIRST_VALID), + peft_model_global_guid(PEFT_MODEL_ID_FIRST_VALID), tensor_global_guid(TENSOR_GUID_FIRST_VALID), parallel_tensor_global_guid(PARALLEL_TENSOR_GUID_FIRST_VALID), node_global_guid(NODE_GUID_FIRST_VALID), current_transformer_layer_id(0), diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 1b825318dd..1616054148 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -14,6 +14,7 @@ */ #include "flexflow/request_manager.h" +#include "flexflow/ops/lora_linear.h" #include "flexflow/parallel_ops/parallel_op.h" // #include "flexflow/tokenizers.h" #include @@ -175,7 +176,8 @@ size_t RequestManager::get_num_ssms() { RequestManager::RequestGuid RequestManager::register_new_request(std::vector const &prompt, - int max_sequence_length) { + int max_sequence_length, + PEFTModelID peft_model_id) { const std::lock_guard lock(request_queue_mutex); // Add a new request @@ -183,6 +185,7 @@ RequestManager::RequestGuid request.status = Request::PENDING; request.guid = next_available_guid++; request.max_sequence_length = max_sequence_length; + request.peft_model_id = peft_model_id; if (prompt.size() >= get_max_sequence_length()) { std::cout << "Warning: too many tokens in prompt, only load up to " @@ -231,13 +234,15 @@ RequestManager::RequestGuid RequestManager::RequestGuid RequestManager::register_new_request(std::string const &prompt, - int max_sequence_length) { + int max_sequence_length, + PEFTModelID peft_model_id) { const std::lock_guard lock(request_queue_mutex); // Add a new request Request request; request.status = Request::PENDING; request.guid = next_available_guid++; request.max_sequence_length = max_sequence_length; + request.peft_model_id = peft_model_id; if (bos_token_id >= 0 && model_type != ModelType::FALCON) { request.tokens.push_back(bos_token_id); } @@ -439,6 +444,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; + 
new_bc.requestsInfo[i].peft_model_id = + old_bc.requestsInfo[i].peft_model_id; if (new_bc.requestsInfo[i].token_start_offset + 1 == request.tokens.size()) { // Incremental phase @@ -477,6 +484,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, (int)new_request.tokens.size()); new_bc.requestsInfo[i].max_sequence_length = new_request.max_sequence_length; + new_bc.requestsInfo[i].peft_model_id = new_request.peft_model_id; new_bc.request_completed[i] = false; // add profile_info for the new request ProfileInfo profile_info; @@ -1795,24 +1803,71 @@ std::vector> } GenerationResult FFModel::generate(std::vector &prompts, - int max_seq_length) { + int max_seq_length, + PEFTModelID peft_model_id) { RequestManager *rm = RequestManager::get_request_manager(); if (rm->get_num_ssms() == 0) { // No SSMs: perform incremental decoding - return rm->generate_incr_decoding(this, prompts, max_seq_length); + return rm->generate_incr_decoding( + this, prompts, max_seq_length, peft_model_id); } else { // Registered SSMs: perform speculative inference - return rm->generate_spec_infer(this, prompts, max_seq_length); + return rm->generate_spec_infer( + this, prompts, max_seq_length, peft_model_id); } } +PEFTModelID FFModel::register_peft_model(std::map configs) { + PEFTModelID peft_model_id(peft_model_global_guid++); + InferenceManager *im = InferenceManager::get_inference_manager(); + for (size_t op = 0; op < operators.size(); op++) { + if (operators[op]->op_type == OP_LORA_LINEAR) { + std::string opname(operators[op]->name); + // Remove the guid and the ``_'' char from opname: guid has 7 digits + // and ``_'' occupies 1 char + opname.erase(opname.length() - 8); + assert(configs.find(opname) != configs.end()); + int rank = configs[opname]; + LoraLinear *lora = static_cast(operators[op]); + // Currently assume only a single data pipeline + assert(config.data_parallelism_degree == 1); + std::vector inputs(lora->numInputs); + std::vector outputs(lora->numOutputs); + + for (int i = 0; i < lora->numInputs; i++) { + assert(im->tensor_buffer.find(lora->inputs[i]) != + im->tensor_buffer.end()); + assert(lora->inputs[i] != nullptr); + assert(lora->inputs[i]->parallel_is != IndexSpace::NO_SPACE); + assert(im->tensor_buffer[lora->inputs[i]].size() == 1); + inputs[i] = im->tensor_buffer[lora->inputs[i]][0]; + assert(inputs[i]->parallel_is != IndexSpace::NO_SPACE); + } + for (int i = 0; i < lora->numOutputs; i++) { + assert(im->tensor_buffer.find(lora->outputs[i]) != + im->tensor_buffer.end()); + assert(lora->outputs[i] != nullptr); + assert(lora->outputs[i]->parallel_is != IndexSpace::NO_SPACE); + assert(im->tensor_buffer[lora->outputs[i]].size() == 1); + outputs[i] = im->tensor_buffer[lora->outputs[i]][0]; + assert(outputs[i]->parallel_is != IndexSpace::NO_SPACE); + } + lora->register_peft_model(*this, inputs, outputs, peft_model_id, rank); + } + } + return peft_model_id; +} + /*static*/ -GenerationResult RequestManager::generate_incr_decoding( - FFModel *llm, std::vector &prompts, int max_seq_length) { +GenerationResult + RequestManager::generate_incr_decoding(FFModel *llm, + std::vector &prompts, + int max_seq_length, + PEFTModelID peft_model_id) { InferenceManager *im = InferenceManager::get_inference_manager(); RequestGuid guid; for (int i = 0; i < prompts.size(); i++) { - guid = register_new_request(prompts.at(i), max_seq_length); + guid = register_new_request(prompts.at(i), max_seq_length, peft_model_id); } if (guid == 0) { @@ -1864,12 +1919,15 @@ GenerationResult 
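// A minimal usage sketch (assumed caller-side code, mirroring the incr_decoding
// driver later in this series; it presumes an FFModel `model` and a
// std::vector<std::string> `prompts` are already set up): register a LoRA
// adapter of a given rank for a named layer, then pass the returned id to
// generate() so the RequestManager tags each request with it.
std::map<std::string, int> peft_config;
peft_config["lora_mlp_linear_second"] = 4; // layer name -> LoRA rank
PEFTModelID peft_model_id = model.register_peft_model(peft_config);
GenerationResult result =
    model.generate(prompts, 128 /*max_sequence_length*/, peft_model_id);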
RequestManager::generate_incr_decoding( } /*static*/ -GenerationResult RequestManager::generate_spec_infer( - FFModel *llm, std::vector &prompts, int max_seq_length) { +GenerationResult + RequestManager::generate_spec_infer(FFModel *llm, + std::vector &prompts, + int max_seq_length, + PEFTModelID peft_model_id) { InferenceManager *im = InferenceManager::get_inference_manager(); RequestGuid guid; for (int i = 0; i < prompts.size(); i++) { - guid = register_new_request(prompts.at(i), max_seq_length); + guid = register_new_request(prompts.at(i), max_seq_length, peft_model_id); } if (guid == 0) { std::cout From 44cc16b314d4241f6303519a00537a20cc66c3b2 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sun, 8 Oct 2023 15:52:47 -0400 Subject: [PATCH 010/198] bug fix --- src/ops/fused.cu | 11 ++--------- src/ops/kernels/lora_linear_kernels.cu | 5 ++--- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/src/ops/fused.cu b/src/ops/fused.cu index f6d8365f1f..948b8c0885 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -707,23 +707,16 @@ __host__ void assert(fused->op_num_outputs[op] == 1); Domain input_domain = my_input_accessor[0].domain; Domain output_domain = my_output_accessor[0].domain; - Domain weight_first_domain = my_weight_accessor[0].domain; - Domain weight_second_domain = my_weight_accessor[1].domain; int in_dim = input_domain.hi()[0] - input_domain.lo()[0] + 1; int out_dim = output_domain.hi()[0] - output_domain.lo()[0] + 1; - int rank = weight_first_domain.get_volume() / in_dim; - assert(in_dim * rank == weight_first_domain.get_volume()); - assert(out_dim * rank == weight_second_domain.get_volume()); int batch_size = my_input_accessor[0].domain.get_volume() / in_dim; assert(my_output_accessor[0].domain.get_volume() == out_dim * batch_size); - assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); + assert(my_input_accessor[0].domain.get_volume() == + in_dim * batch_size); LoraLinearMeta *m = (LoraLinearMeta *)metas->meta[op]; - assert(fused->op_num_weights[op] == 2); assert(m->input_type[0] == my_input_accessor[0].data_type); assert(m->output_type[0] == my_output_accessor[0].data_type); - int num_infr_tokens = bc->num_active_infr_tokens(); - int num_peft_tokens = bc->num_active_peft_tokens(); // Assert that the output and the second input are at the same place // since we ``inplace'' the output for LoRA assert(my_input_accessor[1].ptr == my_output_accessor[0].ptr); diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 1e9069fa72..ab1ae1b49d 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -141,11 +141,10 @@ void inference_kernel(LoraLinearMeta *m, checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); DT alpha = 1.0f, beta = 0.0f; cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); - cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); - assert(m->weight_type[1] == m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->input_type[1]); cudaDataType_t lr_actv_type = output_type; - assert(input_type == weight_type && weight_type == output_type); + assert(input_type == output_type); + cudaDataType_t weight_type = output_type; #if CUDA_VERSION >= 11000 // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance From 29e5547cb7d4381db129131d52a14345dfb94b22 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sun, 8 Oct 2023 15:53:13 -0400 Subject: [PATCH 011/198] format --- src/ops/fused.cu | 3 +-- 1 file 
changed, 1 insertion(+), 2 deletions(-) diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 948b8c0885..d70d01013c 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -712,8 +712,7 @@ __host__ void int batch_size = my_input_accessor[0].domain.get_volume() / in_dim; assert(my_output_accessor[0].domain.get_volume() == out_dim * batch_size); - assert(my_input_accessor[0].domain.get_volume() == - in_dim * batch_size); + assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); LoraLinearMeta *m = (LoraLinearMeta *)metas->meta[op]; assert(m->input_type[0] == my_input_accessor[0].data_type); assert(m->output_type[0] == my_output_accessor[0].data_type); From dfd1c9a0a8e28e937445fb0fdd4ea0786ca7c2f7 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sun, 8 Oct 2023 18:03:58 -0400 Subject: [PATCH 012/198] add reserved work space for peft activations and weights --- include/flexflow/config.h | 8 ++- inference/incr_decoding/incr_decoding.cc | 7 +- src/ops/kernels/lora_linear_kernels.cu | 11 +++- src/ops/lora_linear.cc | 4 +- src/runtime/model.cc | 12 ++++ src/runtime/model.cu | 50 ++++++++++++++ src/runtime/request_manager.cc | 83 +++++++++++++++--------- 7 files changed, 140 insertions(+), 35 deletions(-) diff --git a/include/flexflow/config.h b/include/flexflow/config.h index 60d1cb17d2..6fd4b957dc 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -80,10 +80,10 @@ struct FFHandler { void *offload_reserve_space; size_t offload_reserve_space_size; // PEFT related fields - void *peft_activation_reserve_space; - size_t peft_activation_reserve_space_size; MemoryAllocator *peft_activation_allocator; + size_t peft_activation_reserve_space_size; PEFTWeightAllocator *peft_weight_allocator; + size_t peft_weight_reserve_space_size; // Quantization fields DataType quantization_type; bool allowTensorOpMathConversion; @@ -96,6 +96,7 @@ struct FFInitInfo { size_t workSpaceSize; size_t offload_reserve_space_size; size_t peft_activation_reserve_space_size; + size_t peft_weight_reserve_space_size; DataType quantization_type; bool allowTensorOpMathConversion; // int myRank, allRanks; @@ -151,6 +152,9 @@ class FFConfig { bool cpu_offload; size_t offload_reserve_space_size; DataType quantization_type; + // PEFT related fields + size_t peft_activation_reserve_space_size; + size_t peft_weight_reserve_space_size; // Control parallelizable dimensions bool only_data_parallel; bool enable_sample_parallel; diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 463bc10151..277d86c9cc 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -257,6 +257,11 @@ void FlexFlow::top_level_task(Task const *task, assert(false && "unknow model type"); } + // Register PEFT layer + std::map peft_config; + peft_config["lora_mlp_linear_second"] = 4; + PEFTModelID peft_model_id = model.register_peft_model(peft_config); + int total_num_requests = 0; { using json = nlohmann::json; @@ -274,7 +279,7 @@ void FlexFlow::top_level_task(Task const *task, prompts.push_back(text); } GenerationResult result = - model.generate(prompts, 128 /*max_sequence_length*/); + model.generate(prompts, 128 /*max_sequence_length*/, peft_model_id); } // Execution fence diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index ab1ae1b49d..eab98a24e7 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -183,6 +183,7 @@ void 
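// A minimal CPU sketch (an illustrative assumption, not the kernel below) of
// the math the two cuBLAS GEMMs in inference_kernel appear to implement:
// low_rank = A * x with A = weight_first (rank x in_dim), then y += B * low_rank
// with B = weight_second (out_dim x rank), i.e. the LoRA update accumulated in
// place into the dense layer's output. Any LoRA scaling factor is omitted.
#include <vector>
void lora_forward_reference(std::vector<float> const &x,              // in_dim
                            std::vector<std::vector<float>> const &A, // rank x in_dim
                            std::vector<std::vector<float>> const &B, // out_dim x rank
                            std::vector<float> &y) {                  // out_dim, updated in place
  size_t const rank = A.size();
  std::vector<float> low_rank(rank, 0.0f);
  for (size_t r = 0; r < rank; r++) {
    for (size_t k = 0; k < x.size(); k++) {
      low_rank[r] += A[r][k] * x[k]; // first GEMM: project the input down to the LoRA rank
    }
  }
  for (size_t o = 0; o < y.size(); o++) {
    for (size_t r = 0; r < rank; r++) {
      y[o] += B[o][r] * low_rank[r]; // second GEMM: project back up and accumulate (beta = 1)
    }
  }
}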
inference_kernel(LoraLinearMeta *m, LoraLinearWeight weight = m->model_weights[bc->requestsInfo[i].peft_model_id]; int rank = weight.rank; + void *intermediate_result_ptr = nullptr; if (bc->requestsInfo[i].peft_bwd) { MemoryAllocator *allocator = m->handle.peft_activation_allocator; m->input_activation = allocator->allocate_instance_untyped( @@ -196,6 +197,12 @@ void inference_kernel(LoraLinearMeta *m, num_peft_tokens * in_dim, cudaMemcpyDeviceToDevice, stream)); + intermediate_result_ptr = m->low_rank_activation; + } else { + // use workspace to save intermediate result + assert(m->handle.workSpaceSize >= + data_type_size(m->input_type[1]) * num_peft_tokens * rank); + intermediate_result_ptr = m->handle.workSpace; } // buffer = weight_first * input checkCUDA(cublasGemmEx(m->handle.blas, @@ -212,7 +219,7 @@ void inference_kernel(LoraLinearMeta *m, input_type, in_dim, &beta, - m->low_rank_activation, + intermediate_result_ptr, lr_actv_type, rank, compute_type, @@ -230,7 +237,7 @@ void inference_kernel(LoraLinearMeta *m, weight.w1_ptr, weight_type, rank, - m->low_rank_activation, + intermediate_result_ptr, lr_actv_type, rank, &alpha, diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 4c92d6cb6c..17ab2d659b 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -261,8 +261,10 @@ void LoraLinear::register_model_task(Task const *task, int out_dim = lora->inputs[1]->dims[0].size / lora->inputs[1]->dims[0].degree; DataType dt = m->input_type[0]; assert(dt == m->input_type[1]); - assert(dt == m->output_type[1]); + assert(dt == m->output_type[0]); assert(dt == lora->inputs[0]->data_type); + assert(dt == lora->inputs[1]->data_type); + assert(dt == lora->outputs[0]->data_type); assert(m->model_weights.find(info->model_id) == m->model_weights.end()); LoraLinearWeight weight; PEFTWeightAllocator *allocator = m->handle.peft_weight_allocator; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 91361e0cc7..e74e5e11aa 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1508,6 +1508,9 @@ FFRuntime::FFRuntime(FFConfig &config) { info.workSpaceSize = config.workSpaceSize; info.offload_reserve_space_size = config.cpu_offload ? 
config.offload_reserve_space_size : 0; + info.peft_activation_reserve_space_size = + config.peft_activation_reserve_space_size; + info.peft_weight_reserve_space_size = config.peft_weight_reserve_space_size; info.quantization_type = config.quantization_type; info.allowTensorOpMathConversion = config.allow_tensor_op_math_conversion; argmap.set_point(*it, TaskArgument(&info, sizeof(FFInitInfo))); @@ -3991,6 +3994,11 @@ struct DefaultConfig { const static bool searchOverlapBackwardUpdate = false; const static size_t offloadReserveSpaceSize = (size_t)8 * 1024 * 1024 * 1024; // 8 GB + // PEFT related fields + const static size_t peftActivationReserveSpaceSize = + (size_t)1 * 1024 * 1024 * 1024; // 1GB + const static size_t peftWeightReserveSpaceSize = + (size_t)1 * 1024 * 1024 * 1024; // 1GB const static bool cpuOffload = false; const static bool onlyDataParallel = true; const static bool enableSampleParallel = true; @@ -4025,6 +4033,10 @@ FFConfig::FFConfig() { computationMode = COMP_MODE_TRAINING; cpu_offload = DefaultConfig::cpuOffload; offload_reserve_space_size = DefaultConfig::offloadReserveSpaceSize; + // PEFT related fields + peft_activation_reserve_space_size = + DefaultConfig::peftActivationReserveSpaceSize; + peft_weight_reserve_space_size = DefaultConfig::peftWeightReserveSpaceSize; quantization_type = DT_NONE; only_data_parallel = DefaultConfig::onlyDataParallel; data_parallelism_degree = 1; diff --git a/src/runtime/model.cu b/src/runtime/model.cu index 17401a0f14..0c69c9a600 100644 --- a/src/runtime/model.cu +++ b/src/runtime/model.cu @@ -14,6 +14,8 @@ */ #include "flexflow/model.h" #include "flexflow/utils/cuda_helper.h" +#include "flexflow/utils/memory_allocator.h" +#include "flexflow/utils/peft_weight_allocator.h" namespace FlexFlow { // declare Legion names @@ -152,6 +154,54 @@ FFHandler handle.offload_reserve_space = nullptr; } + if (info->peft_activation_reserve_space_size > 0) { + // allocate memory for peft activation reserve space + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + Realm::Rect<1, coord_t> bounds( + Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(info->peft_activation_reserve_space_size - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance workspaceInst; + Realm::RegionInstance::create_instance(workspaceInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + void *ptr = workspaceInst.pointer_untyped(0, sizeof(char)); + handle.peft_activation_allocator = new MemoryAllocator(gpu_mem); + handle.peft_activation_allocator->register_reserved_work_space( + ptr, info->peft_activation_reserve_space_size); + } + + if (info->peft_weight_reserve_space_size > 0) { + // allocate memory for peft weight reserve space + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + Realm::Rect<1, coord_t> bounds( + Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(info->peft_weight_reserve_space_size - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance workspaceInst; + Realm::RegionInstance::create_instance(workspaceInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + void *ptr = workspaceInst.pointer_untyped(0, sizeof(char)); + handle.peft_weight_allocator = + new PEFTWeightAllocator(ptr, 
info->peft_weight_reserve_space_size); + } // checkCUDA(cudaMalloc(&handle.workSpace, handle.workSpaceSize)); #ifdef FF_USE_NCCL handle.ncclComm = NULL; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 1616054148..05eb3bb554 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -14,6 +14,7 @@ */ #include "flexflow/request_manager.h" +#include "flexflow/ops/fused.h" #include "flexflow/ops/lora_linear.h" #include "flexflow/parallel_ops/parallel_op.h" // #include "flexflow/tokenizers.h" @@ -1817,42 +1818,66 @@ GenerationResult FFModel::generate(std::vector &prompts, } } +std::string find_layer_name_from_guid(FFModel *model, LayerID guid) { + for (size_t i = 0; i < model->layers.size(); i++) { + if (model->layers[i]->layer_guid == guid) { + std::string layer_name(model->layers[i]->name); + return layer_name; + } + } + assert(false); + return "invalid_layer_name"; +} + PEFTModelID FFModel::register_peft_model(std::map configs) { PEFTModelID peft_model_id(peft_model_global_guid++); InferenceManager *im = InferenceManager::get_inference_manager(); + std::vector peft_operators; for (size_t op = 0; op < operators.size(); op++) { if (operators[op]->op_type == OP_LORA_LINEAR) { - std::string opname(operators[op]->name); - // Remove the guid and the ``_'' char from opname: guid has 7 digits - // and ``_'' occupies 1 char - opname.erase(opname.length() - 8); - assert(configs.find(opname) != configs.end()); - int rank = configs[opname]; - LoraLinear *lora = static_cast(operators[op]); - // Currently assume only a single data pipeline - assert(config.data_parallelism_degree == 1); - std::vector inputs(lora->numInputs); - std::vector outputs(lora->numOutputs); - - for (int i = 0; i < lora->numInputs; i++) { - assert(im->tensor_buffer.find(lora->inputs[i]) != - im->tensor_buffer.end()); - assert(lora->inputs[i] != nullptr); - assert(lora->inputs[i]->parallel_is != IndexSpace::NO_SPACE); - assert(im->tensor_buffer[lora->inputs[i]].size() == 1); - inputs[i] = im->tensor_buffer[lora->inputs[i]][0]; - assert(inputs[i]->parallel_is != IndexSpace::NO_SPACE); + peft_operators.push_back(operators[op]); + } else if (operators[op]->op_type == OP_FUSED) { + FusedOp *fused = static_cast(operators[op]); + for (size_t op2 = 0; op2 < fused->numOperators; op2++) { + if (fused->operators[op2]->op_type == OP_LORA_LINEAR) { + peft_operators.push_back(fused->operators[op2]); + } + } + } + } + for (size_t op = 0; op < peft_operators.size(); op++) { + std::string layer_name = + find_layer_name_from_guid(this, peft_operators[op]->layer_guid); + switch (peft_operators[op]->op_type) { + case OP_LORA_LINEAR: { + // Remove the guid and the ``_'' char from opname: guid has 7 digits + // and ``_'' occupies 1 char + layer_name = layer_name.erase(layer_name.length() - 8); + assert(configs.find(layer_name) != configs.end()); + int rank = configs[layer_name]; + LoraLinear *lora = static_cast(peft_operators[op]); + // Currently assume only a single data pipeline + assert(config.data_parallelism_degree == 1); + std::vector inputs(lora->numInputs); + std::vector outputs(lora->numOutputs); + + for (int i = 0; i < lora->numInputs; i++) { + assert(im->tensor_buffer.find(lora->inputs[i]) != + im->tensor_buffer.end()); + assert(lora->inputs[i] != nullptr); + assert(lora->inputs[i]->parallel_is != IndexSpace::NO_SPACE); + assert(im->tensor_buffer[lora->inputs[i]].size() == 1); + inputs[i] = im->tensor_buffer[lora->inputs[i]][0]; + assert(inputs[i]->parallel_is != 
IndexSpace::NO_SPACE); + } + assert(lora->numOutputs == 1); + outputs[0] = inputs[1]; + lora->register_peft_model(*this, inputs, outputs, peft_model_id, rank); + break; } - for (int i = 0; i < lora->numOutputs; i++) { - assert(im->tensor_buffer.find(lora->outputs[i]) != - im->tensor_buffer.end()); - assert(lora->outputs[i] != nullptr); - assert(lora->outputs[i]->parallel_is != IndexSpace::NO_SPACE); - assert(im->tensor_buffer[lora->outputs[i]].size() == 1); - outputs[i] = im->tensor_buffer[lora->outputs[i]][0]; - assert(outputs[i]->parallel_is != IndexSpace::NO_SPACE); + default: { + assert(false && "Unsupported PEFT Operator type"); } - lora->register_peft_model(*this, inputs, outputs, peft_model_id, rank); } } return peft_model_id; From e6f671d076a0ae08709081ef1f4d8f1b51802c83 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 11 Oct 2023 01:41:17 -0400 Subject: [PATCH 013/198] fix merge conflicts, implement layernorm peft_bwd --- include/flexflow/fftype.h | 2 + include/flexflow/model.h | 1 + include/flexflow/ops/layer_norm.h | 28 +++ src/ops/fused.cu | 10 +- src/ops/layer_norm.cc | 180 +++++++++++++--- src/ops/layer_norm.cpp | 62 ++++-- src/ops/layer_norm.cu | 274 ++++++++++++++++-------- src/ops/linear.cc | 2 +- src/ops/lora_linear.cc | 8 +- src/runtime/batch_config.cc | 9 +- src/runtime/beam_search_batch_config.cc | 3 + src/runtime/fftype.cc | 9 + src/runtime/model.cc | 15 ++ src/runtime/tree_verify_batch_config.cc | 3 + 14 files changed, 467 insertions(+), 139 deletions(-) diff --git a/include/flexflow/fftype.h b/include/flexflow/fftype.h index 2722e00f9c..099b58c82e 100644 --- a/include/flexflow/fftype.h +++ b/include/flexflow/fftype.h @@ -26,6 +26,8 @@ class PEFTModelID { PEFTModelID(size_t id); bool is_valid_id() const; friend bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs); + friend std::ostream &operator<<(std::ostream &os, + PEFTModelID const &peft_model_id); public: size_t id; diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 0f33d2c7ea..e2530bcc90 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -106,6 +106,7 @@ enum TaskIDs { LAYERNORM_FWD_TASK_ID, LAYERNORM_INF_TASK_ID, LAYERNORM_BWD_TASK_ID, + LAYERNORM_PEFT_BWD_TASK_ID, RESIDUAL_LAYERNORM_INIT_TASK_ID, RESIDUAL_LAYERNORM_INF_TASK_ID, ADD_BIAS_RESIDUAL_LAYERNORM_INIT_TASK_ID, diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h index 9e48d81190..389b3e718a 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -37,6 +37,11 @@ class LayerNorm : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -67,6 +72,10 @@ class LayerNorm : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void backward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, @@ -86,6 +95,12 @@ class LayerNorm : public Op { GenericTensorAccessorW &output, GenericTensorAccessorR const &gamma, GenericTensorAccessorR const &beta); + static void inference_kernel_wrapper(LayerNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + 
GenericTensorAccessorW &output, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta); template static void backward_kernel(LayerNormMeta const *m, T const *output_grad_ptr, @@ -103,6 +118,17 @@ class LayerNorm : public Op { T const *gamma_ptr, T *gamma_grad_ptr, T *beta_grad_ptr); + template + static void peft_bwd_kernel(LayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T const *gamma_ptr, + ffStream_t stream); + template + static void peft_bwd_kernel_wrapper(LayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T const *gamma_ptr); public: bool elementwise_affine, use_bias; @@ -124,6 +150,8 @@ class LayerNormMeta : public OpMeta { float eps; void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; Realm::RegionInstance reserveInst; + // PEFT related fields + void *input_activation; }; }; // namespace FlexFlow diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 071078b324..9aa4291453 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -1127,14 +1127,20 @@ __host__ void } for (int i = 0; i < fused->op_num_weights[op]; i++) { assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); - weight_accessors_to_save.push_back(weight_accessor[fused->op_weight_idx[i + woff]]); + weight_accessors_to_save.push_back( + weight_accessor[fused->op_weight_idx[i + woff]]); } for (int i = 0; i < fused->op_num_outputs[op]; i++) { output_accessors_to_save.push_back(output_accessor[i + ooff]); } assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; - FusedOp::save_inference_tensors_to_file(metas->meta[op], shard_id, bc, input_accessors_to_save, weight_accessors_to_save, output_accessors_to_save); + FusedOp::save_inference_tensors_to_file(metas->meta[op], + shard_id, + bc, + input_accessors_to_save, + weight_accessors_to_save, + output_accessors_to_save); } ioff += fused->op_num_inputs[op]; woff += fused->op_num_weights[op]; diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index bc1358e49c..784e40c598 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -14,6 +14,7 @@ */ #include "flexflow/ops/layer_norm.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/model.h" #include "flexflow/utils/hash_utils.h" #include "legion/legion_utilities.h" @@ -561,7 +562,7 @@ void LayerNorm::inference_task(Task const *task, assert(regions.size() == 2); } - LayerNorm::forward_kernel_wrapper(m, in, out, gamma, beta); + LayerNorm::inference_kernel_wrapper(m, bc, in, out, gamma, beta); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); @@ -645,6 +646,115 @@ void LayerNorm::forward_task(Task const *task, LayerNorm::forward_kernel_wrapper(m, in, out, gamma, beta); } +Legion::FutureMap + LayerNorm::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + /* std::cout << "LayerNorm op machine_view: " << *(MachineView const *)mv + << std::endl; */ + IndexLauncher launcher(LAYERNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + // regions[0](I): output_grad + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(0, FID_DATA); + // regions[1](I/O): input_grad + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(2, FID_DATA); + if (elementwise_affine) { + // regions[2](I): gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(3, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): output_grad + regions[1](I/O): input_grad + regions[2](I): gamma +*/ +void LayerNorm::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + LayerNormMeta const *m = *((LayerNormMeta **)task->local_args); + assert(task->regions.size() == regions.size()); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + // GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + // m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, + // runtime); + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorR gamma; + GenericTensorAccessorW gamma_grad, beta_grad; + Domain out_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + // Domain in_domain = runtime->get_index_space_domain( + // ctx, task->regions[1].region.get_index_space()); + Domain in_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + // assert(in_domain == out_grad_domain); + // assert(in_domain.get_volume() == + // m->effective_num_elements * m->effective_batch_size); + + if (m->elementwise_affine) { + assert(m->use_bias == (regions.size() == 3)); + gamma = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); + Domain gamma_domain = runtime->get_index_space_domain( + ctx, task->regions[2].region.get_index_space()); + + assert(gamma_domain.get_volume() == m->effective_num_elements); + } else { + assert(regions.size() == 2); + } + if (m->output_type[0] == DT_FLOAT) { + LayerNorm::peft_bwd_kernel_wrapper(m, + output_grad.get_float_ptr(), + // input.get_float_ptr(), + input_grad.get_float_ptr(), + gamma.get_float_ptr()); + } else { + LayerNorm::peft_bwd_kernel_wrapper(m, + output_grad.get_half_ptr(), + // input.get_half_ptr(), + input_grad.get_half_ptr(), + gamma.get_half_ptr()); + } +} + void LayerNorm::backward(FFModel const &ff) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; @@ -722,55 +832,75 @@ void LayerNorm::backward_task(Task const *task, Runtime *runtime) { LayerNormMeta const *m = *((LayerNormMeta **)task->local_args); 
assert(task->regions.size() == regions.size()); - float const *in_ptr = NULL, *out_grad_ptr = NULL, *gamma_ptr = NULL; - float *in_grad_ptr = NULL, *gamma_grad_ptr = NULL, *beta_grad_ptr = NULL; + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorR gamma; + GenericTensorAccessorW gamma_grad, beta_grad; Domain out_grad_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - out_grad_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); Domain in_domain = runtime->get_index_space_domain( ctx, task->regions[1].region.get_index_space()); - in_ptr = helperGetTensorPointerRO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); Domain in_grad_domain = runtime->get_index_space_domain( ctx, task->regions[2].region.get_index_space()); - in_grad_ptr = helperGetTensorPointerRW( - regions[2], task->regions[2], FID_DATA, ctx, runtime); assert(in_domain == out_grad_domain); assert(in_domain.get_volume() == m->effective_num_elements * m->effective_batch_size); + if (m->elementwise_affine) { assert(m->use_bias == (regions.size() == 6)); + gamma = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); + gamma_grad = helperGetGenericTensorAccessorRW(m->output_type[0], + regions[4], + task->regions[4], + FID_DATA, + ctx, + runtime); Domain gamma_domain = runtime->get_index_space_domain( ctx, task->regions[3].region.get_index_space()); - gamma_ptr = helperGetTensorPointerRO( - regions[3], task->regions[3], FID_DATA, ctx, runtime); Domain gamma_grad_domain = runtime->get_index_space_domain( ctx, task->regions[4].region.get_index_space()); - gamma_grad_ptr = helperGetTensorPointerRW( - regions[4], task->regions[4], FID_DATA, ctx, runtime); if (m->use_bias) { Domain beta_grad_domain = runtime->get_index_space_domain( ctx, task->regions[5].region.get_index_space()); - beta_grad_ptr = helperGetTensorPointerRW( - regions[5], task->regions[5], FID_DATA, ctx, runtime); + beta_grad = helperGetGenericTensorAccessorRW(m->output_type[0], + regions[5], + task->regions[5], + FID_DATA, + ctx, + runtime); assert(gamma_domain == beta_grad_domain); } - assert(gamma_domain == gamma_grad_domain); - assert(gamma_domain.get_volume() == m->effective_num_elements); } else { assert(regions.size() == 3); } - - LayerNorm::backward_kernel_wrapper(m, - out_grad_ptr, - in_ptr, - in_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr); + if (m->output_type[0] == DT_FLOAT) { + LayerNorm::backward_kernel_wrapper(m, + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), + gamma.get_float_ptr(), + gamma_grad.get_float_ptr(), + beta_grad.get_float_ptr()); + } else { + LayerNorm::backward_kernel_wrapper(m, + output_grad.get_half_ptr(), + input.get_half_ptr(), + input_grad.get_half_ptr(), + gamma.get_half_ptr(), + gamma_grad.get_half_ptr(), + beta_grad.get_half_ptr()); + } } bool LayerNorm::measure_operator_cost(Simulator *sim, diff --git a/src/ops/layer_norm.cpp b/src/ops/layer_norm.cpp index 07dbdb3dfb..9beb655d1d 100644 --- a/src/ops/layer_norm.cpp +++ 
b/src/ops/layer_norm.cpp @@ -236,13 +236,13 @@ __global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, } template -__global__ void LayerNormBackwardCUDAKenrel(int64_t N, +__global__ void LayerNormBackwardCUDAKernel(int64_t N, T const *dY, T const *X, T const *gamma, - T const *a, - T const *b, - T const *c, + T const *dY_scale, + T const *X_scale, + T const *bias, T *dX) { using T_ACC = T; const int64_t i = blockIdx.x; @@ -250,9 +250,9 @@ __global__ void LayerNormBackwardCUDAKenrel(int64_t N, const int64_t index = i * N + j; const T_ACC gamma_v = gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); - dX[index] = - static_cast(a[i]) * static_cast(dY[index]) * gamma_v + - b[i] * static_cast(X[index]) + c[i]; + dX[index] = static_cast(dY_scale[i]) * + static_cast(dY[index]) * gamma_v + + X_scale[i] * static_cast(X[index]) + bias[i]; } } @@ -532,6 +532,19 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, beta_grad_ptr); } } + hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeGradientFusedParamsCUDAKernel), + M, + kCUDABlockReduceNumThreads, + 0, + stream, + N, + output_grad_ptr, + input_ptr, + gamma_ptr, + static_cast(m->rstd_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr), + input_grad_ptr); } /*static*/ @@ -545,14 +558,25 @@ void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, T *beta_grad_ptr) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - LayerNorm::backward_kernel(m, - output_grad_ptr, - input_ptr, - input_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr, - stream); + if (m->output_type[0] == DT_FLOAT) { + LayerNorm::backward_kernel(m, + output_grad_ptr, + input_ptr, + input_grad_ptr, + gamma_ptr, + gamma_grad_ptr, + beta_grad_ptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + LayerNorm::backward_kernel(m, + output_grad_ptr, + input_ptr, + input_grad_ptr, + gamma_ptr, + gamma_grad_ptr, + beta_grad_ptr, + stream); + } } template void @@ -563,5 +587,13 @@ template void float const *gamma_ptr, float *gamma_grad_ptr, float *beta_grad_ptr); +template void + LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, + half const *output_grad_ptr, + half const *input_ptr, + half *input_grad_ptr, + half const *gamma_ptr, + half *gamma_grad_ptr, + half *beta_grad_ptr); }; // namespace FlexFlow diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index 44979c48fe..cdf2ed433f 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -115,54 +115,6 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { return val; } -#ifdef DEADCODE -template -__global__ void RowwiseMomentsCUDAKernel( - int64_t N, float eps, T const *X, T *mean, T *rstd) { - __shared__ float m_shared[C10_WARP_SIZE]; - __shared__ float v_shared[C10_WARP_SIZE]; - const int64_t i = blockIdx.x; - float sum1 = 0.0f; - float sum2 = 0.0f; - for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; - sum1 += static_cast(X[index]); - sum2 += static_cast(X[index]) * static_cast(X[index]); - } - sum1 = BlockReduceSum(sum1, m_shared); - sum2 = BlockReduceSum(sum2, v_shared); - if (threadIdx.x == 0) { - float const scale = float(1) / static_cast(N); - sum1 *= scale; - sum2 = max(sum2 * scale - sum1 * sum1, float(0)); - mean[i] = static_cast(sum1); - rstd[i] = static_cast(rsqrt(sum2 + eps)); - } -} - -template -__global__ void LayerNormForwardCUDAKernel(int64_t N, - T const *X, - T const *mean, - T const *rstd, - T const *gamma, - T const *beta, - T *Y) { - using T_ACC = T; - const int64_t i = blockIdx.x; 
- for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; - const T_ACC gamma_v = - gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); - const T_ACC beta_v = - beta == nullptr ? T_ACC(0) : static_cast(beta[j]); - Y[index] = (static_cast(X[index]) - static_cast(mean[i])) * - static_cast(rstd[i]) * gamma_v + - beta_v; - } -} -#endif - template __global__ void LayerNormFusedForwardKernel(int64_t N, float eps, @@ -290,6 +242,109 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, } } +/*static*/ +void LayerNorm::inference_kernel_wrapper(LayerNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // check that at most one dimension after the first is > 1. TODO(goliaro): + // support case where this condition does not hold + int non_unit_dims_encountered = 0; + for (int i = 1; i < input.domain.get_dim(); i++) { + int dim_i = input.domain.hi()[i] - input.domain.lo()[i] + 1; + if (dim_i > 1) { + non_unit_dims_encountered++; + } + } + assert(non_unit_dims_encountered <= 1); + + // allocate space for all peft tokens + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + m->input_activation = allocator->allocate_instance_untyped( + data_type_size(m->input_type[0]) * bc->num_active_peft_tokens() * + in_dim); + + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests and PEFT forward-only requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || + !bc->requestsInfo[i].peft_bwd) { + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + input.get_float_ptr() + tokens_previous_requests * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + input.get_half_ptr() + tokens_previous_requests * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + + if (m->input_type[0] == DT_FLOAT) { + LayerNorm::forward_kernel( + m, + input.get_float_ptr(), + output.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_float_ptr() : nullptr, + stream); + } else if (m->input_type[0] == DT_HALF) { + LayerNorm::forward_kernel( + m, + input.get_half_ptr(), + output.get_half_ptr(), + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? 
beta.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "unsupport datatype in layernorm"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[LayerNorm] forward time (CF) = %.9fms\n", elapsed); + // print_tensor(in_ptr, 32, "[LayerNorm:forward:input]"); + // print_tensor(out_ptr, 32, "[LayerNorm:forward:output]"); + } +} + template __global__ void ComputeInternalGradientsCUDAKernel( int64_t N, T const *dY, T const *X, T const *gamma, T *ds, T *db) { @@ -327,7 +382,7 @@ __global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, using T_ACC = T; const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; if (index < M) { - const T_ACC s = T_ACC(1) / static_cast(N); + const T_ACC s = T_ACC(1) / static_cast((int)N); const T_ACC a = (db[index] * static_cast(mean[index]) - ds[index]) * static_cast(rstd[index]) * static_cast(rstd[index]) * @@ -338,27 +393,6 @@ __global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, } } -template -__global__ void LayerNormBackwardCUDAKenrel(int64_t N, - T const *dY, - T const *X, - T const *gamma, - T const *a, - T const *b, - T const *c, - T *dX) { - using T_ACC = T; - const int64_t i = blockIdx.x; - for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; - const T_ACC gamma_v = - gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); - dX[index] = - static_cast(a[i]) * static_cast(dY[index]) * gamma_v + - b[i] * static_cast(X[index]) + c[i]; - } -} - template __global__ void GammaBetaBackwardSimpleCUDAKernel(int64_t M, int64_t N, @@ -618,6 +652,59 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, } } +/*static*/ +template +void LayerNorm::peft_bwd_kernel(LayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T const *gamma_ptr, + cudaStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + static_cast(m->input_activation), + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); + const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; + ComputeGradientFusedParamsCUDAKernel + <<>>(M, + N, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + static_cast(m->input_activation), + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + N); +} + +/*static*/ +template +void LayerNorm::peft_bwd_kernel_wrapper(LayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T const *gamma_ptr) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + LayerNorm::peft_bwd_kernel( + m, output_grad_ptr, input_grad_ptr, gamma_ptr, stream); +} + /*static*/ template void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, @@ -629,26 +716,14 @@ void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, T *beta_grad_ptr) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - if (m->output_type[0] == DT_FLOAT) { - LayerNorm::backward_kernel(m, - output_grad_ptr, - 
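// A CPU sketch (an assumption for exposition, not the CUDA path itself) of the
// input gradient that peft_bwd_kernel produces for one row of length N, using
// the saved forward statistics mu (mean) and rstd. The fused kernels above
// compute the same quantity through the ds/db sums and the per-row
// coefficients of ComputeGradientFusedParamsCUDAKernel:
//   dx_i = rstd * (g_i - mean_j(g_j) - (x_i - mu) * rstd^2 * mean_j(g_j * (x_j - mu)))
//   with g_j = dy_j * gamma_j (gamma_j = 1 without elementwise affine).
#include <cstddef>
void layer_norm_input_grad_reference(float const *dy,
                                     float const *x,
                                     float const *gamma, // may be nullptr
                                     float mu,
                                     float rstd,
                                     float *dx,
                                     size_t N) {
  float sum_g = 0.0f, sum_gx = 0.0f;
  for (size_t j = 0; j < N; j++) {
    float const g = dy[j] * (gamma ? gamma[j] : 1.0f);
    sum_g += g;
    sum_gx += g * (x[j] - mu);
  }
  float const mean_g = sum_g / N;
  float const mean_gx = sum_gx / N;
  for (size_t i = 0; i < N; i++) {
    float const g = dy[i] * (gamma ? gamma[i] : 1.0f);
    dx[i] = rstd * (g - mean_g - (x[i] - mu) * rstd * rstd * mean_gx);
  }
}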
input_ptr, - input_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr, - stream); - } - // }else if(m->output_type[0] == DT_HALF){ - // LayerNorm::backward_kernel(m, - // output_grad_ptr, - // input_ptr, - // input_grad_ptr, - // gamma_ptr, - // gamma_grad_ptr, - // beta_grad_ptr, - // stream); - // } + LayerNorm::backward_kernel(m, + output_grad_ptr, + input_ptr, + input_grad_ptr, + gamma_ptr, + gamma_grad_ptr, + beta_grad_ptr, + stream); } template void @@ -659,5 +734,24 @@ template void float const *gamma_ptr, float *gamma_grad_ptr, float *beta_grad_ptr); +template void + LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, + half const *output_grad_ptr, + half const *input_ptr, + half *input_grad_ptr, + half const *gamma_ptr, + half *gamma_grad_ptr, + half *beta_grad_ptr); + +template void + LayerNorm::peft_bwd_kernel_wrapper(LayerNormMeta const *m, + float const *output_grad_ptr, + float *input_grad_ptr, + float const *gamma_ptr); +template void + LayerNorm::peft_bwd_kernel_wrapper(LayerNormMeta const *m, + half const *output_grad_ptr, + half *input_grad_ptr, + half const *gamma_ptr); }; // namespace FlexFlow diff --git a/src/ops/linear.cc b/src/ops/linear.cc index ccc997b8e4..05529a46ec 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -505,7 +505,7 @@ OpMeta *Linear::init_task_with_dim(Task const *task, m->add_bias_only_once = linear->add_bias_only_once; m->profiling = linear->profiling; m->inference_debugging = linear->inference_debugging; - m->trainableInputs[0] = linear->trainableInputs[0]; + m->trainable_inputs[0] = linear->trainable_inputs[0]; m->weight_ptr_type = m->input_type[0]; m->quantization_type = linear->quantization_type; m->offload = linear->offload; diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 17ab2d659b..be1015e065 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -469,6 +469,7 @@ bool operator==(LoraLinearParams const &lhs, LoraLinearParams const &rhs) { void LoraLinear::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); + sez.serialize(this->layer_guid.model_id); } /* static */ @@ -478,10 +479,11 @@ Node LoraLinear::deserialize(FFModel &ff, ParallelTensor inputs[], int num_inputs) { assert(num_inputs == 2); - size_t id, transformer_layer_id; + size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); dez.deserialize(transformer_layer_id); - LayerID layer_guid(id, transformer_layer_id); + dez.deserialize(deserialized_model_id); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); LoraLinearParams params; params.layer_guid = layer_guid; @@ -514,6 +516,8 @@ size_t hash::operator()( FlexFlow::LoraLinearParams const ¶ms) const { size_t key = 0; hash_combine(key, params.layer_guid.id); + hash_combine(key, params.layer_guid.transformer_layer_id); + hash_combine(key, params.layer_guid.model_id); return key; } }; // namespace std diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index badca4010e..32b9146f90 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -105,12 +105,10 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) { os << "Max sequence length: " << bc.max_sequence_length() << std::endl; // Current values os << "Number of active tokens: " << bc.num_active_tokens() << std::endl; - os << "Number of inference tokens: " << bc.num_active_infr_tokens() << std::endl; + os << "Number of inference tokens: " << 
bc.num_active_infr_tokens() + << std::endl; os << "Number of peft tokens: " << bc.num_active_peft_tokens() << std::endl; os << "Number of requests: " << bc.num_active_requests() << std::endl; - // PEFT values - os << "PEFT Model ID: " << bc.peft_model_id << std::endl; - os << "PEFT bwd: " << bc.peft_bwd << std::endl; // Per-request info os << "Per-request info:\n"; @@ -122,6 +120,9 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) { os << " Number of tokens in batch: " << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; + // PEFT values + os << "PEFT Model ID: " << bc.requestsInfo[i].peft_model_id << std::endl; + os << "PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; os << " Max sequence length: " << bc.requestsInfo[i].max_sequence_length << std::endl; os << " Request completed: " << bc.request_completed[i] << std::endl; diff --git a/src/runtime/beam_search_batch_config.cc b/src/runtime/beam_search_batch_config.cc index 811ef00ba2..ee89450eca 100644 --- a/src/runtime/beam_search_batch_config.cc +++ b/src/runtime/beam_search_batch_config.cc @@ -131,6 +131,9 @@ std::ostream &operator<<(std::ostream &os, BeamSearchBatchConfig const &bc) { os << " Number of tokens in batch: " << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; + // PEFT values + os << "PEFT Model ID: " << bc.requestsInfo[i].peft_model_id << std::endl; + os << "PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; os << " Max sequence length: " << bc.requestsInfo[i].max_sequence_length << std::endl; os << " Request completed: " << bc.request_completed[i] << std::endl; diff --git a/src/runtime/fftype.cc b/src/runtime/fftype.cc index e8c3d49a6a..8213726e8a 100644 --- a/src/runtime/fftype.cc +++ b/src/runtime/fftype.cc @@ -46,4 +46,13 @@ bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs) { return lhs.id == rhs.id; } +std::ostream &operator<<(std::ostream &os, PEFTModelID const &peft_model_id) { + if (peft_model_id == PEFTModelID::NO_ID) { + os << "NO_ID"; + } else { + os << peft_model_id.id; + } + return os; +} + }; // namespace FlexFlow diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 476485414b..c23eb6c1d9 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -5396,6 +5396,21 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(LAYERNORM_PEFT_BWD_TASK_ID, + "layernorm_peft_bwd_task"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "peft_bwd_task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } { TaskVariantRegistrar registrar(LAYERNORM_BWD_TASK_ID, "layernorm_bwd_task"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); diff --git a/src/runtime/tree_verify_batch_config.cc b/src/runtime/tree_verify_batch_config.cc index cb68ecc5f1..666a76790c 100644 --- a/src/runtime/tree_verify_batch_config.cc +++ b/src/runtime/tree_verify_batch_config.cc @@ -52,6 +52,9 @@ std::ostream &operator<<(std::ostream &os, TreeVerifyBatchConfig const &bc) { os << " Number of tokens in batch: " << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; + // PEFT values + os << "PEFT Model ID: " << 
bc.requestsInfo[i].peft_model_id << std::endl; + os << "PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; os << " Max sequence length: " << bc.requestsInfo[i].max_sequence_length << std::endl; os << " Request completed: " << bc.request_completed[i] << std::endl; From 207b127b38970c798aadfcdb2bbbb737f460a2a0 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 11 Oct 2023 01:42:26 -0400 Subject: [PATCH 014/198] cleanup --- src/ops/layer_norm.cc | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 784e40c598..6409019dbe 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -706,24 +706,18 @@ void LayerNorm::peft_bwd_task(Task const *task, Runtime *runtime) { LayerNormMeta const *m = *((LayerNormMeta **)task->local_args); assert(task->regions.size() == regions.size()); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - // GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - // m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, - // runtime); GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorR gamma; GenericTensorAccessorW gamma_grad, beta_grad; + Domain out_grad_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - // Domain in_domain = runtime->get_index_space_domain( - // ctx, task->regions[1].region.get_index_space()); Domain in_grad_domain = runtime->get_index_space_domain( ctx, task->regions[1].region.get_index_space()); - // assert(in_domain == out_grad_domain); - // assert(in_domain.get_volume() == - // m->effective_num_elements * m->effective_batch_size); if (m->elementwise_affine) { assert(m->use_bias == (regions.size() == 3)); @@ -743,13 +737,11 @@ void LayerNorm::peft_bwd_task(Task const *task, if (m->output_type[0] == DT_FLOAT) { LayerNorm::peft_bwd_kernel_wrapper(m, output_grad.get_float_ptr(), - // input.get_float_ptr(), input_grad.get_float_ptr(), gamma.get_float_ptr()); } else { LayerNorm::peft_bwd_kernel_wrapper(m, output_grad.get_half_ptr(), - // input.get_half_ptr(), input_grad.get_half_ptr(), gamma.get_half_ptr()); } From 231e244e771c88f0447e69525e413004d07340c0 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 11 Oct 2023 04:13:59 -0400 Subject: [PATCH 015/198] rms backward --- include/flexflow/model.h | 2 + .../flexflow/ops/kernels/rms_norm_kernels.h | 7 + include/flexflow/ops/rms_norm.h | 8 + src/ops/kernels/rms_norm_kernels.cu | 175 ++++++++++++++---- src/ops/layer_norm.cc | 4 +- src/ops/rms_norm.cc | 94 +++++++++- src/runtime/model.cc | 31 +++- 7 files changed, 275 insertions(+), 46 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index e2530bcc90..8e0a264e8f 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -166,6 +166,8 @@ enum TaskIDs { RMSNORM_INIT_TASK_ID, RMSNORM_FWD_TASK_ID, RMSNORM_INF_TASK_ID, + RMSNORM_BWD_TASK_ID, + RMSNORM_PEFT_BWD_TASK_ID, RESIDUAL_RMSNORM_INIT_TASK_ID, RESIDUAL_RMSNORM_INF_TASK_ID, BEAM_TOPK_INIT_TASK_ID, diff --git a/include/flexflow/ops/kernels/rms_norm_kernels.h b/include/flexflow/ops/kernels/rms_norm_kernels.h index 35c5aa69fa..5844880b4b 100644 --- a/include/flexflow/ops/kernels/rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/rms_norm_kernels.h @@ -30,6 +30,7 @@ class RMSNormMeta : public 
OpMeta { float eps; void *rms_ptr; void *norm_ptr; + void *c2_ptr; float alpha; float beta; @@ -46,6 +47,12 @@ void forward_kernel_wrapper(RMSNormMeta const *m, GenericTensorAccessorR const &input, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &output); +void backward_kernel_wrapper(RMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &weight_grad); } // namespace RMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/include/flexflow/ops/rms_norm.h b/include/flexflow/ops/rms_norm.h index 1dc940ebd3..c22caaf69b 100644 --- a/include/flexflow/ops/rms_norm.h +++ b/include/flexflow/ops/rms_norm.h @@ -73,6 +73,14 @@ class RMSNorm : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index 7c9f4a9f98..2ec503cfd1 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -115,47 +115,6 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { return val; } -#ifdef DEADCODE -template -__global__ void - RowwiseRootMeanSquareKernel(long long N, float eps, T const *X, T *rms) { - __shared__ float v_shared[C10_WARP_SIZE]; - long long const i = blockIdx.x; - float sum = 0.0f; - for (long long j = threadIdx.x; j < N; j += blockDim.x) { - long long const index = i * N + j; - sum += (static_cast(X[index]) * static_cast(X[index])); - } - sum = BlockReduceSum(sum, - v_shared); // use BlockReduceSum() to sum X_ij^2 - - if (threadIdx.x == 0) { - rms[i] = static_cast(rsqrt((sum / static_cast(N)) + eps)); - } -} - -template -__global__ void NormKernel(int64_t N, T const *X, T const *rstd, T *Y) { - using T_ACC = T; - const int64_t i = blockIdx.x; - for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; - Y[index] = static_cast(X[index]) * static_cast(rstd[i]); - } -} - -template -__global__ void elewise_apply_weights(int64_t batch_size, - int64_t in_dim, - T const *norm, - T const *weights, - T *output) { - CUDA_KERNEL_LOOP(i, batch_size * in_dim) { - output[i] = norm[i] * weights[i % in_dim]; - } -} -#endif - template __global__ void RMSNormFusedForwardKernel(int64_t N, float eps, @@ -261,6 +220,140 @@ void forward_kernel_wrapper(RMSNormMeta const *m, } } +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T const *rrms, T *c2) { + __shared__ T ds_storage[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + T ds = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + int const index = i * N + j; + ds += dY[index] * X[index] * gamma[j]; + } + ds = BlockReduceSum(ds, ds_storage); + if (threadIdx.x == 0) { + c2[i] = -ds * (rrms[i] * rrms[i] * rrms[i]) / static_cast((int)N); + } +} + +template +__global__ void RMSNormBackwardCUDAKernel(int64_t N, + T const *dY, + T const *X, + T const *gamma, + T const *c1, + T const *c2, + T *dX) { + const int64_t i = blockIdx.x; + 
for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + dX[index] = c1[i] * dY[index] * gamma[j] + c2[i] * X[index]; + } +} + +// Assume the batch size will not be very large, direct implementation is the +// most efficient one. +template +__global__ void GammaBackwardCUDAKernel( + int64_t M, int64_t N, T const *dY, T const *X, T const *rrms, T *dg) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T sum1 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dY[index] * X[index] * rrms[i]; + } + dg[j] = sum1; + } +} + +template +void backward_kernel(RMSNormMeta const *m, + T const *output_grad_ptr, + T const *input_ptr, + T *input_grad_ptr, + T const *weight_ptr, + T *weight_grad_ptr, + cudaStream_t stream) { + const int64_t M = m->batch_size; + const int64_t N = m->num_elements; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->c2_ptr)); + + RMSNormBackwardCUDAKernel + <<>>(N, + output_grad_ptr, + input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->c2_ptr), + input_grad_ptr); + const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; + GammaBackwardCUDAKernel + <<>>(M, + N, + output_grad_ptr, + input_ptr, + static_cast(m->rms_ptr), + weight_grad_ptr); +} + +void backward_kernel_wrapper(RMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &weight_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + assert(input_grad.data_type == input.data_type); + assert(weight_grad.data_type == weight.data_type); + assert(output_grad.data_type == input.data_type); + assert(weight.data_type == output_grad.data_type); + + if (output_grad.data_type == DT_HALF) { + backward_kernel(m, + output_grad.get_half_ptr(), + input.get_half_ptr(), + input_grad.get_half_ptr(), + weight.get_half_ptr(), + weight_grad.get_half_ptr(), + stream); + } else if (output_grad.data_type == DT_FLOAT) { + backward_kernel(m, + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), + weight.get_float_ptr(), + weight_grad.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[RMSNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + } // namespace RMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 6409019dbe..b5ee66fdba 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -706,14 +706,14 @@ void LayerNorm::peft_bwd_task(Task const *task, Runtime *runtime) { LayerNormMeta const *m = *((LayerNormMeta **)task->local_args); assert(task->regions.size() == regions.size()); - + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( m->output_type[0], regions[1], 
task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorR gamma; GenericTensorAccessorW gamma_grad, beta_grad; - + Domain out_grad_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); Domain in_grad_domain = runtime->get_index_space_domain( diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index 2a34f83be2..83648b49cf 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -431,6 +431,98 @@ void RMSNorm::inference_task(Task const *task, } } +void RMSNorm::backward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_backward(ff, argmap); + IndexLauncher launcher(RMSNORM_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + // regions[0](I): output_grad + launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + // regions[1](I): input + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(1, FID_DATA); + // regions[2](I/O): input_grad + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(2, FID_DATA); + // regions[3](I): gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(3, FID_DATA); + // regions[4](I/O): gamma_grad + launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[0]->region_grad)); + launcher.add_field(4, FID_DATA); + + runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): output_grad + regions[1](I): input + regions[2](I/O): input_grad + regions[3](I): weight + regions[4](I/O): weight_grad +*/ +void RMSNorm::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == 5); + assert(regions.size() == 5); + RMSNormMeta const *m = *((RMSNormMeta **)task->local_args); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); + GenericTensorAccessorW weight_grad = helperGetGenericTensorAccessorRW( + m->weight_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime); + backward_kernel_wrapper( + m, output_grad, input, input_grad, weight, weight_grad); +} + +/* + regions[0](I): output_grad + regions[1](I): input + regions[2](I/O): input_grad + regions[3](I): weight + regions[4](I/O): weight_grad +*/ +void RMSNorm::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) {} + void RMSNorm::serialize(Legion::Serializer &sez) const { 
sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); @@ -470,8 +562,6 @@ Op *RMSNorm::materialize(FFModel &ff, return new RMSNorm(ff, params, inputs[0], true, this->name); } -void RMSNorm::backward(FFModel const &ff) {} - bool RMSNorm::measure_operator_cost(Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index c23eb6c1d9..931173e5f3 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -5363,7 +5363,36 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } - // rms norm task + { + TaskVariantRegistrar registrar(RMSNORM_BWD_TASK_ID, "RMS Norm Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "RMS Norm Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(RMSNORM_PEFT_BWD_TASK_ID, + "RMS Norm PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "RMS Norm PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + // residual rms norm task { TaskVariantRegistrar registrar(RESIDUAL_RMSNORM_INIT_TASK_ID, "Residual RMS Norm Init"); From 416c322c48fe32e1d889462effc69bb8d50f6272 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 11 Oct 2023 05:03:29 -0400 Subject: [PATCH 016/198] rms peft --- .../flexflow/ops/kernels/rms_norm_kernels.h | 12 ++ include/flexflow/ops/rms_norm.h | 5 + src/ops/kernels/rms_norm_kernels.cu | 165 ++++++++++++++++++ src/ops/rms_norm.cc | 64 ++++++- 4 files changed, 240 insertions(+), 6 deletions(-) diff --git a/include/flexflow/ops/kernels/rms_norm_kernels.h b/include/flexflow/ops/kernels/rms_norm_kernels.h index 5844880b4b..72176f0383 100644 --- a/include/flexflow/ops/kernels/rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/rms_norm_kernels.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_OPS_KERNELS_RMSNORM_KERNELS_H #include "flexflow/accessor.h" +#include "flexflow/batch_config.h" #include "flexflow/device.h" #include "flexflow/fftype.h" #include "flexflow/op_meta.h" @@ -39,6 +40,8 @@ class RMSNormMeta : public OpMeta { int batch_size; int num_elements; Realm::RegionInstance reserveInst; + // PEFT related fields + void *input_activation; }; namespace Kernels { @@ -47,12 +50,21 @@ void forward_kernel_wrapper(RMSNormMeta const *m, GenericTensorAccessorR const &input, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &output); +void inference_kernel_wrapper(RMSNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &output); void backward_kernel_wrapper(RMSNormMeta const *m, GenericTensorAccessorR const &output_grad, GenericTensorAccessorR const &input, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &weight_grad); +void peft_bwd_kernel_wrapper(RMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight); } // namespace RMSNorm } // 
namespace Kernels } // namespace FlexFlow diff --git a/include/flexflow/ops/rms_norm.h b/include/flexflow/ops/rms_norm.h index c22caaf69b..384404d8a0 100644 --- a/include/flexflow/ops/rms_norm.h +++ b/include/flexflow/ops/rms_norm.h @@ -34,6 +34,11 @@ class RMSNorm : public Op { void init(FFModel const &) override; void forward(FFModel const &) override; void backward(FFModel const &) override; + Legion::FutureMap peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) override; void init_inference(FFModel const &, std::vector const &, std::vector const &, diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index 2ec503cfd1..ffb92613a5 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -220,6 +220,103 @@ void forward_kernel_wrapper(RMSNormMeta const *m, } } +void inference_kernel_wrapper(RMSNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &output) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + assert(output.data_type == input.data_type); + assert(weight.data_type == output.data_type); + + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // check that at most one dimension after the first is > 1. TODO(goliaro): + // support case where this condition does not hold + int non_unit_dims_encountered = 0; + for (int i = 1; i < input.domain.get_dim(); i++) { + int dim_i = input.domain.hi()[i] - input.domain.lo()[i] + 1; + if (dim_i > 1) { + non_unit_dims_encountered++; + } + } + assert(non_unit_dims_encountered <= 1); + + // allocate space for all peft tokens + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + m->input_activation = allocator->allocate_instance_untyped( + data_type_size(input.data_type) * bc->num_active_peft_tokens() * + in_dim); + + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests and PEFT forward-only requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || + !bc->requestsInfo[i].peft_bwd) { + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + + if (input.data_type == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + input.get_float_ptr() + tokens_previous_requests * in_dim, + data_type_size(input.data_type) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (input.data_type == DT_HALF) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + input.get_half_ptr() + tokens_previous_requests * in_dim, + data_type_size(input.data_type) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + + if (output.data_type == DT_HALF) { + forward_kernel(m, + input.get_half_ptr(), + weight.get_half_ptr(), + output.get_half_ptr(), + stream); + } else if (output.data_type == DT_FLOAT) { + forward_kernel(m, + input.get_float_ptr(), + weight.get_float_ptr(), + 
output.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[RMSNorm] forward time (CF) = %.2fms\n", elapsed); + } +} + template __global__ void ComputeInternalGradientsCUDAKernel( int64_t N, T const *dY, T const *X, T const *gamma, T const *rrms, T *c2) { @@ -354,6 +451,74 @@ void backward_kernel_wrapper(RMSNormMeta const *m, } } +template +void peft_bwd_kernel(RMSNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T const *weight_ptr, + cudaStream_t stream) { + const int64_t M = m->batch_size; + const int64_t N = m->num_elements; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + static_cast(m->input_activation), + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->c2_ptr)); + RMSNormBackwardCUDAKernel + <<>>(N, + output_grad_ptr, + static_cast(m->input_activation), + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->c2_ptr), + input_grad_ptr); +} + +void peft_bwd_kernel_wrapper(RMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + assert(input_grad.data_type == output_grad.data_type); + assert(output_grad.data_type == weight.data_type); + + if (output_grad.data_type == DT_HALF) { + peft_bwd_kernel(m, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + weight.get_half_ptr(), + stream); + } else if (output_grad.data_type == DT_FLOAT) { + peft_bwd_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + weight.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[RMSNorm] peft_bwd time (CF) = %.2fms\n", elapsed); + } +} + } // namespace RMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index 83648b49cf..332472e8e4 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -422,7 +422,7 @@ void RMSNorm::inference_task(Task const *task, m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - forward_kernel_wrapper(m, input, weight, output); + inference_kernel_wrapper(m, bc, input, weight, output); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; @@ -511,17 +511,69 @@ void RMSNorm::backward_task(Task const *task, m, output_grad, input, input_grad, weight, weight_grad); } +Legion::FutureMap + RMSNorm::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + 
set_argumentmap_for_backward(ff, argmap); + IndexLauncher launcher(RMSNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + // regions[0](I): output_grad + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(0, FID_DATA); + // regions[1](I/O): input_grad + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(2, FID_DATA); + // regions[2](I): weight + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(3, FID_DATA); + + return runtime->execute_index_space(ctx, launcher); +} + /* regions[0](I): output_grad - regions[1](I): input - regions[2](I/O): input_grad - regions[3](I): weight - regions[4](I/O): weight_grad + regions[1](I/O): input_grad + regions[2](I): weight */ void RMSNorm::peft_bwd_task(Task const *task, std::vector const ®ions, Context ctx, - Runtime *runtime) {} + Runtime *runtime) { + assert(task->regions.size() == 3); + assert(regions.size() == 3); + RMSNormMeta const *m = *((RMSNormMeta **)task->local_args); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); + peft_bwd_kernel_wrapper(m, output_grad, input_grad, weight); +} void RMSNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); From f72067a4561a769960952c980b9deb1d46684fa6 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 11 Oct 2023 12:15:47 -0400 Subject: [PATCH 017/198] add LoraLinearConfig --- include/flexflow/ffconst.h | 9 +++- include/flexflow/model.h | 5 +- include/flexflow/ops/lora_linear.h | 3 +- include/flexflow/ops/lora_linear_params.h | 17 +++++++ inference/file_loader.cc | 2 +- inference/incr_decoding/incr_decoding.cc | 6 +-- inference/models/llama.cc | 2 +- src/ops/fused.cu | 3 +- src/ops/lora_linear.cc | 39 +++++++++++----- src/ops/lora_linear_params.cc | 20 ++++++++ src/runtime/ffconst_utils.cc | 6 ++- src/runtime/graph.cc | 3 +- src/runtime/model.cc | 3 +- src/runtime/request_manager.cc | 57 +++++++++++++++++++---- 14 files changed, 141 insertions(+), 34 deletions(-) create mode 100644 src/ops/lora_linear_params.cc diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index efc37ce78d..6fe52e6892 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -46,6 +46,12 @@ enum LossType { LOSS_IDENTITY = 54, }; +enum OptimizerType { + OPTIMIZER_TYPE_NONE = 60, + OPTIMIZER_TYPE_SGD = 61, + OPTIMIZER_TYPE_ADAM = 62, +}; + enum CompMode { COMP_MODE_TRAINING = 70, COMP_MODE_INFERENCE = 71, @@ -173,7 +179,8 @@ enum OperatorType { OP_TREE_INC_MULTIHEAD_SELF_ATTENTION, OP_SAMPLING, // PEFT Ops - OP_LORA_LINEAR, + OP_LORA_MLP_FIRST, + OP_LORA_MLP_SECOND, // Parallel Ops OP_REPARTITION, OP_COMBINE, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 8e0a264e8f..8d6dd87e91 100644 --- 
a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -813,6 +813,7 @@ class FFModel { // ======================================== void lora_linear(Tensor const input, Tensor const output, + OperatorType _type, char const *name = nullptr); // ======================================== // Inference APIs @@ -821,7 +822,9 @@ class FFModel { int max_seq_length, PEFTModelID peft_model_id = PEFTModelID::NO_ID); - PEFTModelID register_peft_model(std::map config); + PEFTModelID register_peft_model( + LoraLinearConfig const mlp_first = LoraLinearConfig::DefaultConfig, + LoraLinearConfig const mlp_second = LoraLinearConfig::DefaultConfig); Tensor create_tensor_legion_ordering(int num_dim, int const dims[], diff --git a/include/flexflow/ops/lora_linear.h b/include/flexflow/ops/lora_linear.h index 23dc8ec496..b9aabdd1aa 100644 --- a/include/flexflow/ops/lora_linear.h +++ b/include/flexflow/ops/lora_linear.h @@ -19,6 +19,7 @@ class LoraLinear : public Op { LoraLinear(FFModel &model, LayerID const &layer_guid, + OperatorType type, ParallelTensor const input, ParallelTensor const output, char const *name = nullptr); @@ -42,7 +43,7 @@ class LoraLinear : public Op { std::vector const &batch_inputs, std::vector const &batch_outputs, PEFTModelID const &model_id, - int rank); + LoraLinearConfig const lora_config); Legion::FutureMap inference(FFModel const &, BatchConfigFuture const &, std::vector const &, diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h index 9eaee3000b..46ee4ac6b7 100644 --- a/include/flexflow/ops/lora_linear_params.h +++ b/include/flexflow/ops/lora_linear_params.h @@ -9,9 +9,26 @@ namespace FlexFlow { +class LoraLinearConfig { +public: + static const LoraLinearConfig DefaultConfig; + LoraLinearConfig(); + LoraLinearConfig(int rank, + OptimizerType type = OPTIMIZER_TYPE_SGD, + float learning_rate = 1e-4); + friend bool operator==(LoraLinearConfig const &lhs, + LoraLinearConfig const &rhs); + +public: + int rank; + OptimizerType optimizer_type; + float learning_rate; +}; + class LoraLinearParams { public: LayerID layer_guid; + OperatorType type; bool is_valid(std::pair const &input_shape) const; diff --git a/inference/file_loader.cc b/inference/file_loader.cc index f11df920e3..20c14f8f4f 100644 --- a/inference/file_loader.cc +++ b/inference/file_loader.cc @@ -765,7 +765,7 @@ void FileDataLoader::load_weights(FFModel *ff, bool use_full_precision) { continue; } // TODO: currently skip Lora layers - if (l->op_type == OP_LORA_LINEAR) { + if (l->op_type == OP_LORA_MLP_FIRST || l->op_type == OP_LORA_MLP_SECOND) { continue; } switch (weight->data_type) { diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 277d86c9cc..461d71b23a 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -258,9 +258,9 @@ void FlexFlow::top_level_task(Task const *task, } // Register PEFT layer - std::map peft_config; - peft_config["lora_mlp_linear_second"] = 4; - PEFTModelID peft_model_id = model.register_peft_model(peft_config); + LoraLinearConfig mlp_second(4 /*rank*/); + PEFTModelID peft_model_id = model.register_peft_model( + LoraLinearConfig::DefaultConfig /*mlp_first*/, mlp_second /*mlp_second*/); int total_num_requests = 0; { diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 2fe5642507..20e1f38ce9 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -220,7 +220,7 @@ void LLAMA::create_llama_model(FFModel &ff, 
std::string("layers_" + std::to_string(i) + "_feed_forward_w2") .c_str()); // Low-Rank Adapter (LoRA) for the second linear layer - ff.lora_linear(multi, w2, "lora_mlp_linear_second"); + ff.lora_linear(multi, w2, OP_LORA_MLP_SECOND); } // final normalization and linear Tensor final_rms_norm_output[2] = {nullptr, nullptr}; diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 9aa4291453..f404e305e6 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -702,7 +702,8 @@ __host__ void batch_size); break; } - case OP_LORA_LINEAR: { + case OP_LORA_MLP_FIRST: + case OP_LORA_MLP_SECOND: { assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_outputs[op] == 1); Domain input_domain = my_input_accessor[0].domain; diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index be1015e065..2e356f7531 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -33,11 +33,12 @@ using namespace FlexFlow::Kernels::LoraLinear; void FFModel::lora_linear(Tensor const input, Tensor const output, + OperatorType op_type, char const *name) { assert(input->data_type == output->data_type); Layer *lora = nullptr; lora = new Layer(this, - OP_LORA_LINEAR, + op_type, output->data_type, name, 2 /*inputs*/, @@ -61,29 +62,40 @@ Op *LoraLinear::create_operator_from_layer( FFModel &model, Layer const *layer, std::vector const &inputs) { - return new LoraLinear( - model, layer->layer_guid, inputs[0], inputs[1], layer->name); + return new LoraLinear(model, + layer->layer_guid, + layer->op_type, + inputs[0], + inputs[1], + layer->name); } LoraLinear::LoraLinear(FFModel &model, LoraLinear const &other, ParallelTensor const input, ParallelTensor const output) - : LoraLinear(model, other.layer_guid, input, output, other.name) {} + : LoraLinear( + model, other.layer_guid, other.op_type, input, output, other.name) {} LoraLinear::LoraLinear(FFModel &model, Params const ¶ms, Input const &inputs, char const *name) - : LoraLinear(model, params.layer_guid, inputs.first, inputs.second, name) {} + : LoraLinear(model, + params.layer_guid, + params.type, + inputs.first, + inputs.second, + name) {} LoraLinear::LoraLinear(FFModel &model, LayerID const &_layer_guid, + OperatorType _op_type, ParallelTensor const _input, ParallelTensor const _output, char const *name) : Op(model, - OP_LORA_LINEAR, + _op_type, _output->data_type, name, 2 /*inputs*/, @@ -205,7 +217,7 @@ OpMeta *LoraLinear::init_task(Task const *task, struct LoraLinearRegisterInfo { LoraLinear const *lora; PEFTModelID model_id; - int rank; + LoraLinearConfig lora_config; }; void LoraLinear::register_peft_model( @@ -213,7 +225,7 @@ void LoraLinear::register_peft_model( std::vector const &batch_inputs, std::vector const &batch_outputs, PEFTModelID const &model_id, - int rank) { + LoraLinearConfig const lora_config) { assert(check_output_input_weight_same_parallel_is()); assert(batch_inputs.size() == 2); assert(batch_outputs.size() == 1); @@ -234,7 +246,7 @@ void LoraLinear::register_peft_model( LoraLinearRegisterInfo info; info.lora = this; info.model_id = model_id; - info.rank = rank; + info.lora_config = lora_config; IndexLauncher launcher(LORA_LINEAR_REG_TASK_ID, parallel_is, TaskArgument(&info, sizeof(LoraLinearRegisterInfo)), @@ -255,7 +267,7 @@ void LoraLinear::register_model_task(Task const *task, static_cast(task->args); LoraLinearMeta *m = *((LoraLinearMeta **)task->local_args); LoraLinear const *lora = info->lora; - int rank = info->rank; + int rank = info->lora_config.rank; int num_dims = lora->inputs[0]->num_dims; int in_dim = 
lora->inputs[0]->dims[0].size / lora->inputs[0]->dims[0].degree; int out_dim = lora->inputs[1]->dims[0].size / lora->inputs[1]->dims[0].degree; @@ -463,13 +475,14 @@ bool LoraLinear::measure_operator_cost(Simulator *sim, } bool operator==(LoraLinearParams const &lhs, LoraLinearParams const &rhs) { - return lhs.layer_guid == rhs.layer_guid; + return lhs.layer_guid == rhs.layer_guid && lhs.type == rhs.type; } void LoraLinear::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); sez.serialize(this->layer_guid.model_id); + sez.serialize(this->op_type); } /* static */ @@ -480,13 +493,16 @@ Node LoraLinear::deserialize(FFModel &ff, int num_inputs) { assert(num_inputs == 2); size_t id, transformer_layer_id, deserialized_model_id; + OperatorType op_type; dez.deserialize(id); dez.deserialize(transformer_layer_id); dez.deserialize(deserialized_model_id); + dez.deserialize(op_type); LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); LoraLinearParams params; params.layer_guid = layer_guid; + params.type = op_type; return ff.get_or_create_node({inputs[0], inputs[1]}, params); } @@ -500,6 +516,7 @@ Op *LoraLinear::materialize(FFModel &ff, LoraLinearParams LoraLinear::get_params() const { LoraLinearParams params; params.layer_guid = this->layer_guid; + params.type = this->op_type; return params; } diff --git a/src/ops/lora_linear_params.cc b/src/ops/lora_linear_params.cc new file mode 100644 index 0000000000..80e7c6d64e --- /dev/null +++ b/src/ops/lora_linear_params.cc @@ -0,0 +1,20 @@ +#include "flexflow/ops/lora_linear_params.h" + +namespace FlexFlow { +const LoraLinearConfig LoraLinearConfig::DefaultConfig = LoraLinearConfig(); + +LoraLinearConfig::LoraLinearConfig() + : rank(0), optimizer_type(OPTIMIZER_TYPE_NONE), learning_rate(0.0f) {} + +LoraLinearConfig::LoraLinearConfig(int _rank, OptimizerType _type, float _lr) + : rank(_rank), optimizer_type(_type), learning_rate(_lr) {} + +bool operator==(LoraLinearConfig const &lhs, LoraLinearConfig const &rhs) { + if (lhs.rank == rhs.rank && lhs.optimizer_type == rhs.optimizer_type && + lhs.learning_rate == rhs.learning_rate) { + return true; + } + return false; +} + +}; // namespace FlexFlow diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index 47abcacd6a..3ee1ee62df 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -189,8 +189,10 @@ std::string get_operator_type_name(OperatorType type) { case OP_ARGMAX: return "ArgMax"; // PEFT Ops - case OP_LORA_LINEAR: - return "LoraLinear"; + case OP_LORA_MLP_FIRST: + return "Lora MLP First Layer"; + case OP_LORA_MLP_SECOND: + return "Lora MLP Second Layer"; // Parallel Ops case OP_REPARTITION: return "Repartition"; diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 5ca09db84b..b58990d32e 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -2730,7 +2730,8 @@ void FFModel::deserialize_graph_optimal_view( node = Linear::deserialize(*this, dez, inputs, num_inputs); break; } - case OP_LORA_LINEAR: { + case OP_LORA_MLP_FIRST: + case OP_LORA_MLP_SECOND: { node = LoraLinear::deserialize(*this, dez, inputs, num_inputs); break; } diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 931173e5f3..2bc1f30d07 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3228,7 +3228,8 @@ Op *FFModel::create_operator_from_layer( return op; } // PEFT layers - case OP_LORA_LINEAR: { + case OP_LORA_MLP_FIRST: + case OP_LORA_MLP_SECOND: { Op *op = 
LoraLinear::create_operator_from_layer(*this, layer, inputs); operators.push_back(op); return op; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 05eb3bb554..1f311b3b56 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -1829,17 +1829,28 @@ std::string find_layer_name_from_guid(FFModel *model, LayerID guid) { return "invalid_layer_name"; } -PEFTModelID FFModel::register_peft_model(std::map configs) { +bool is_peft_operator_type(OperatorType type) { + switch (type) { + case OP_LORA_MLP_FIRST: + case OP_LORA_MLP_SECOND: + return true; + default: + return false; + } +} + +PEFTModelID FFModel::register_peft_model(LoraLinearConfig const mlp_first, + LoraLinearConfig const mlp_second) { PEFTModelID peft_model_id(peft_model_global_guid++); InferenceManager *im = InferenceManager::get_inference_manager(); std::vector peft_operators; for (size_t op = 0; op < operators.size(); op++) { - if (operators[op]->op_type == OP_LORA_LINEAR) { + if (is_peft_operator_type(operators[op]->op_type)) { peft_operators.push_back(operators[op]); } else if (operators[op]->op_type == OP_FUSED) { FusedOp *fused = static_cast(operators[op]); for (size_t op2 = 0; op2 < fused->numOperators; op2++) { - if (fused->operators[op2]->op_type == OP_LORA_LINEAR) { + if (is_peft_operator_type(fused->operators[op2]->op_type)) { peft_operators.push_back(fused->operators[op2]); } } @@ -1849,12 +1860,37 @@ PEFTModelID FFModel::register_peft_model(std::map configs) { std::string layer_name = find_layer_name_from_guid(this, peft_operators[op]->layer_guid); switch (peft_operators[op]->op_type) { - case OP_LORA_LINEAR: { - // Remove the guid and the ``_'' char from opname: guid has 7 digits - // and ``_'' occupies 1 char - layer_name = layer_name.erase(layer_name.length() - 8); - assert(configs.find(layer_name) != configs.end()); - int rank = configs[layer_name]; + case OP_LORA_MLP_FIRST: { + if (mlp_first == LoraLinearConfig::DefaultConfig) { + // Do nothing for the default configuration + continue; + } + LoraLinear *lora = static_cast(peft_operators[op]); + // Currently assume only a single data pipeline + assert(config.data_parallelism_degree == 1); + std::vector inputs(lora->numInputs); + std::vector outputs(lora->numOutputs); + + for (int i = 0; i < lora->numInputs; i++) { + assert(im->tensor_buffer.find(lora->inputs[i]) != + im->tensor_buffer.end()); + assert(lora->inputs[i] != nullptr); + assert(lora->inputs[i]->parallel_is != IndexSpace::NO_SPACE); + assert(im->tensor_buffer[lora->inputs[i]].size() == 1); + inputs[i] = im->tensor_buffer[lora->inputs[i]][0]; + assert(inputs[i]->parallel_is != IndexSpace::NO_SPACE); + } + assert(lora->numOutputs == 1); + outputs[0] = inputs[1]; + lora->register_peft_model( + *this, inputs, outputs, peft_model_id, mlp_first); + break; + } + case OP_LORA_MLP_SECOND: { + if (mlp_second == LoraLinearConfig::DefaultConfig) { + // Do nothing for the default configuration + continue; + } LoraLinear *lora = static_cast(peft_operators[op]); // Currently assume only a single data pipeline assert(config.data_parallelism_degree == 1); @@ -1872,7 +1908,8 @@ PEFTModelID FFModel::register_peft_model(std::map configs) { } assert(lora->numOutputs == 1); outputs[0] = inputs[1]; - lora->register_peft_model(*this, inputs, outputs, peft_model_id, rank); + lora->register_peft_model( + *this, inputs, outputs, peft_model_id, mlp_second); break; } default: { From 49e5664cade618ba7b93c73466efb5a2974ebc0e Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 11 
Oct 2023 13:01:19 -0400 Subject: [PATCH 018/198] add an API for register peft request --- include/flexflow/request_manager.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index da64ac58a2..47627bc9fb 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -126,6 +126,10 @@ class RequestManager { RequestGuid register_new_request(std::vector const &prompt, int max_sequence_length, PEFTModelID peft_model_id); + RequestGuid register_new_peft_request( + std::vector> const &dataset, + int max_sequence_length, + PEFTModelID peft_model_id); bool is_request_completed(RequestGuid const &guid); BatchConfig prepare_next_batch(BatchConfig const &bc, InferenceResult const &result); From 008ffd9a180d3b82e6a0befdfd8f5d53202e766d Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sun, 15 Oct 2023 16:19:39 -0400 Subject: [PATCH 019/198] format --- src/ops/tree_inc_multihead_self_attention.cpp | 2 +- src/ops/tree_inc_multihead_self_attention.cu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 05513ea2cc..e5bec2bc07 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -212,7 +212,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_new_tokens, // num_tokens_in_branch processed_tokens_in_batch, // num_processed_tokens_in_batch - m->num_active_infr_tokens, // total_tokens_in_batch + m->num_active_infr_tokens, // total_tokens_in_batch BatchConfig::max_sequence_length(), m->hidden_size); } diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index f63f59eae2..a6c4988ac8 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -211,7 +211,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_new_tokens, // num_tokens_in_branch processed_tokens_in_batch, // num_processed_tokens_in_batch - m->num_active_infr_tokens, // total_tokens_in_batch + m->num_active_infr_tokens, // total_tokens_in_batch BatchConfig::max_sequence_length(), m->hidden_size); } From ace7e3ff6f27a286d554a7c94c24d14d6d6525b8 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Tue, 17 Oct 2023 00:22:33 -0400 Subject: [PATCH 020/198] . 
--- config/config.linux | 2 +- include/flexflow/model.h | 1 + include/flexflow/ops/fused.h | 9 + .../flexflow/ops/kernels/softmax_kernels.h | 27 +- include/flexflow/ops/layer_norm.h | 10 +- include/flexflow/ops/softmax.h | 9 + include/flexflow/request_manager.h | 1 + inference/models/llama.cc | 3 +- src/ops/fused.cc | 61 + src/ops/fused.cu | 1339 ++++++++++++----- src/ops/kernels/softmax.cu | 188 ++- src/ops/layer_norm.cc | 12 +- src/ops/layer_norm.cu | 33 +- src/ops/softmax.cc | 67 +- src/runtime/inference_manager.cc | 50 + src/runtime/model.cc | 32 +- src/runtime/request_manager.cc | 1 + 17 files changed, 1311 insertions(+), 534 deletions(-) diff --git a/config/config.linux b/config/config.linux index 3686237538..dbf3d3dd01 100755 --- a/config/config.linux +++ b/config/config.linux @@ -13,7 +13,7 @@ #INSTALL_DIR= # set build type -BUILD_TYPE=${BUILD_TYPE:-Release} +BUILD_TYPE=${BUILD_TYPE:-Debug} INFERENCE_TESTS=${INFERENCE_TESTS:-OFF} LIBTORCH_PATH=${LIBTORCH_PATH:-"$(realpath ../..)/libtorch"} diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 8d6dd87e91..faf969efb7 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -182,6 +182,7 @@ enum TaskIDs { TREE_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, MSELOSS_BWD_TASK_ID, FUSEDOP_INIT_TASK_ID, + FUSEDOP_PEFT_BWD_TASK_ID, FUSEDOP_FWD_TASK_ID, FUSEDOP_BWD_TASK_ID, FUSEDOP_INF_TASK_ID, diff --git a/include/flexflow/ops/fused.h b/include/flexflow/ops/fused.h index 87c2201c28..ffafa97915 100644 --- a/include/flexflow/ops/fused.h +++ b/include/flexflow/ops/fused.h @@ -40,6 +40,11 @@ class FusedOp : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -51,6 +56,10 @@ class FusedOp : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void forward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, diff --git a/include/flexflow/ops/kernels/softmax_kernels.h b/include/flexflow/ops/kernels/softmax_kernels.h index 8cfaf3c586..339d8ebc53 100644 --- a/include/flexflow/ops/kernels/softmax_kernels.h +++ b/include/flexflow/ops/kernels/softmax_kernels.h @@ -28,16 +28,24 @@ class SoftmaxMeta : public OpMeta { namespace Kernels { namespace Softmax { -template + void forward_kernel_wrapper(SoftmaxMeta const *m, - DT const *input_ptr, - DT *output_ptr); -template + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + void backward_kernel_wrapper(SoftmaxMeta const *m, - DT *input_grad_ptr, - DT const *output_grad_ptr, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad, size_t num_elements); +void inference_kernel_wrapper(SoftmaxMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void peft_bwd_kernel_wrapper(SoftmaxMeta const *m, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad); + namespace Internal { template void forward_kernel(SoftmaxMeta const *m, @@ -50,6 +58,13 @@ void backward_kernel(DT *input_grad_ptr, DT const *output_grad_ptr, size_t num_elements, ffStream_t stream); + +template +void 
inference_kernel(SoftmaxMeta const *m, + DT const *input_ptr, + DT *output_ptr, + ffStream_t stream); + } // namespace Internal } // namespace Softmax } // namespace Kernels diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h index 389b3e718a..d5ab51bbf8 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -118,17 +118,17 @@ class LayerNorm : public Op { T const *gamma_ptr, T *gamma_grad_ptr, T *beta_grad_ptr); + + static void peft_bwd_kernel_wrapper(LayerNormMeta const *m, + GenericTensorAccessorW const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &gamma); template static void peft_bwd_kernel(LayerNormMeta const *m, T const *output_grad_ptr, T *input_grad_ptr, T const *gamma_ptr, ffStream_t stream); - template - static void peft_bwd_kernel_wrapper(LayerNormMeta const *m, - T const *output_grad_ptr, - T *input_grad_ptr, - T const *gamma_ptr); public: bool elementwise_affine, use_bias; diff --git a/include/flexflow/ops/softmax.h b/include/flexflow/ops/softmax.h index 6fd1a434d4..5e94c5626c 100644 --- a/include/flexflow/ops/softmax.h +++ b/include/flexflow/ops/softmax.h @@ -32,6 +32,11 @@ class Softmax : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void backward(FFModel const &) override; bool get_int_parameter(PMParameter, int *) const override; void print_layer(FFModel const &model) override { @@ -57,6 +62,10 @@ class Softmax : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 47627bc9fb..5aab9781c8 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -37,6 +37,7 @@ class InferenceManager { Legion::FutureMap inference(FFModel *model, int index, BatchConfig const &bc); Legion::FutureMap inference(FFModel *model, int index, BatchConfigFuture const &bc); + void peft_bwd(FFModel *model, int index, BatchConfigFuture const &bc); void load_input_tokens_from_batch_config(BatchConfigFuture const &bc, ParallelTensor const input); void load_positions(BatchConfigFuture const &bc, diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 20e1f38ce9..72641161d1 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -257,7 +257,8 @@ void LLAMA::create_llama_model(FFModel &ff, output = ff.sampling(softmax, generation_config.topp); } else { // output = ff.arg_top_k(dense, /*k=*/1, false); - output = ff.argmax(dense, /*beam_Search*/ false); + Tensor softmax = ff.softmax(dense, -1); + output = ff.argmax(softmax, /*beam_Search*/ false); } } diff --git a/src/ops/fused.cc b/src/ops/fused.cc index 70650aef0d..8964f0063d 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -531,6 +531,67 @@ FutureMap FusedOp::inference(FFModel const &ff, return runtime->execute_index_space(ctx, launcher); } +FutureMap FusedOp::inference(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView 
const *mv) { + // Set iter_config + iter_config = ff.iter_config; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + // bc is one of BatchConfig, TreeVerifyBatchConfig, and BeamSearchBatchConfig + // so we transfer the maximum of them + // size_t batch_config_size = + // std::max(sizeof(TreeVerifyBatchConfig), sizeof(BeamSearchBatchConfig)); + IndexLauncher launcher(FUSEDOP_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + int offset = 0; + for (int i = 0; i < numInputs; i++) { + assert(inputs[i]->part != LogicalPartition::NO_PART); + assert(inputs[i]->region != LogicalRegion::NO_REGION); + launcher.add_region_requirement(RegionRequirement(batch_inputs[i]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[i]->region)); + launcher.add_field(offset + i, FID_DATA); + } + offset += numInputs; + for (int i = 0; i < numWeights; i++) { + assert(weights[i]->region != LogicalRegion::NO_REGION); + launcher.add_region_requirement(RegionRequirement(weights[i]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[i]->region)); + launcher.add_field(offset + i, FID_DATA); + } + offset += numWeights; + for (int i = 0; i < numOutputs; i++) { + assert(outputs[i]->region != LogicalRegion::NO_REGION); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[i]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[i]->region)); + launcher.add_field(offset + i, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} + void FusedOp::backward(FFModel const &ff) { // Set iter_config iter_config = ff.iter_config; diff --git a/src/ops/fused.cu b/src/ops/fused.cu index f404e305e6..64fe331400 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -78,13 +78,21 @@ OpMeta *FusedOp::init_task(Task const *task, regions[...](I): weights regions[...](O): outputs */ -__host__ void FusedOp::forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { +__host__ void + FusedOp::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { // const FusedOp* fused = (FusedOp*) task->args; FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); FusedOp const *fused = metas->fused_op; + // BatchConfig const *bc = (BatchConfig *)task->args; + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + // Return if no active tokens + if (bc->num_tokens == 0) { + return; + } + assert(metas->numOperators == fused->numOperators); assert(regions.size() == task->regions.size()); assert((int)regions.size() == @@ -174,10 +182,11 @@ __host__ void FusedOp::forward_task(Task const *task, my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; } for (int i = 0; i < fused->op_num_outputs[op]; i++) { + int my_off = fused->op_output_idx[i + ooff]; assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; - my_output_accessor[i] = output_accessor[i + ooff]; + my_output_accessor[i] = output_accessor[my_off]; } switch (fused->op_op_type[op]) { case OP_CONCAT: { @@ -192,21 +201,6 @@ __host__ void 
FusedOp::forward_task(Task const *task, m->legion_axis); break; } - case OP_CONV2D: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_dim() == 5); - assert(my_weight_accessor[0].domain.get_dim() == 5); - assert(my_output_accessor[0].domain.get_dim() == 5); - Conv2DMeta *m = (Conv2DMeta *)metas->meta[op]; - Kernels::Conv2D::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_weight_accessor[0].get_float_ptr(), - my_weight_accessor[1].get_float_ptr()); - break; - } case OP_BATCHNORM: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -222,16 +216,6 @@ __host__ void FusedOp::forward_task(Task const *task, my_weight_accessor[1].get_float_ptr()); break; } - case OP_DROPOUT: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - DropoutMeta *m = (DropoutMeta *)metas->meta[op]; - Kernels::Dropout::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); - break; - } case OP_LINEAR: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -242,25 +226,49 @@ __host__ void FusedOp::forward_task(Task const *task, assert(my_output_accessor[0].domain.get_volume() == out_dim * batch_size); assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); - float const *bias_ptr = nullptr; + void const *bias_ptr = nullptr; LinearMeta *m = (LinearMeta *)metas->meta[op]; if (fused->op_num_weights[op] == 2) { assert(my_weight_accessor[1].domain.get_volume() == out_dim); if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) { - bias_ptr = my_weight_accessor[1].get_float_ptr(); + bias_ptr = my_weight_accessor[1].ptr; } } else { assert(fused->op_num_weights[op] == 1); } - Kernels::Linear::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_weight_accessor[0].get_float_ptr(), - bias_ptr, - in_dim, - out_dim, - batch_size); + assert(m->input_type[0] == my_input_accessor[0].data_type); + assert(m->input_type[0] == my_output_accessor[0].data_type); + batch_size = bc->num_active_infr_tokens(); + Kernels::Linear::forward_kernel_wrapper(m, + my_input_accessor[0].ptr, + my_output_accessor[0].ptr, + my_weight_accessor[0].ptr, + bias_ptr, + in_dim, + out_dim, + batch_size); + break; + } + case OP_LORA_MLP_FIRST: + case OP_LORA_MLP_SECOND: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + Domain input_domain = my_input_accessor[0].domain; + Domain output_domain = my_output_accessor[0].domain; + int in_dim = input_domain.hi()[0] - input_domain.lo()[0] + 1; + int out_dim = output_domain.hi()[0] - output_domain.lo()[0] + 1; + int batch_size = my_input_accessor[0].domain.get_volume() / in_dim; + assert(my_output_accessor[0].domain.get_volume() == + out_dim * batch_size); + assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); + LoraLinearMeta *m = (LoraLinearMeta *)metas->meta[op]; + assert(m->input_type[0] == my_input_accessor[0].data_type); + assert(m->output_type[0] == my_output_accessor[0].data_type); + // Assert that the output and the second input are at the same place + // since we ``inplace'' the output for LoRA + assert(my_input_accessor[1].ptr == my_output_accessor[0].ptr); + Kernels::LoraLinear::inference_kernel_wrapper( + m, bc, my_input_accessor[0], my_output_accessor[0]); break; } case 
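// A minimal sketch (outside the diff above) of what the LoRA branch computes:
// the second input aliases the dense layer's output buffer, and the kernel
// accumulates the low-rank update into it, y += (alpha / r) * B * (A * x).
// A, B, r, alpha, and the alpha/r scaling follow the usual LoRA convention and
// are assumptions here, not taken from the FlexFlow kernel itself.
static void lora_inplace_sketch(float *y,        // dense output, updated in place
                                float const *x,  // layer input, length in_dim
                                float const *A,  // r x in_dim
                                float const *B,  // out_dim x r
                                int in_dim, int out_dim, int r, float alpha) {
  float const scale = alpha / r;
  for (int j = 0; j < r; j++) {
    float aj = 0.0f;
    for (int k = 0; k < in_dim; k++) {
      aj += A[j * in_dim + k] * x[k]; // (A x)_j
    }
    for (int o = 0; o < out_dim; o++) {
      y[o] += scale * B[o * r + j] * aj; // accumulate into the existing output
    }
  }
}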
OP_BATCHMATMUL: { @@ -388,88 +396,126 @@ __host__ void FusedOp::forward_task(Task const *task, case OP_RELU: case OP_SIGMOID: case OP_TANH: - case OP_ELU: { + case OP_ELU: + case OP_SCALAR_TRUE_DIV: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); assert(my_input_accessor[0].domain == my_output_accessor[0].domain); ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + if (m->data_type == DT_HALF) { + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_half_ptr(), + my_output_accessor[0].get_half_ptr(), + my_input_accessor[0].domain.get_volume()); + } else if (m->data_type == DT_FLOAT) { + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + } else { + assert(false && "Unsupported data type in ElementUnary forward"); + } break; } - case OP_POOL2D: { + case OP_RMS_NORM: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_weights[op] == 1); assert(fused->op_num_outputs[op] == 1); - // assert(my_input_accessor[0].domain == my_output_accessor[0].domain); - Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; - Kernels::Pool2D::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); + RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op]; + Kernels::RMSNorm::forward_kernel_wrapper(m, + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0]); break; } - case OP_FLAT: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - Kernels::Flat::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + case OP_RESIDUAL_RMS_NORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 2); + ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op]; + Kernels::ResidualRMSNorm::forward_kernel_wrapper(m, + my_input_accessor[0], + my_input_accessor[1], + my_weight_accessor[0], + my_output_accessor[0], + my_output_accessor[1]); break; } - case OP_SOFTMAX: { + case OP_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; - if (m->input_type == DT_HALF) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_half_ptr(), - my_output_accessor[0].get_half_ptr()); - } else if (m->input_type == DT_FLOAT) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); + IncMultiHeadSelfAttentionMeta const *m = + (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] 
== 2); + biases = my_weight_accessor[1]; } + IncMultiHeadSelfAttention::inference_kernel_wrapper( + m, + bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); break; } - case OP_RESHAPE: { + case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - Kernels::Reshape::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + TreeIncMultiHeadSelfAttentionMeta *m = + (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op]; + // TreeVerifyBatchConfig const *tree_bc = + // (TreeVerifyBatchConfig *)task->args; + TreeVerifyBatchConfig const &tree_bc = + Future(task->futures[0]).get_result(); + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( + m, + &tree_bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); break; } - case OP_TRANSPOSE: { + case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - TransposeMeta *m = (TransposeMeta *)metas->meta[op]; - Kernels::Transpose::forward_kernel_wrapper( + SpecIncMultiHeadSelfAttentionMeta const *m = + (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op]; + // BeamSearchBatchConfig const *beam_bc = + // (BeamSearchBatchConfig *)task->args; + BeamSearchBatchConfig const &beam_bc = + Future(task->futures[0]).get_result(); + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain, - my_output_accessor[0].domain); + &beam_bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); break; } case OP_LAYERNORM: { @@ -491,23 +537,119 @@ __host__ void FusedOp::forward_task(Task const *task, break; } case OP_RESIDUAL_LAYERNORM: { - assert(false && "Operator ResidualLayerNorm does not support " - "the forward() task"); - break; - } - case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { - assert(false && "Operator AddBiasResidualLayerNorm does not support " - "the forward() task"); - break; - } - case OP_SIGMOID_SILU_MULTI: { - assert(false && "Operator SigmoidSiluMulti does not support " - "the forward() task"); - break; - } - case OP_RESIDUAL_RMS_NORM: { - assert(false && "Operator ResidualRMSNorm does not support " - "the forward() task"); + assert(fused->op_num_outputs[op] == 2); + ResidualLayerNormMeta const *m = + (ResidualLayerNormMeta *)metas->meta[op]; + if (m->use_two_residuals) { + assert(fused->op_num_inputs[op] == 3); + } else { + assert(fused->op_num_inputs[op] == 2); + } + if 
(!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 0); + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 1); // weight + } else { + assert(fused->op_num_weights[op] == 2); // weight + bias + } + } + GenericTensorAccessorR residual2; + if (m->use_two_residuals) { + residual2 = my_input_accessor[2]; + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + if (m->use_bias) { + beta = my_weight_accessor[1]; + } + } + ResidualLayerNorm::inference_kernel_wrapper(m, + my_input_accessor[0], + my_input_accessor[1], + residual2, + my_output_accessor[0], + my_output_accessor[1], + gamma, + beta); + break; + } + case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 2); + AddBiasResidualLayerNormMeta const *m = + (AddBiasResidualLayerNormMeta *)metas->meta[op]; + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1); // attn bias + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 2); // attn bias + weight + } else { + assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias + } + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[1]; + if (m->use_bias) { + beta = my_weight_accessor[2]; + } + } + Domain attn_bias_domain = my_weight_accessor[0].domain; + Domain residual_domain = my_input_accessor[1].domain; + int attn_bias_dim = + attn_bias_domain.hi()[0] - attn_bias_domain.lo()[0] + 1; + int residual_volume = residual_domain.get_volume(); + AddBiasResidualLayerNorm::inference_kernel_wrapper( + m, + attn_bias_dim, + residual_volume, + my_input_accessor[0], + my_output_accessor[0], + my_output_accessor[1], + my_input_accessor[1], + my_weight_accessor[0], + gamma, + beta); + break; + } + case OP_SIGMOID_SILU_MULTI: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta *)metas->meta[op]; + SigmoidSiluMulti::inference_kernel_wrapper(m, + my_input_accessor[0], + my_input_accessor[1], + my_output_accessor[0]); + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + if (m->input_type == DT_HALF) { + Kernels::Softmax::forward_kernel_wrapper( + m, + my_input_accessor[0].get_half_ptr(), + my_output_accessor[0].get_half_ptr()); + } else if (m->input_type == DT_FLOAT) { + Kernels::Softmax::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); + } + break; + } + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::inference_kernel_wrapper( + m, bc, my_input_accessor[0], my_output_accessor[0]); break; } default: { @@ -517,6 +659,37 @@ __host__ void FusedOp::forward_task(Task const *task, assert(false && "Fusion currently does not support type"); } } + if (metas->meta[op]->inference_debugging) { + std::vector input_accessors_to_save; + std::vector weight_accessors_to_save; + std::vector output_accessors_to_save; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + int my_off = fused->op_input_idx[i + ioff]; + if (fused->op_input_source[i + ioff] == 
SOURCE_INPUT) { + input_accessors_to_save.push_back(input_accessor[my_off]); + } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { + input_accessors_to_save.push_back(output_accessor[my_off]); + } else { + assert(false); + } + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); + weight_accessors_to_save.push_back( + weight_accessor[fused->op_weight_idx[i + woff]]); + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + output_accessors_to_save.push_back(output_accessor[i + ooff]); + } + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + FusedOp::save_inference_tensors_to_file(metas->meta[op], + shard_id, + bc, + input_accessors_to_save, + weight_accessors_to_save, + output_accessors_to_save); + } ioff += fused->op_num_inputs[op]; woff += fused->op_num_weights[op]; ooff += fused->op_num_outputs[op]; @@ -531,18 +704,17 @@ __host__ void FusedOp::forward_task(Task const *task, regions[...](I): weights regions[...](O): outputs */ -__host__ void - FusedOp::inference_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { +__host__ void FusedOp::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { // const FusedOp* fused = (FusedOp*) task->args; FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); FusedOp const *fused = metas->fused_op; // BatchConfig const *bc = (BatchConfig *)task->args; BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); // Return if no active tokens - if (bc->num_tokens == 0) { + if (bc->num_active_tokens() == 0) { return; } @@ -553,15 +725,15 @@ __host__ void // Domain input_domain[MAX_NUM_INPUTS]; // Domain weight_domain[MAX_NUM_WEIGHTS]; // Domain output_domain[MAX_NUM_OUTPUTS]; - GenericTensorAccessorR input_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorW input_grad_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorW output_accessor[MAX_NUM_OUTPUTS]; + GenericTensorAccessorW output_grad_accessor[MAX_NUM_OUTPUTS]; assert(fused->numInputs <= MAX_NUM_INPUTS); for (int i = 0; i < fused->numInputs; i++) { // input_domain[i] = runtime->get_index_space_domain( // ctx, task->regions[i].region.get_index_space()); - input_accessor[i] = - helperGetGenericTensorAccessorRO(fused->input_data_types[i], + input_grad_accessor[i] = + helperGetGenericTensorAccessorRW(fused->input_data_types[i], regions[i], task->regions[i], FID_DATA, @@ -586,8 +758,8 @@ __host__ void for (int i = 0; i < fused->numOutputs; i++) { // output_domain[i] = runtime->get_index_space_domain( // ctx, task->regions[i + roff].region.get_index_space()); - output_accessor[i] = - helperGetGenericTensorAccessorWO(fused->output_data_types[i], + output_grad_accessor[i] = + helperGetGenericTensorAccessorRW(fused->output_data_types[i], regions[i + roff], task->regions[i + roff], FID_DATA, @@ -609,21 +781,32 @@ __host__ void } int ioff = 0, woff = 0, ooff = 0; + // Domain my_id[MAX_NUM_INPUTS]; + // Domain my_wd[MAX_NUM_WEIGHTS]; + // Domain my_od[MAX_NUM_OUTPUTS]; + GenericTensorAccessorW my_input_grad_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorW my_output_grad_accessor[MAX_NUM_OUTPUTS]; + + // Do backpropagation in the reverse ordering for (int op = 0; op < fused->numOperators; op++) { - // Domain my_id[MAX_NUM_INPUTS]; - // Domain my_wd[MAX_NUM_WEIGHTS]; - // Domain 
my_od[MAX_NUM_OUTPUTS]; - GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; - GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorW my_output_accessor[MAX_NUM_OUTPUTS]; + ioff += fused->op_num_inputs[op]; + woff += fused->op_num_weights[op]; + ooff += fused->op_num_outputs[op]; + } + + for (int op = fused->numOperators - 1; op >= 0; op--) { + ioff -= fused->op_num_inputs[op]; + woff -= fused->op_num_weights[op]; + ooff -= fused->op_num_outputs[op]; for (int i = 0; i < fused->op_num_inputs[op]; i++) { int my_off = fused->op_input_idx[i + ioff]; if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { // my_id[i] = input_domain[my_off]; - my_input_accessor[i] = input_accessor[my_off]; + my_input_grad_accessor[i] = input_grad_accessor[my_off]; } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { // my_id[i] = output_domain[my_off]; - my_input_accessor[i] = output_accessor[my_off]; + my_input_grad_accessor[i] = output_grad_accessor[my_off]; } else { assert(false); } @@ -639,7 +822,7 @@ __host__ void assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; - my_output_accessor[i] = output_accessor[my_off]; + my_output_grad_accessor[i] = output_grad_accessor[my_off]; } switch (fused->op_op_type[op]) { case OP_CONCAT: { @@ -647,26 +830,31 @@ __host__ void assert(fused->op_num_outputs[op] == 1); ConcatMeta *m = (ConcatMeta *)metas->meta[op]; int num_inputs = fused->op_num_inputs[op]; - Kernels::Concat::forward_kernel_wrapper(m, - my_output_accessor[0], - my_input_accessor, - num_inputs, - m->legion_axis); + // TODO: implement this + assert(false); + // Kernels::Concat::peft_bwd_kernel_wrapper(m, + // my_output_accessor[0], + // my_input_accessor, + // num_inputs, + // m->legion_axis); break; } case OP_BATCHNORM: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_dim() == 5); - assert(my_output_accessor[0].domain.get_dim() == 5); + assert(my_input_grad_accessor[0].domain.get_dim() == 5); + assert(my_output_grad_accessor[0].domain.get_dim() == 5); assert(my_weight_accessor[0].domain.get_dim() == 2); assert(my_weight_accessor[1].domain.get_dim() == 2); - BatchNormMeta *m = (BatchNormMeta *)metas->meta[op]; - BatchNorm::forward_kernel(m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_weight_accessor[0].get_float_ptr(), - my_weight_accessor[1].get_float_ptr()); + // TODO: implement this + assert(false); + // BatchNormMeta *m = (BatchNormMeta *)metas->meta[op]; + // BatchNorm::peft_bwd_kernel_kernel( + // m, + // my_input_accessor[0].get_float_ptr(), + // my_output_accessor[0].get_float_ptr(), + // my_weight_accessor[0].get_float_ptr(), + // my_weight_accessor[1].get_float_ptr()); break; } case OP_LINEAR: { @@ -675,10 +863,11 @@ __host__ void Domain kernel_domain = my_weight_accessor[0].domain; int in_dim = kernel_domain.hi()[0] - kernel_domain.lo()[0] + 1; int out_dim = kernel_domain.hi()[1] - kernel_domain.lo()[1] + 1; - int batch_size = my_input_accessor[0].domain.get_volume() / in_dim; - assert(my_output_accessor[0].domain.get_volume() == + int batch_size = my_input_grad_accessor[0].domain.get_volume() / in_dim; + assert(my_output_grad_accessor[0].domain.get_volume() == out_dim * batch_size); - assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); + assert(my_input_grad_accessor[0].domain.get_volume() == 
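// A minimal sketch of the offset bookkeeping used by peft_bwd_task above,
// with hypothetical per-operator counts: the first loop advances
// ioff/woff/ooff past every operator, and the reverse loop then peels each
// operator's slice off the back, so indices [ioff, ioff + op_num_inputs[op])
// always address exactly that operator's inputs.
static void reverse_slice_walk_sketch() {
  int const num_ops = 3;
  int const op_num_inputs[num_ops] = {1, 2, 1}; // hypothetical counts
  int ioff = 0;
  for (int op = 0; op < num_ops; op++) {
    ioff += op_num_inputs[op]; // after this loop, ioff == 4 (total inputs)
  }
  for (int op = num_ops - 1; op >= 0; op--) {
    ioff -= op_num_inputs[op];
    // op == 2 -> ioff == 3, op == 1 -> ioff == 1, op == 0 -> ioff == 0
  }
}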
+ in_dim * batch_size); void const *bias_ptr = nullptr; LinearMeta *m = (LinearMeta *)metas->meta[op]; if (fused->op_num_weights[op] == 2) { @@ -689,48 +878,50 @@ __host__ void } else { assert(fused->op_num_weights[op] == 1); } - assert(m->input_type[0] == my_input_accessor[0].data_type); - assert(m->input_type[0] == my_output_accessor[0].data_type); - batch_size = bc->num_active_infr_tokens(); - Kernels::Linear::forward_kernel_wrapper(m, - my_input_accessor[0].ptr, - my_output_accessor[0].ptr, - my_weight_accessor[0].ptr, - bias_ptr, - in_dim, - out_dim, - batch_size); + assert(m->input_type[0] == my_input_grad_accessor[0].data_type); + assert(m->input_type[0] == my_output_grad_accessor[0].data_type); + int num_infr_tokens = bc->num_active_infr_tokens(); + int num_peft_tokens = bc->num_active_peft_tokens(); + Kernels::Linear::peft_bwd_kernel_wrapper(m, + my_input_grad_accessor[0].ptr, + my_output_grad_accessor[0].ptr, + my_weight_accessor[0].ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens); break; } case OP_LORA_MLP_FIRST: case OP_LORA_MLP_SECOND: { assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_outputs[op] == 1); - Domain input_domain = my_input_accessor[0].domain; - Domain output_domain = my_output_accessor[0].domain; + Domain input_domain = my_input_grad_accessor[0].domain; + Domain output_domain = my_output_grad_accessor[0].domain; int in_dim = input_domain.hi()[0] - input_domain.lo()[0] + 1; int out_dim = output_domain.hi()[0] - output_domain.lo()[0] + 1; - int batch_size = my_input_accessor[0].domain.get_volume() / in_dim; - assert(my_output_accessor[0].domain.get_volume() == + int batch_size = my_input_grad_accessor[0].domain.get_volume() / in_dim; + assert(my_output_grad_accessor[0].domain.get_volume() == out_dim * batch_size); - assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); + assert(my_input_grad_accessor[0].domain.get_volume() == + in_dim * batch_size); LoraLinearMeta *m = (LoraLinearMeta *)metas->meta[op]; - assert(m->input_type[0] == my_input_accessor[0].data_type); - assert(m->output_type[0] == my_output_accessor[0].data_type); + assert(m->input_type[0] == my_input_grad_accessor[0].data_type); + assert(m->output_type[0] == my_output_grad_accessor[0].data_type); // Assert that the output and the second input are at the same place // since we ``inplace'' the output for LoRA - assert(my_input_accessor[1].ptr == my_output_accessor[0].ptr); - Kernels::LoraLinear::inference_kernel_wrapper( - m, bc, my_input_accessor[0], my_output_accessor[0]); + assert(my_input_grad_accessor[1].ptr == my_output_grad_accessor[0].ptr); + Kernels::LoraLinear::peft_bwd_kernel_wrapper( + m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); break; } case OP_BATCHMATMUL: { assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - Domain out_domain = my_output_accessor[0].domain; - Domain a_domain = my_input_accessor[0].domain; - Domain b_domain = my_input_accessor[1].domain; + Domain out_domain = my_output_grad_accessor[0].domain; + Domain a_domain = my_input_grad_accessor[0].domain; + Domain b_domain = my_input_grad_accessor[1].domain; int m = b_domain.hi()[0] - b_domain.lo()[0] + 1; assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1); int n = a_domain.hi()[1] - a_domain.lo()[1] + 1; @@ -746,20 +937,22 @@ __host__ void assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1); batch *= dim_size; } - BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op]; - 
Kernels::BatchMatmul::forward_kernel_wrapper( - meta, - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].get_float_ptr(), - my_input_accessor[1].get_float_ptr(), - (float const *)nullptr, - m, - n, - k, - batch, - meta->a_seq_length_dim, - meta->b_seq_length_dim, - fused->iter_config.seq_length); + // TODO: implement me + assert(false); + // BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op]; + // Kernels::BatchMatmul::backward_kernel_wrapper( + // meta, + // my_output_accessor[0].get_float_ptr(), + // my_input_accessor[0].get_float_ptr(), + // my_input_accessor[1].get_float_ptr(), + // (float const *)nullptr, + // m, + // n, + // k, + // batch, + // meta->a_seq_length_dim, + // meta->b_seq_length_dim, + // fused->iter_config.seq_length); break; } case OP_EW_ADD: @@ -771,78 +964,20 @@ __host__ void assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_input_accessor[1].domain); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); - ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; - Kernels::ElementBinary::forward_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - my_output_accessor[0]); + assert(my_input_grad_accessor[0].domain == + my_input_grad_accessor[1].domain); + assert(my_input_grad_accessor[0].domain == + my_output_grad_accessor[0].domain); + // ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; + // Kernels::ElementBinary::forward_kernel_wrapper(m, + // my_input_accessor[0], + // my_input_accessor[1], + // my_output_accessor[0]); break; } case OP_EMBEDDING: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 1); - assert(fused->op_num_outputs[op] == 1); - EmbeddingMeta *m = (EmbeddingMeta *)metas->meta[op]; - if (m->aggr == AGGR_MODE_NONE) { - // assert(kernel_domain.get_dim() == 2); - assert(my_input_accessor[0].domain.get_dim() + 1 == - my_output_accessor[0].domain.get_dim()); - for (size_t i = 0; i < my_input_accessor[0].domain.get_dim(); i++) { - assert(my_input_accessor[0].domain.hi()[i] == - my_output_accessor[0].domain.hi()[i + 1]); - assert(my_input_accessor[0].domain.lo()[i] == - my_output_accessor[0].domain.lo()[i + 1]); - } - assert(my_weight_accessor[0].domain.hi()[0] - - my_weight_accessor[0].domain.lo()[0] == - my_output_accessor[0].domain.hi()[0] - - my_output_accessor[0].domain.lo()[0]); - } else { - assert(my_input_accessor[0].domain.get_dim() == - my_output_accessor[0].domain.get_dim()); - for (size_t i = 1; i < my_input_accessor[0].domain.get_dim(); i++) { - assert(my_input_accessor[0].domain.hi()[i] == - my_output_accessor[0].domain.hi()[i]); - assert(my_input_accessor[0].domain.lo()[i] == - my_output_accessor[0].domain.lo()[i]); - } - assert(my_weight_accessor[0].domain.hi()[0] - - my_weight_accessor[0].domain.lo()[0] == - my_output_accessor[0].domain.hi()[0] - - my_output_accessor[0].domain.lo()[0]); - } - int in_dim, out_dim, effective_batch_size; - if (m->aggr == AGGR_MODE_NONE) { - in_dim = 1; - out_dim = my_output_accessor[0].domain.hi()[0] - - my_output_accessor[0].domain.lo()[0] + 1; - effective_batch_size = - my_output_accessor[0].domain.get_volume() / out_dim; - assert(effective_batch_size * in_dim == - my_input_accessor[0].domain.get_volume()); - } else { - assert(m->aggr == AGGR_MODE_AVG || m->aggr == AGGR_MODE_SUM); - in_dim = my_input_accessor[0].domain.hi()[0] - - my_input_accessor[0].domain.lo()[0] + 1; - out_dim = 
my_output_accessor[0].domain.hi()[0] - - my_output_accessor[0].domain.lo()[0] + 1; - effective_batch_size = - my_output_accessor[0].domain.get_volume() / out_dim; - assert(effective_batch_size * in_dim == - my_input_accessor[0].domain.get_volume()); - } - - assert(my_input_accessor[0].data_type == DT_INT32 || - my_input_accessor[0].data_type == DT_INT64); - Kernels::Embedding::forward_kernel_wrapper(m, - my_input_accessor[0], - my_output_accessor[0], - my_weight_accessor[0], - in_dim, - out_dim, - effective_batch_size); + // Currently assume the Embedding layer cannot be finetuned + // so we do nothing for embedding break; } case OP_GELU: @@ -854,23 +989,26 @@ __host__ void assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); - ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; - if (m->data_type == DT_HALF) { - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_half_ptr(), - my_output_accessor[0].get_half_ptr(), - my_input_accessor[0].domain.get_volume()); - } else if (m->data_type == DT_FLOAT) { - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); - } else { - assert(false && "Unsupported data type in ElementUnary forward"); - } + assert(my_input_grad_accessor[0].domain == + my_output_grad_accessor[0].domain); + // TODO: implement me + assert(false); + // ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; + // if (m->data_type == DT_HALF) { + // ElementUnary::forward_kernel_wrapper( + // m, + // my_input_accessor[0].get_half_ptr(), + // my_output_accessor[0].get_half_ptr(), + // my_input_accessor[0].domain.get_volume()); + // } else if (m->data_type == DT_FLOAT) { + // ElementUnary::forward_kernel_wrapper( + // m, + // my_input_accessor[0].get_float_ptr(), + // my_output_accessor[0].get_float_ptr(), + // my_input_accessor[0].domain.get_volume()); + // } else { + // assert(false && "Unsupported data type in ElementUnary forward"); + // } break; } case OP_RMS_NORM: { @@ -878,23 +1016,26 @@ __host__ void assert(fused->op_num_weights[op] == 1); assert(fused->op_num_outputs[op] == 1); RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op]; - Kernels::RMSNorm::forward_kernel_wrapper(m, - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0]); + Kernels::RMSNorm::peft_bwd_kernel_wrapper(m, + my_output_grad_accessor[0], + my_input_grad_accessor[0], + my_weight_accessor[0]); break; } case OP_RESIDUAL_RMS_NORM: { + // TODO: implement me + assert(false); assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_weights[op] == 1); assert(fused->op_num_outputs[op] == 2); - ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op]; - Kernels::ResidualRMSNorm::forward_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - my_weight_accessor[0], - my_output_accessor[0], - my_output_accessor[1]); + // ResidualRMSNormMeta const *m = (ResidualRMSNormMeta + // *)metas->meta[op]; + // Kernels::ResidualRMSNorm::forward_kernel_wrapper(m, + // my_input_accessor[0], + // my_input_accessor[1], + // my_weight_accessor[0], + // my_output_accessor[0], + // my_output_accessor[1]); break; } case OP_INC_MULTIHEAD_SELF_ATTENTION: { @@ -909,66 +1050,20 @@ __host__ void assert(fused->op_num_weights[op] == 2); biases = my_weight_accessor[1]; } - 
IncMultiHeadSelfAttention::inference_kernel_wrapper( + IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( m, bc, task->index_point.point_data[0], - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); - break; - } - case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - TreeIncMultiHeadSelfAttentionMeta *m = - (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op]; - // TreeVerifyBatchConfig const *tree_bc = - // (TreeVerifyBatchConfig *)task->args; - TreeVerifyBatchConfig const &tree_bc = - Future(task->futures[0]).get_result(); - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } - TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( - m, - &tree_bc, - task->index_point.point_data[0], - my_input_accessor[0], + my_input_grad_accessor[0], my_weight_accessor[0], - my_output_accessor[0], + my_output_grad_accessor[0], biases); break; } + case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - SpecIncMultiHeadSelfAttentionMeta const *m = - (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op]; - // BeamSearchBatchConfig const *beam_bc = - // (BeamSearchBatchConfig *)task->args; - BeamSearchBatchConfig const &beam_bc = - Future(task->futures[0]).get_result(); - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } - SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( - m, - &beam_bc, - task->index_point.point_data[0], - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + // TODO: implement me + assert(false); break; } case OP_LAYERNORM: { @@ -985,8 +1080,8 @@ __host__ void beta = my_weight_accessor[1]; } } - LayerNorm::forward_kernel_wrapper( - m, my_input_accessor[0], my_output_accessor[0], gamma, beta); + LayerNorm::peft_bwd_kernel_wrapper( + m, my_output_grad_accessor[0], my_input_grad_accessor[0], gamma); break; } case OP_RESIDUAL_LAYERNORM: { @@ -1009,7 +1104,7 @@ __host__ void } GenericTensorAccessorR residual2; if (m->use_two_residuals) { - residual2 = my_input_accessor[2]; + residual2 = my_input_grad_accessor[2]; } GenericTensorAccessorR gamma, beta; if (m->elementwise_affine) { @@ -1018,14 +1113,16 @@ __host__ void beta = my_weight_accessor[1]; } } - ResidualLayerNorm::inference_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - residual2, - my_output_accessor[0], - my_output_accessor[1], - gamma, - beta); + // TODO: implment me + assert(false); + // ResidualLayerNorm::inference_kernel_wrapper(m, + // my_input_accessor[0], + // my_input_accessor[1], + // residual2, + // my_output_accessor[0], + // my_output_accessor[1], + // gamma, + // beta); break; } case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { @@ -1050,59 +1147,55 @@ __host__ void } } Domain attn_bias_domain = my_weight_accessor[0].domain; - Domain residual_domain = my_input_accessor[1].domain; + Domain residual_domain = my_input_grad_accessor[1].domain; int attn_bias_dim = attn_bias_domain.hi()[0] - attn_bias_domain.lo()[0] + 1; int residual_volume = residual_domain.get_volume(); - 
AddBiasResidualLayerNorm::inference_kernel_wrapper( - m, - attn_bias_dim, - residual_volume, - my_input_accessor[0], - my_output_accessor[0], - my_output_accessor[1], - my_input_accessor[1], - my_weight_accessor[0], - gamma, - beta); + // TODO: implement me + assert(false); + // AddBiasResidualLayerNorm::inference_kernel_wrapper( + // m, + // attn_bias_dim, + // residual_volume, + // my_input_accessor[0], + // my_output_accessor[0], + // my_output_accessor[1], + // my_input_accessor[1], + // my_weight_accessor[0], + // gamma, + // beta); break; } case OP_SIGMOID_SILU_MULTI: { assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_outputs[op] == 1); - SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta *)metas->meta[op]; - SigmoidSiluMulti::inference_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - my_output_accessor[0]); + // SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta + // *)metas->meta[op]; + // TODO: implement me + assert(false); + // SigmoidSiluMulti::inference_kernel_wrapper(m, + // my_input_accessor[0], + // my_input_accessor[1], + // my_output_accessor[0]); break; } case OP_SOFTMAX: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); + assert(my_input_grad_accessor[0].domain.get_volume() == + my_output_grad_accessor[0].domain.get_volume()); SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; - if (m->input_type == DT_HALF) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_half_ptr(), - my_output_accessor[0].get_half_ptr()); - } else if (m->input_type == DT_FLOAT) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); - } + Kernels::Softmax::peft_bwd_kernel_wrapper( + m, my_input_grad_accessor[0], my_output_grad_accessor[0]); break; } case OP_ALLREDUCE: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; - Kernels::AllReduce::inference_kernel_wrapper( - m, bc, my_input_accessor[0], my_output_accessor[0]); + Kernels::AllReduce::peft_bwd_kernel_wrapper( + m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); break; } default: { @@ -1112,36 +1205,458 @@ __host__ void assert(false && "Fusion currently does not support type"); } } - if (metas->meta[op]->inference_debugging) { - std::vector input_accessors_to_save; - std::vector weight_accessors_to_save; - std::vector output_accessors_to_save; - for (int i = 0; i < fused->op_num_inputs[op]; i++) { - int my_off = fused->op_input_idx[i + ioff]; - if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { - input_accessors_to_save.push_back(input_accessor[my_off]); - } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { - input_accessors_to_save.push_back(output_accessor[my_off]); - } else { - assert(false); - } - } - for (int i = 0; i < fused->op_num_weights[op]; i++) { - assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); - weight_accessors_to_save.push_back( - weight_accessor[fused->op_weight_idx[i + woff]]); - } - for (int i = 0; i < fused->op_num_outputs[op]; i++) { - output_accessors_to_save.push_back(output_accessor[i + ooff]); - } - assert(task->index_point.get_dim() == 1); - int shard_id = task->index_point.point_data[0]; - FusedOp::save_inference_tensors_to_file(metas->meta[op], - shard_id, - bc, - 
input_accessors_to_save, - weight_accessors_to_save, - output_accessors_to_save); + ioff += fused->op_num_inputs[op]; + woff += fused->op_num_weights[op]; + ooff += fused->op_num_outputs[op]; + } + // for (int i = 0; i < fused->numOutputs; i++) + // print_tensor(output_ptr[i], output_domain[i].get_volume(), + // "[Fused:forward:output]"); +} + +/* + regions[...](I): inputs + regions[...](I): weights + regions[...](O): outputs +*/ +__host__ void FusedOp::forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + // const FusedOp* fused = (FusedOp*) task->args; + FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); + FusedOp const *fused = metas->fused_op; + assert(metas->numOperators == fused->numOperators); + assert(regions.size() == task->regions.size()); + assert((int)regions.size() == + fused->numInputs + fused->numWeights + fused->numOutputs); + // Domain input_domain[MAX_NUM_INPUTS]; + // Domain weight_domain[MAX_NUM_WEIGHTS]; + // Domain output_domain[MAX_NUM_OUTPUTS]; + GenericTensorAccessorR input_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorW output_accessor[MAX_NUM_OUTPUTS]; + assert(fused->numInputs <= MAX_NUM_INPUTS); + for (int i = 0; i < fused->numInputs; i++) { + // input_domain[i] = runtime->get_index_space_domain( + // ctx, task->regions[i].region.get_index_space()); + input_accessor[i] = + helperGetGenericTensorAccessorRO(fused->input_data_types[i], + regions[i], + task->regions[i], + FID_DATA, + ctx, + runtime); + } + int roff = fused->numInputs; + assert(fused->numWeights <= MAX_NUM_WEIGHTS); + for (int i = 0; i < fused->numWeights; i++) { + // weight_domain[i] = runtime->get_index_space_domain( + // ctx, task->regions[i + roff].region.get_index_space()); + weight_accessor[i] = + helperGetGenericTensorAccessorRO(fused->weight_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + } + roff += fused->numWeights; + assert(fused->numOutputs <= MAX_NUM_OUTPUTS); + for (int i = 0; i < fused->numOutputs; i++) { + // output_domain[i] = runtime->get_index_space_domain( + // ctx, task->regions[i + roff].region.get_index_space()); + output_accessor[i] = + helperGetGenericTensorAccessorWO(fused->output_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + } + // Assert that all meta share the same dnn/blas handler + int start = 0; + for (start = 0; start < fused->numOperators; start++) { + if (metas->meta[start] != NULL) { + break; + } + } + for (int op = start + 1; op < fused->numOperators; op++) { + if (metas->meta[op] != NULL) { + assert(metas->meta[start]->handle.blas == metas->meta[op]->handle.blas); + assert(metas->meta[start]->handle.dnn == metas->meta[op]->handle.dnn); + } + } + + int ioff = 0, woff = 0, ooff = 0; + for (int op = 0; op < fused->numOperators; op++) { + // Domain my_id[MAX_NUM_INPUTS]; + // Domain my_wd[MAX_NUM_WEIGHTS]; + // Domain my_od[MAX_NUM_OUTPUTS]; + GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorW my_output_accessor[MAX_NUM_OUTPUTS]; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + int my_off = fused->op_input_idx[i + ioff]; + if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { + // my_id[i] = input_domain[my_off]; + my_input_accessor[i] = input_accessor[my_off]; + } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { + // my_id[i] = 
output_domain[my_off]; + my_input_accessor[i] = output_accessor[my_off]; + } else { + assert(false); + } + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); + // my_wd[i] = weight_domain[fused->op_weight_idx[i + woff]]; + // my_wp[i] = weight_ptr[fused->op_weight_idx[i + woff]]; + my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); + // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; + // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; + my_output_accessor[i] = output_accessor[i + ooff]; + } + switch (fused->op_op_type[op]) { + case OP_CONCAT: { + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + ConcatMeta *m = (ConcatMeta *)metas->meta[op]; + int num_inputs = fused->op_num_inputs[op]; + Kernels::Concat::forward_kernel_wrapper(m, + my_output_accessor[0], + my_input_accessor, + num_inputs, + m->legion_axis); + break; + } + case OP_CONV2D: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_dim() == 5); + assert(my_weight_accessor[0].domain.get_dim() == 5); + assert(my_output_accessor[0].domain.get_dim() == 5); + Conv2DMeta *m = (Conv2DMeta *)metas->meta[op]; + Kernels::Conv2D::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_weight_accessor[0].get_float_ptr(), + my_weight_accessor[1].get_float_ptr()); + break; + } + case OP_BATCHNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_dim() == 5); + assert(my_output_accessor[0].domain.get_dim() == 5); + assert(my_weight_accessor[0].domain.get_dim() == 2); + assert(my_weight_accessor[1].domain.get_dim() == 2); + BatchNormMeta *m = (BatchNormMeta *)metas->meta[op]; + BatchNorm::forward_kernel(m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_weight_accessor[0].get_float_ptr(), + my_weight_accessor[1].get_float_ptr()); + break; + } + case OP_DROPOUT: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + DropoutMeta *m = (DropoutMeta *)metas->meta[op]; + Kernels::Dropout::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); + break; + } + case OP_LINEAR: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + Domain kernel_domain = my_weight_accessor[0].domain; + int in_dim = kernel_domain.hi()[0] - kernel_domain.lo()[0] + 1; + int out_dim = kernel_domain.hi()[1] - kernel_domain.lo()[1] + 1; + int batch_size = my_input_accessor[0].domain.get_volume() / in_dim; + assert(my_output_accessor[0].domain.get_volume() == + out_dim * batch_size); + assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); + float const *bias_ptr = nullptr; + LinearMeta *m = (LinearMeta *)metas->meta[op]; + if (fused->op_num_weights[op] == 2) { + assert(my_weight_accessor[1].domain.get_volume() == out_dim); + if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) { + bias_ptr = my_weight_accessor[1].get_float_ptr(); + } + } else { + assert(fused->op_num_weights[op] == 1); + } + Kernels::Linear::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + 
my_weight_accessor[0].get_float_ptr(), + bias_ptr, + in_dim, + out_dim, + batch_size); + break; + } + case OP_BATCHMATMUL: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + Domain out_domain = my_output_accessor[0].domain; + Domain a_domain = my_input_accessor[0].domain; + Domain b_domain = my_input_accessor[1].domain; + int m = b_domain.hi()[0] - b_domain.lo()[0] + 1; + assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1); + int n = a_domain.hi()[1] - a_domain.lo()[1] + 1; + assert(n == out_domain.hi()[1] - out_domain.lo()[1] + 1); + int k = a_domain.hi()[0] - a_domain.lo()[0] + 1; + assert(k == b_domain.hi()[1] - b_domain.lo()[1] + 1); + assert(a_domain.get_dim() == b_domain.get_dim()); + assert(a_domain.get_dim() == out_domain.get_dim()); + int batch = 1; + for (int i = 2; i < a_domain.get_dim(); i++) { + int dim_size = a_domain.hi()[i] - a_domain.lo()[i] + 1; + assert(dim_size == b_domain.hi()[i] - b_domain.lo()[i] + 1); + assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1); + batch *= dim_size; + } + BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op]; + Kernels::BatchMatmul::forward_kernel_wrapper( + meta, + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].get_float_ptr(), + my_input_accessor[1].get_float_ptr(), + (float const *)nullptr, + m, + n, + k, + batch, + meta->a_seq_length_dim, + meta->b_seq_length_dim, + fused->iter_config.seq_length); + break; + } + case OP_EW_ADD: + case OP_EW_SUB: + case OP_EW_MUL: + case OP_EW_DIV: + case OP_EW_MAX: + case OP_EW_MIN: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain == my_input_accessor[1].domain); + assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; + Kernels::ElementBinary::forward_kernel_wrapper(m, + my_input_accessor[0], + my_input_accessor[1], + my_output_accessor[0]); + break; + } + case OP_EMBEDDING: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 1); + EmbeddingMeta *m = (EmbeddingMeta *)metas->meta[op]; + if (m->aggr == AGGR_MODE_NONE) { + // assert(kernel_domain.get_dim() == 2); + assert(my_input_accessor[0].domain.get_dim() + 1 == + my_output_accessor[0].domain.get_dim()); + for (size_t i = 0; i < my_input_accessor[0].domain.get_dim(); i++) { + assert(my_input_accessor[0].domain.hi()[i] == + my_output_accessor[0].domain.hi()[i + 1]); + assert(my_input_accessor[0].domain.lo()[i] == + my_output_accessor[0].domain.lo()[i + 1]); + } + assert(my_weight_accessor[0].domain.hi()[0] - + my_weight_accessor[0].domain.lo()[0] == + my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0]); + } else { + assert(my_input_accessor[0].domain.get_dim() == + my_output_accessor[0].domain.get_dim()); + for (size_t i = 1; i < my_input_accessor[0].domain.get_dim(); i++) { + assert(my_input_accessor[0].domain.hi()[i] == + my_output_accessor[0].domain.hi()[i]); + assert(my_input_accessor[0].domain.lo()[i] == + my_output_accessor[0].domain.lo()[i]); + } + assert(my_weight_accessor[0].domain.hi()[0] - + my_weight_accessor[0].domain.lo()[0] == + my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0]); + } + int in_dim, out_dim, effective_batch_size; + if (m->aggr == AGGR_MODE_NONE) { + in_dim = 1; + out_dim = 
my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0] + 1; + effective_batch_size = + my_output_accessor[0].domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == + my_input_accessor[0].domain.get_volume()); + } else { + assert(m->aggr == AGGR_MODE_AVG || m->aggr == AGGR_MODE_SUM); + in_dim = my_input_accessor[0].domain.hi()[0] - + my_input_accessor[0].domain.lo()[0] + 1; + out_dim = my_output_accessor[0].domain.hi()[0] - + my_output_accessor[0].domain.lo()[0] + 1; + effective_batch_size = + my_output_accessor[0].domain.get_volume() / out_dim; + assert(effective_batch_size * in_dim == + my_input_accessor[0].domain.get_volume()); + } + + assert(my_input_accessor[0].data_type == DT_INT32 || + my_input_accessor[0].data_type == DT_INT64); + Kernels::Embedding::forward_kernel_wrapper(m, + my_input_accessor[0], + my_output_accessor[0], + my_weight_accessor[0], + in_dim, + out_dim, + effective_batch_size); + break; + } + case OP_GELU: + case OP_RELU: + case OP_SIGMOID: + case OP_TANH: + case OP_ELU: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + break; + } + case OP_POOL2D: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + // assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; + Kernels::Pool2D::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); + break; + } + case OP_FLAT: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + Kernels::Flat::forward_kernel_wrapper( + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + if (m->input_type == DT_HALF) { + Kernels::Softmax::forward_kernel_wrapper( + m, + my_input_accessor[0].get_half_ptr(), + my_output_accessor[0].get_half_ptr()); + } else if (m->input_type == DT_FLOAT) { + Kernels::Softmax::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); + } + break; + } + case OP_RESHAPE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + break; + } + case OP_TRANSPOSE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + 
assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + TransposeMeta *m = (TransposeMeta *)metas->meta[op]; + Kernels::Transpose::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain, + my_output_accessor[0].domain); + break; + } + case OP_LAYERNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + if (m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1 + (int)(m->use_bias)); + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + if (m->use_bias) { + beta = my_weight_accessor[1]; + } + } + LayerNorm::forward_kernel_wrapper( + m, my_input_accessor[0], my_output_accessor[0], gamma, beta); + break; + } + case OP_RESIDUAL_LAYERNORM: { + assert(false && "Operator ResidualLayerNorm does not support " + "the forward() task"); + break; + } + case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { + assert(false && "Operator AddBiasResidualLayerNorm does not support " + "the forward() task"); + break; + } + case OP_SIGMOID_SILU_MULTI: { + assert(false && "Operator SigmoidSiluMulti does not support " + "the forward() task"); + break; + } + case OP_RESIDUAL_RMS_NORM: { + assert(false && "Operator ResidualRMSNorm does not support " + "the forward() task"); + break; + } + default: { + fprintf(stderr, + "Fusion currently does not support type = %d\n", + fused->op_op_type[op]); + assert(false && "Fusion currently does not support type"); + } } ioff += fused->op_num_inputs[op]; woff += fused->op_num_weights[op]; diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index e47006cc9d..69f98d5e5a 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -40,10 +40,9 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, namespace Kernels { namespace Softmax { -template void forward_kernel_wrapper(SoftmaxMeta const *m, - DT const *input_ptr, - DT *output_ptr) { + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); cudaEvent_t t_start, t_end; @@ -52,7 +51,15 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } - Internal::forward_kernel(m, input_ptr, output_ptr, stream); + if (m->output_type[0] == DT_FLOAT) { + Internal::forward_kernel( + m, input.get_float_ptr(), output.get_float_ptr(), stream); + } else if (m->output_type[0] == DT_HALF) { + Internal::forward_kernel( + m, input.get_half_ptr(), output.get_half_ptr(), stream); + } else { + assert(false && "Unsupported data type"); + } if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -99,21 +106,78 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, } } -template void forward_kernel_wrapper(SoftmaxMeta const *m, - float const *input_ptr, - float *output_ptr); -template void forward_kernel_wrapper(SoftmaxMeta const *m, - half const *input_ptr, - half *output_ptr); - -template void backward_kernel_wrapper(SoftmaxMeta const *m, - float *input_grad_ptr, - float const *output_grad_ptr, - size_t num_elements); -template void backward_kernel_wrapper(SoftmaxMeta const *m, - half *input_grad_ptr, - half const *output_grad_ptr, - size_t num_elements); +void inference_kernel_wrapper(SoftmaxMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + 
GenericTensorAccessorW const &output) {
+  cudaStream_t stream;
+  checkCUDA(get_legion_stream(&stream));
+  cudaEvent_t t_start, t_end;
+  if (m->profiling) {
+    cudaEventCreate(&t_start);
+    cudaEventCreate(&t_end);
+    cudaEventRecord(t_start, stream);
+  }
+  int num_classes = output.domain.hi()[0] - output.domain.lo()[0] + 1;
+  int num_tokens = bc->num_active_tokens();
+  if (m->output_type[0] == DT_FLOAT) {
+    Internal::inference_kernel(m,
+                               input.get_float_ptr(),
+                               output.get_float_ptr(),
+                               num_tokens,
+                               num_classes,
+                               stream);
+  } else if (m->output_type[0] == DT_HALF) {
+    Internal::inference_kernel(m,
+                               input.get_half_ptr(),
+                               output.get_half_ptr(),
+                               num_tokens,
+                               num_classes,
+                               stream);
+  } else {
+    assert(false && "Unsupported data type");
+  }
+  if (m->profiling) {
+    cudaEventRecord(t_end, stream);
+    checkCUDA(cudaEventSynchronize(t_end));
+    // print_tensor(acc_input.ptr, acc_input.rect.volume(),
+    // "[Softmax:forward:input]"); print_tensor(acc_output.ptr,
+    // acc_output.rect.volume(), "[Softmax:forward:output]");
+    float elapsed = 0;
+    checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+    cudaEventDestroy(t_start);
+    cudaEventDestroy(t_end);
+    log_measure.debug(
+        "%s [Softmax] inference time = %.2fms\n", m->op_name, elapsed);
+  }
+}
+
+void peft_bwd_kernel_wrapper(SoftmaxMeta const *m,
+                             BatchConfig const *bc,
+                             GenericTensorAccessorW const &input_grad,
+                             GenericTensorAccessorR const &output_grad) {
+  cudaStream_t stream;
+  checkCUDA(get_legion_stream(&stream));
+  cudaEvent_t t_start, t_end;
+  if (m->profiling) {
+    cudaEventCreate(&t_start);
+    cudaEventCreate(&t_end);
+    cudaEventRecord(t_start, stream);
+  }
+  int num_classes =
+      output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1;
+  if (m->output_type[0] == DT_FLOAT) {
+    Internal::peft_bwd_kernel(m,
+                              bc,
+                              input_grad.get_float_ptr(),
+                              output_grad.get_float_ptr(),
+                              num_classes,
+                              stream);
+  } else if (m->output_type[0] == DT_HALF) {
+    Internal::peft_bwd_kernel(m,
+                              bc,
+                              input_grad.get_half_ptr(),
+                              output_grad.get_half_ptr(),
+                              num_classes,
+                              stream);
+  } else {
+    assert(false && "Unsupported data type");
+  }
+  if (m->profiling) {
+    cudaEventRecord(t_end, stream);
+    checkCUDA(cudaEventSynchronize(t_end));
+    float elapsed = 0;
+    checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
+    cudaEventDestroy(t_start);
+    cudaEventDestroy(t_end);
+    log_measure.debug(
+        "%s [Softmax] PEFT backward time = %.2fms\n", m->op_name, elapsed);
+  }
+}
+
 namespace Internal {
 template <typename DT>
 void forward_kernel(SoftmaxMeta const *m,
@@ -146,6 +210,92 @@ void backward_kernel(DT *input_grad_ptr,
                                  stream));
 }
 
+template <typename DT>
+void inference_kernel(SoftmaxMeta const *m,
+                      DT const *input_ptr,
+                      DT *output_ptr,
+                      int num_tokens,
+                      int num_classes,
+                      cudaStream_t stream) {
+  checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
+
+  float alpha = 1.0f, beta = 0.0f;
+  cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]);
+  checkCUDNN(cudnnSetTensor4dDescriptor(m->outputTensor,
+                                        CUDNN_TENSOR_NCHW,
+                                        cudnn_data_type,
+                                        num_tokens,
+                                        num_classes,
+                                        1,
+                                        1));
+  checkCUDNN(cudnnSoftmaxForward(m->handle.dnn,
+                                 CUDNN_SOFTMAX_ACCURATE,
+                                 CUDNN_SOFTMAX_MODE_CHANNEL,
+                                 &alpha,
+                                 m->outputTensor,
+                                 input_ptr,
+                                 &beta,
+                                 m->outputTensor,
+                                 output_ptr));
+}
+
+template <typename DT>
+__global__ void sparse_categorical_crossentropy_loss_peft_backward(
+    DT *input_grad,
+    DT const *output_grad,
+    BatchConfig::TokenId const *token_ids,
+    int num_tokens,
+    int num_classes) {
+  CUDA_KERNEL_LOOP(i, num_tokens * num_classes) {
+    input_grad[i] = output_grad[i];
+    if (i % num_classes == token_ids[i / num_classes]) {
+      input_grad[i] -= 1.0f;
+    }
+  }
+}
+
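+// The kernel above applies the usual softmax + sparse categorical
+// cross-entropy gradient: with softmax probabilities p and ground-truth token
+// id y, dL/dz_c = p_c - 1[c == y]. output_grad is therefore expected to hold
+// the softmax output; the backward pass copies it and subtracts 1 at the
+// ground-truth position. A host-side sketch of the same update, reusing the
+// kernel's parameter names:
+//
+//   for (int t = 0; t < num_tokens; t++) {
+//     for (int c = 0; c < num_classes; c++) {
+//       int i = t * num_classes + c;
+//       input_grad[i] = output_grad[i] - (c == token_ids[t] ? 1.0f : 0.0f);
+//     }
+//   }
+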
+template <typename DT>
+void peft_bwd_kernel(SoftmaxMeta const *m,
+                     BatchConfig const *bc,
+                     DT *input_grad_ptr,
+                     DT const *output_grad_ptr,
+                     int num_classes,
+                     cudaStream_t stream) {
+  BatchConfig::TokenId token_ids[BatchConfig::MAX_NUM_TOKENS];
+  int tokens_previous_requests = 0;
+  for (int i = 0; i < bc->max_requests_per_batch(); i++) {
+    if (bc->request_completed[i]) {
+      continue;
+    }
+    // Skip non-PEFT requests
+    if (!bc->requestsInfo[i].peft_bwd) {
+      tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch;
+      continue;
+    }
+    int num_bwd_tokens = bc->requestsInfo[i].num_tokens_in_batch;
+    for (int j = 0; j < num_bwd_tokens; j++) {
+      token_ids[j] = bc->tokensInfo[j + tokens_previous_requests].token_id;
+    }
+    checkCUDA(cudaMemcpyAsync(m->handle.workSpace,
+                              token_ids,
+                              sizeof(BatchConfig::TokenId) * num_bwd_tokens,
+                              cudaMemcpyHostToDevice,
+                              stream));
+    sparse_categorical_crossentropy_loss_peft_backward<DT><<<
+        GET_BLOCKS(num_bwd_tokens * num_classes),
+        CUDA_NUM_THREADS,
+        0,
+        stream>>>(
+        input_grad_ptr + tokens_previous_requests * num_classes,
+        output_grad_ptr + tokens_previous_requests * num_classes,
+        static_cast<BatchConfig::TokenId const *>(m->handle.workSpace),
+        num_bwd_tokens,
+        num_classes);
+
+    tokens_previous_requests += num_bwd_tokens;
+  }
+  assert(tokens_previous_requests == bc->num_active_tokens());
+}
+
 } // namespace Internal
 } // namespace Softmax
 } // namespace Kernels
diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc
index b5ee66fdba..b0d196a7c4 100644
--- a/src/ops/layer_norm.cc
+++ b/src/ops/layer_norm.cc
@@ -734,17 +734,7 @@ void LayerNorm::peft_bwd_task(Task const *task,
   } else {
     assert(regions.size() == 2);
   }
-  if (m->output_type[0] == DT_FLOAT) {
-    LayerNorm::peft_bwd_kernel_wrapper(m,
-                                       output_grad.get_float_ptr(),
-                                       input_grad.get_float_ptr(),
-                                       gamma.get_float_ptr());
-  } else {
-    LayerNorm::peft_bwd_kernel_wrapper(m,
-                                       output_grad.get_half_ptr(),
-                                       input_grad.get_half_ptr(),
-                                       gamma.get_half_ptr());
-  }
+  LayerNorm::peft_bwd_kernel_wrapper(m, output_grad, input_grad, gamma);
 }
 
 void LayerNorm::backward(FFModel const &ff) {
diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu
index cdf2ed433f..3d828362dd 100644
--- a/src/ops/layer_norm.cu
+++ b/src/ops/layer_norm.cu
@@ -694,15 +694,26 @@ void LayerNorm::peft_bwd_kernel(LayerNormMeta const *m,
 }
 
 /*static*/
-template <typename T>
-void LayerNorm::peft_bwd_kernel_wrapper(LayerNormMeta const *m,
-                                        T const *output_grad_ptr,
-                                        T *input_grad_ptr,
-                                        T const *gamma_ptr) {
-  cudaStream_t stream;
-  checkCUDA(get_legion_stream(&stream));
-  LayerNorm::peft_bwd_kernel(
-      m, output_grad_ptr, input_grad_ptr, gamma_ptr, stream);
+void LayerNorm::peft_bwd_kernel_wrapper(
+    LayerNormMeta const *m,
+    GenericTensorAccessorR const &output_grad,
+    GenericTensorAccessorW const &input_grad,
+    GenericTensorAccessorR const &gamma) {
+  cudaStream_t stream;
+  checkCUDA(get_legion_stream(&stream));
+  if (m->output_type[0] == DT_FLOAT) {
+    LayerNorm::peft_bwd_kernel(m,
+                               output_grad.get_float_ptr(),
+                               input_grad.get_float_ptr(),
+                               gamma.get_float_ptr(),
+                               stream);
+  } else {
+    assert(m->output_type[0] == DT_HALF);
+    LayerNorm::peft_bwd_kernel(m,
+                               output_grad.get_half_ptr(),
+                               input_grad.get_half_ptr(),
+                               gamma.get_half_ptr(),
+                               stream);
+  }
 }
 
 /*static*/
diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc
index ba0a1288d6..8d4a1f64b4 100644
--- a/src/ops/softmax.cc
+++ b/src/ops/softmax.cc
@@ -315,13 +315,7 @@ void Softmax::forward_task(Task const
*task, GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( m->output_type, regions[1], task->regions[1], FID_DATA, ctx, runtime); - if (m->output_type == DT_HALF) { - forward_kernel_wrapper(m, input.get_half_ptr(), output.get_half_ptr()); - } else if (m->output_type == DT_FLOAT) { - forward_kernel_wrapper(m, input.get_float_ptr(), output.get_float_ptr()); - } else { - assert(false && "Unsupported data type"); - } + forward_kernel_wrapper(m, input, output); } void Softmax::backward(FFModel const &ff) { @@ -359,52 +353,11 @@ void Softmax::backward_task(Task const *task, Domain in_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); SoftmaxMeta const *m = *((SoftmaxMeta **)task->local_args); - switch (in_domain.get_dim()) { -#define DIMFUNC(DIM) \ - case DIM: \ - if (m->output_type == DT_HALF) { \ - return backward_task_with_dim(task, regions, ctx, runtime); \ - } else if (m->output_type == DT_FLOAT) { \ - return backward_task_with_dim(task, regions, ctx, runtime); \ - } else { \ - assert(false && "Unsupported data type"); \ - } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - assert(false); - } -} - -/* - regions[0](I/O): input_grad - regions[1](I): output_grad -*/ -// Note that the backward task of softmax is actually a no op (i.e., input_grad -// = output_grad) since the upstream cross_entropy_loss function computes -// performs softmax_cross_entropy_loss to avoid intermediate zeros -template -void Softmax::backward_task_with_dim(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - // const Softmax* softmax = (Softmax*) task->args; - SoftmaxMeta const *m = *((SoftmaxMeta **)task->local_args); - TensorAccessorW acc_input_grad(regions[0], - task->regions[0], - FID_DATA, - ctx, - runtime, - true /*readOutput*/); - TensorAccessorR acc_output_grad( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - // make sure the image indices match! 
-  assert(acc_input_grad.rect == acc_output_grad.rect);
-
-  backward_kernel_wrapper(
-      m, acc_input_grad.ptr, acc_output_grad.ptr, acc_input_grad.rect.volume());
+  GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW(
+      m->output_type, regions[0], task->regions[0], FID_DATA, ctx, runtime);
+  GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO(
+      m->output_type, regions[1], task->regions[1], FID_DATA, ctx, runtime);
+  backward_kernel_wrapper(m, input_grad, output_grad);
 }
 
 void Softmax::inference_task(Task const *task,
@@ -425,13 +378,7 @@ void Softmax::inference_task(Task const *task,
       m->output_type, regions[0], task->regions[0], FID_DATA, ctx, runtime);
   GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(
       m->output_type, regions[1], task->regions[1], FID_DATA, ctx, runtime);
-  if (m->output_type == DT_HALF) {
-    forward_kernel_wrapper(m, input.get_half_ptr(), output.get_half_ptr());
-  } else if (m->output_type == DT_FLOAT) {
-    forward_kernel_wrapper(m, input.get_float_ptr(), output.get_float_ptr());
-  } else {
-    assert(false && "Unsupported data type");
-  }
+  inference_kernel_wrapper(m, input, output);
   if (m->inference_debugging) {
     assert(task->index_point.get_dim() == 1);
     int shard_id = task->index_point.point_data[0];
diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc
index 199b94c72c..5d81fa4664 100644
--- a/src/runtime/inference_manager.cc
+++ b/src/runtime/inference_manager.cc
@@ -369,6 +369,56 @@ FutureMap InferenceManager::inference(FFModel *model,
   return fm;
 };
 
+void InferenceManager::peft_bwd(FFModel *model,
+                                int index,
+                                BatchConfigFuture const &bc) {
+  int batch_index = index % model->config.data_parallelism_degree;
+  FutureMap fm;
+  bool found_input_operator = false;
+  int last_op = model->operators.size() - 1;
+  // Assert that the last operator must be argmax or sampling
+  assert(model->operators[last_op]->op_type == OP_ARGMAX ||
+         model->operators[last_op]->op_type == OP_SAMPLING);
+  last_op -= 1;
+  while (model->operators[last_op]->op_type == OP_WEIGHT && last_op > 0) {
+    last_op -= 1;
+  }
+  // Assert that the previous operator must be softmax
+  assert(model->operators[last_op]->op_type == OP_SOFTMAX ||
+         model->operators[last_op]->op_type == OP_FUSED);
+  if (model->operators[last_op]->op_type == OP_FUSED) {
+    FusedOp *fused_op = static_cast<FusedOp *>(model->operators[last_op]);
+    assert(fused_op->op_op_type[fused_op->numOperators - 1] == OP_SOFTMAX);
+  }
+  for (int o = last_op; o >= 0; o--) {
+    Op *op = model->operators[o];
+    if (op->op_type == OP_WEIGHT) {
+      continue;
+    }
+    std::vector<ParallelTensor> inputs(op->numInputs);
+    std::vector<ParallelTensor> outputs(op->numOutputs);
+    for (int i = 0; i < op->numInputs; i++) {
+      assert(op->inputs[i] != nullptr);
+      assert(op->inputs[i]->parallel_is != IndexSpace::NO_SPACE);
+      assert(tensor_buffer[op->inputs[i]].size() > batch_index);
+      inputs[i] = tensor_buffer[op->inputs[i]][batch_index];
+      assert(inputs[i]->parallel_is != IndexSpace::NO_SPACE);
+    }
+    for (int i = 0; i < op->numOutputs; i++) {
+      assert(op->outputs[i] != nullptr);
+      assert(op->outputs[i]->parallel_is != IndexSpace::NO_SPACE);
+      if (op->op_type == OP_INPUT &&
+          tensor_buffer[op->outputs[i]].size() == 0) {
+        continue;
+      }
+      assert(tensor_buffer[op->outputs[i]].size() > batch_index);
+      outputs[i] = tensor_buffer[op->outputs[i]][batch_index];
+      assert(outputs[i]->parallel_is != IndexSpace::NO_SPACE);
+    }
+    op->peft_bwd(*model, bc, inputs, outputs);
+  }
+};
+
 void InferenceManager::load_input_tokens_from_batch_config(
BatchConfigFuture const &bc, ParallelTensor const input) { Context ctx = ff_config.lg_ctx; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 2bc1f30d07..69a7f3786f 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -6385,31 +6385,47 @@ void register_flexflow_internal_tasks(Runtime *runtime, } } { - TaskVariantRegistrar registrar(FUSEDOP_FWD_TASK_ID, "FusedOp Forward"); + TaskVariantRegistrar registrar(FUSEDOP_INF_TASK_ID, "FusedOp Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( - registrar, "FusedOp Forward Task"); + Runtime::preregister_task_variant( + registrar, "FusedOp Inference Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant(registrar); + runtime->register_task_variant(registrar); } } { - TaskVariantRegistrar registrar(FUSEDOP_INF_TASK_ID, "FusedOp Inference"); + TaskVariantRegistrar registrar(FUSEDOP_PEFT_BWD_TASK_ID, + "FusedOp PEFT Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( - registrar, "FusedOp Inference Task"); + Runtime::preregister_task_variant( + registrar, "FusedOp PEFT Backward Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant(registrar); + runtime->register_task_variant(registrar); + } + } + + { + TaskVariantRegistrar registrar(FUSEDOP_FWD_TASK_ID, "FusedOp Forward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "FusedOp Forward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); } } { diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 16f7a44e07..024c8f11ce 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -1961,6 +1961,7 @@ GenerationResult BatchConfigFuture bcf = prepare_next_batch(next_batch.first, next_batch.second); FutureMap fm = im->inference(llm, 0, bcf); + im->peft_bwd(llm, 0, bcf); assert(fm.get_future_map_domain().get_volume() == 1); InferenceResultFuture irf = fm.get_future(0); batch_pipeline.push(std::make_pair(bcf, irf)); From 6bbb81e3f5aa0e4e01bea75c9090d40f890230b3 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Tue, 17 Oct 2023 11:23:56 -0400 Subject: [PATCH 021/198] variable renaming --- include/flexflow/batch_config.h | 2 +- include/flexflow/request_manager.h | 2 +- src/ops/inc_multihead_self_attention.cpp | 2 +- src/ops/inc_multihead_self_attention.cu | 2 +- src/ops/spec_inc_multihead_self_attention.cpp | 2 +- src/ops/spec_inc_multihead_self_attention.cu | 2 +- src/runtime/batch_config.cc | 6 +- src/runtime/beam_search_batch_config.cc | 4 +- src/runtime/request_manager.cc | 66 +++++++++++-------- src/runtime/tree_verify_batch_config.cc | 4 +- 10 files changed, 50 insertions(+), 42 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 108bc8d172..25bc206bf9 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -61,7 +61,7 @@ class BatchConfig { int num_tokens; struct PerRequestInfo { - int token_start_offset; + int first_token_depth_in_request; int num_tokens_in_batch; int max_sequence_length; RequestGuid 
request_guid; diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 3081aaa1c2..baf6844801 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -154,7 +154,7 @@ class RequestManager { std::vector> traverse_beam_tree(BeamSearchBatchConfig const &old_bc, int request_index, - int token_start_offset); + int first_token_depth_in_request); // remove guid after put the cached tree in request std::vector> merge_dfs_trees( diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index 562898a220..37cc986f5e 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -532,7 +532,7 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, continue; } int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int total_tokens = bc->requestsInfo[i].token_start_offset + + int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; // bc->token_last_available_idx[i] + 1; // Compute (QK^T/sqrt(d_k)) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 00d45a9cfa..6ec077c328 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -531,7 +531,7 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, continue; } int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int total_tokens = bc->requestsInfo[i].token_start_offset + + int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; // bc->token_last_available_idx[i] + 1; // Compute (QK^T/sqrt(d_k)) diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index 173d4a5b1d..1d81ae0c11 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -231,7 +231,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // int total_tokens = bc->token_last_available_idx[i] + 1; int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int total_tokens = bc->requestsInfo[i].token_start_offset + + int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; // Compute (QK^T/sqrt(d_k)) int m_ = num_new_tokens; diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 00eec96824..8b89acf3b7 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -248,7 +248,7 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // int total_tokens = bc->token_last_available_idx[i] + 1; int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int total_tokens = bc->requestsInfo[i].token_start_offset + + int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; if (num_new_tokens <= 0) { diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index 72572c4e06..4781f09cab 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -27,7 +27,7 @@ using Legion::Memory; BatchConfig::BatchConfig() : num_tokens(0) { for (int i = 0; i < MAX_NUM_REQUESTS; i++) { - requestsInfo[i].token_start_offset = 0; + requestsInfo[i].first_token_depth_in_request = 0; requestsInfo[i].num_tokens_in_batch = 0; 
request_completed[i] = true; } @@ -104,8 +104,8 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) { for (int i = 0; i < bc.max_requests_per_batch(); i++) { if (!bc.request_completed[i]) { os << " Request " << i << ":\n"; - os << " Token start offset: " << bc.requestsInfo[i].token_start_offset - << std::endl; + os << " Token start offset: " + << bc.requestsInfo[i].first_token_depth_in_request << std::endl; os << " Number of tokens in batch: " << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; diff --git a/src/runtime/beam_search_batch_config.cc b/src/runtime/beam_search_batch_config.cc index 811ef00ba2..f785dc5b74 100644 --- a/src/runtime/beam_search_batch_config.cc +++ b/src/runtime/beam_search_batch_config.cc @@ -126,8 +126,8 @@ std::ostream &operator<<(std::ostream &os, BeamSearchBatchConfig const &bc) { for (int i = 0; i < bc.max_requests_per_batch(); i++) { if (!bc.request_completed[i]) { os << " Request " << i << ":\n"; - os << " Token start offset: " << bc.requestsInfo[i].token_start_offset - << std::endl; + os << " Token start offset: " + << bc.requestsInfo[i].first_token_depth_in_request << std::endl; os << " Number of tokens in batch: " << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index b5688c07e6..1c5a6ae5da 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -367,7 +367,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, Request new_request = pending_request_queue.front(); pending_request_queue.pop(); // all_requests[new_request.guid] = new_request; - new_bc.requestsInfo[i].token_start_offset = 0; + new_bc.requestsInfo[i].first_token_depth_in_request = 0; new_bc.requestsInfo[i].request_guid = new_request.guid; new_bc.requestsInfo[i].num_tokens_in_batch = std::min(get_max_tokens_per_batch() - new_bc.num_tokens - @@ -382,7 +382,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, profile_info.start_time = Realm::Clock::current_time_in_microseconds(); profiling_requests[new_request.guid] = profile_info; for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { - int depth = new_bc.requestsInfo[i].token_start_offset + j; + int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; new_bc.tokensInfo[new_bc.num_tokens].request_index = i; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; assert(depth < new_request.tokens.size()); @@ -397,8 +397,9 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } else { assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; - int processed_tokens = old_bc.requestsInfo[i].token_start_offset + - old_bc.requestsInfo[i].num_tokens_in_batch; + int processed_tokens = + old_bc.requestsInfo[i].first_token_depth_in_request + + old_bc.requestsInfo[i].num_tokens_in_batch; assert(processed_tokens < request.tokens.size()); bool request_completed = false; // printf("model_type = %d\n", this->model_type); @@ -464,12 +465,12 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } else { new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].token_start_offset = processed_tokens; + new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens; 
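+      // first_token_depth_in_request (formerly token_start_offset) counts how
+      // many tokens of this request were already processed in earlier batches,
+      // i.e. the depth within the request at which this batch's first token
+      // starts.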
new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; - if (new_bc.requestsInfo[i].token_start_offset + 1 == + if (new_bc.requestsInfo[i].first_token_depth_in_request + 1 == request.tokens.size()) { // Incremental phase new_bc.requestsInfo[i].num_tokens_in_batch = 1; @@ -478,10 +479,10 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].num_tokens_in_batch = std::min(get_max_tokens_per_batch() - new_bc.num_tokens, (int)request.tokens.size() - - new_bc.requestsInfo[i].token_start_offset); + new_bc.requestsInfo[i].first_token_depth_in_request); } for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { - int depth = new_bc.requestsInfo[i].token_start_offset + j; + int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; new_bc.tokensInfo[new_bc.num_tokens].request_index = i; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; assert(depth < request.tokens.size()); @@ -685,7 +686,7 @@ BeamSearchBatchConfig new_bc.request_running[i] = true; // Normal Request Info - new_bc.requestsInfo[i].token_start_offset = + new_bc.requestsInfo[i].first_token_depth_in_request = verified_tokens.front().second; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; @@ -694,9 +695,10 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].num_tokens_in_batch = verified_tokens.size(); // TODO: Beam Request Info, missing from VerifyTreeBatchConfig - int new_max_depth = new_bc.requestsInfo[i].max_sequence_length - - new_bc.requestsInfo[i].token_start_offset - - verified_tokens.size(); + int new_max_depth = + new_bc.requestsInfo[i].max_sequence_length - + new_bc.requestsInfo[i].first_token_depth_in_request - + verified_tokens.size(); new_bc.beamRequestsInfo[i].current_depth = 1; new_bc.beamRequestsInfo[i].beam_size = BeamSearchBatchConfig::MAX_BEAM_WIDTH; @@ -742,7 +744,8 @@ BeamSearchBatchConfig assert(request.ssm_cache_size == request.initial_len); // Normal Request Info - new_bc.requestsInfo[i].token_start_offset = request.ssm_cache_size; + new_bc.requestsInfo[i].first_token_depth_in_request = + request.ssm_cache_size; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; @@ -776,7 +779,7 @@ BeamSearchBatchConfig Request new_request = pending_request_queue.front(); pending_request_queue.pop(); // all_requests[new_request.guid] = new_request; - new_bc.requestsInfo[i].token_start_offset = 0; + new_bc.requestsInfo[i].first_token_depth_in_request = 0; new_bc.requestsInfo[i].request_guid = new_request.guid; new_bc.requestsInfo[i].num_tokens_in_batch = std::min(get_max_tokens_per_batch() - new_bc.num_tokens, @@ -806,7 +809,7 @@ BeamSearchBatchConfig new_bc.sub_requests[i] = 1; for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { - int depth = new_bc.requestsInfo[i].token_start_offset + j; + int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; new_bc.tokensInfo[new_bc.num_tokens].request_index = i; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; assert(depth < new_request.tokens.size()); @@ -922,7 +925,7 @@ BeamSearchBatchConfig // zero when beam search has reached required sequence length // assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; - int processed_tokens = 
old_bc.requestsInfo[i].token_start_offset + + int processed_tokens = old_bc.requestsInfo[i].first_token_depth_in_request + old_bc.requestsInfo[i].num_tokens_in_batch; // assert(processed_tokens < request.tokens.size()); @@ -937,7 +940,8 @@ BeamSearchBatchConfig // // old_bc.beamRequestsInfo[i].max_depth); // // // new_bc.request_completed[i] = true; // // new_bc.request_completed[i] = false; - // // new_bc.requestsInfo[i].token_start_offset = processed_tokens; + // // new_bc.requestsInfo[i].first_token_depth_in_request = + // processed_tokens; // // new_bc.requestsInfo[i].request_guid = // // old_bc.requestsInfo[i].request_guid; // // new_bc.requestsInfo[i].max_sequence_length = @@ -953,7 +957,7 @@ BeamSearchBatchConfig log_req_mgr.debug() << "num tokens: " << old_bc.num_tokens << ", " << new_bc.num_tokens; new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].token_start_offset = processed_tokens; + new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; @@ -986,7 +990,8 @@ BeamSearchBatchConfig // do the slot exchange to minimize the cache exchange in kernel. // update_beam_metadata(new_bc, request.beam_trees.at(old_bc.model_id), // i); - if (new_bc.requestsInfo[i].token_start_offset >= request.tokens.size()) { + if (new_bc.requestsInfo[i].first_token_depth_in_request >= + request.tokens.size()) { // Incremental phase if (request.status == Request::RUNNING) { new_bc.requestsInfo[i].num_tokens_in_batch = 1; @@ -1006,7 +1011,7 @@ BeamSearchBatchConfig std::min(get_max_tokens_per_batch() - new_bc.num_tokens - BatchConfig::max_requests_per_batch() + i, (int)request.tokens.size() - - new_bc.requestsInfo[i].token_start_offset); + new_bc.requestsInfo[i].first_token_depth_in_request); request.ssm_cache_size += new_bc.requestsInfo[i].num_tokens_in_batch; if (verbose) { std::cout << "[ Beam Spec] " << request.guid << std::endl; @@ -1027,7 +1032,7 @@ BeamSearchBatchConfig // register more tokens due to the beam width for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { - int depth = new_bc.requestsInfo[i].token_start_offset + j; + int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; for (int k = 0; k < new_bc.sub_requests[i]; k++) { new_bc.tokensInfo[new_bc.num_tokens].request_index = i; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; @@ -1151,7 +1156,7 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } // Normal Request Info - new_bc.requestsInfo[i].token_start_offset = + new_bc.requestsInfo[i].first_token_depth_in_request = dfs_tree_inputs.front().second; new_bc.requestsInfo[i].request_guid = old_batches.at(0).requestsInfo[i].request_guid; @@ -1204,7 +1209,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( break; } - new_bc.requestsInfo[i].token_start_offset = request.tokens.size() - 1; + new_bc.requestsInfo[i].first_token_depth_in_request = + request.tokens.size() - 1; // Add Tokens from the DFS Tree to the next batch for (int j = 1; j < dfs_tree_inputs.size(); j++) { @@ -1257,7 +1263,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( } // Normal Request Info - new_bc.requestsInfo[i].token_start_offset = request.llm_cache_size; + new_bc.requestsInfo[i].first_token_depth_in_request = + request.llm_cache_size; new_bc.requestsInfo[i].request_guid = old_batches.at(0).requestsInfo[i].request_guid; 
new_bc.requestsInfo[i].max_sequence_length = @@ -1265,9 +1272,10 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].num_tokens_in_batch = std::min( - max_prompt_load_size, - (int)request.initial_len - new_bc.requestsInfo[i].token_start_offset); + new_bc.requestsInfo[i].num_tokens_in_batch = + std::min(max_prompt_load_size, + (int)request.initial_len - + new_bc.requestsInfo[i].first_token_depth_in_request); max_prompt_load_size -= new_bc.requestsInfo[i].num_tokens_in_batch; std::cout << "max_prompt_load_size: " << max_prompt_load_size @@ -1673,7 +1681,7 @@ std::vector> std::vector> RequestManager::traverse_beam_tree(BeamSearchBatchConfig const &old_bc, int request_index, - int token_start_offset) { + int first_token_depth_in_request) { if (verbose) { std::cout << "[Traverse Beam Tree] request_index: " << request_index << "\n"; @@ -1709,7 +1717,7 @@ std::vector> << serializedTree.size() << "\n"; } for (int k = 0; k < serializedTree.size(); k++) { - serializedTree.at(k).second += token_start_offset; + serializedTree.at(k).second += first_token_depth_in_request; if (verbose) { std::cout << "token id: " << serializedTree.at(k).first << ", depth: " << serializedTree.at(k).second << "\n"; diff --git a/src/runtime/tree_verify_batch_config.cc b/src/runtime/tree_verify_batch_config.cc index cb68ecc5f1..6dbcaceaa4 100644 --- a/src/runtime/tree_verify_batch_config.cc +++ b/src/runtime/tree_verify_batch_config.cc @@ -47,8 +47,8 @@ std::ostream &operator<<(std::ostream &os, TreeVerifyBatchConfig const &bc) { for (int i = 0; i < bc.max_requests_per_batch(); i++) { if (!bc.request_completed[i]) { os << " Request " << i << ":\n"; - os << " Token start offset: " << bc.requestsInfo[i].token_start_offset - << std::endl; + os << " Token start offset: " + << bc.requestsInfo[i].first_token_depth_in_request << std::endl; os << " Number of tokens in batch: " << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; From 54084c430446a70c520d9240a8443ad905f22e72 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Tue, 17 Oct 2023 18:46:42 -0400 Subject: [PATCH 022/198] resolve conflict --- include/flexflow/model.h | 1 + src/ops/kernels/softmax.cu | 3 +- src/ops/layer_norm.cc | 41 +++++------ src/ops/layer_norm.cu | 45 +++++++----- src/ops/softmax.cc | 135 ++++++++++++++++++++++++++---------- src/runtime/batch_config.cc | 5 -- src/runtime/model.cc | 16 +++++ 7 files changed, 162 insertions(+), 84 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index faf969efb7..54a4cb1d37 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -132,6 +132,7 @@ enum TaskIDs { SOFTMAX_FWD_TASK_ID, SOFTMAX_BWD_TASK_ID, SOFTMAX_INF_TASK_ID, + SOFTMAX_PEFT_BWD_TASK_ID, CONCAT_INIT_TASK_ID, CONCAT_FWD_TASK_ID, CONCAT_BWD_TASK_ID, diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index e31d508c95..f43bdfccbc 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -230,7 +230,8 @@ void forward_kernel(SoftmaxMeta const *m, } template -void backward_kernel(DT *input_grad_ptr, +void backward_kernel(SoftmaxMeta const *m, + DT *input_grad_ptr, DT const *output_grad_ptr, size_t num_elements, cudaStream_t stream) { diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index b0d196a7c4..e9f8feae2b 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -866,23 +866,8 @@ void LayerNorm::backward_task(Task const 
*task, } else { assert(regions.size() == 3); } - if (m->output_type[0] == DT_FLOAT) { - LayerNorm::backward_kernel_wrapper(m, - output_grad.get_float_ptr(), - input.get_float_ptr(), - input_grad.get_float_ptr(), - gamma.get_float_ptr(), - gamma_grad.get_float_ptr(), - beta_grad.get_float_ptr()); - } else { - LayerNorm::backward_kernel_wrapper(m, - output_grad.get_half_ptr(), - input.get_half_ptr(), - input_grad.get_half_ptr(), - gamma.get_half_ptr(), - gamma_grad.get_half_ptr(), - beta_grad.get_half_ptr()); - } + LayerNorm::backward_kernel_wrapper( + m, output_grad, input, input_grad, gamma, gamma_grad, beta_grad); } bool LayerNorm::measure_operator_cost(Simulator *sim, @@ -933,16 +918,24 @@ bool LayerNorm::measure_operator_cost(Simulator *sim, if (sim->computationMode == COMP_MODE_TRAINING) { float *in_grad_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); + GenericTensorAccessorW in_grad_acc( + inputs[0]->data_type, input_domain, in_grad_ptr); assert(in_grad_ptr != NULL); cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); float *out_grad_ptr = NULL; out_grad_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); + GenericTensorAccessorR out_grad_acc( + outputs[0]->data_type, output_domain, out_grad_ptr); assert(out_grad_ptr != NULL); cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); float *gamma_grad_ptr = NULL, *beta_grad_ptr = NULL; + GenericTensorAccessorW gamma_grad_acc( + outputs[0]->data_type, output_domain, gamma_grad_ptr); + GenericTensorAccessorW beta_grad_acc( + outputs[0]->data_type, output_domain, beta_grad_ptr); out_of_memory = (in_grad_ptr == NULL) || (out_grad_ptr == NULL) || (((gamma_grad_ptr == NULL) || (beta_grad_ptr == NULL)) && @@ -954,13 +947,13 @@ bool LayerNorm::measure_operator_cost(Simulator *sim, } backward = [=] { - backward_kernel_wrapper(m, - out_grad_ptr, - in_ptr, - in_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr); + backward_kernel_wrapper(m, + out_grad_acc, + input1_acc, + in_grad_acc, + gamma_acc, + gamma_grad_acc, + beta_grad_acc); }; } diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index a59fa39b78..e242904775 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -718,24 +718,37 @@ void LayerNorm::peft_bwd_kernel_wrapper( } /*static*/ -template -void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, - T const *output_grad_ptr, - T const *input_ptr, - T *input_grad_ptr, - T const *gamma_ptr, - T *gamma_grad_ptr, - T *beta_grad_ptr) { +void LayerNorm::backward_kernel_wrapper( + LayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - LayerNorm::backward_kernel(m, - output_grad_ptr, - input_ptr, - input_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr, - stream); + if (m->output_type[0] == DT_FLOAT) { + LayerNorm::backward_kernel(m, + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), + gamma.get_float_ptr(), + gamma_grad.get_float_ptr(), + beta_grad.get_float_ptr(), + stream); + } else if (m->output_type[0] == DT_HALF) { + LayerNorm::backward_kernel(m, + output_grad.get_half_ptr(), + input.get_half_ptr(), + input_grad.get_half_ptr(), + gamma.get_half_ptr(), + gamma_grad.get_half_ptr(), + beta_grad.get_half_ptr(), 
+ stream); + } else { + assert(false && "Unsupported data type"); + } } } // namespace FlexFlow diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 9e0f68c906..d0e38c8017 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -233,44 +233,6 @@ OpMeta *Softmax::init_task(Task const *task, return m; } -FutureMap Softmax::inference(FFModel const &ff, - BatchConfigFuture const &bc, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - parallel_is = batch_outputs[0]->parallel_is; - MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; - set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); - size_t machine_view_hash = view->hash(); - /* std::cout << "Softmax op machine_view: " << *(MachineView const *)mv - << std::endl; */ - IndexLauncher launcher(SOFTMAX_INF_TASK_ID, - parallel_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); - launcher.add_future(bc); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(1, FID_DATA); - return runtime->execute_index_space(ctx, launcher); -} - void Softmax::forward(FFModel const &ff) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; @@ -358,6 +320,44 @@ void Softmax::backward_task(Task const *task, backward_kernel_wrapper(m, input_grad, output_grad); } +FutureMap Softmax::inference(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + /* std::cout << "Softmax op machine_view: " << *(MachineView const *)mv + << std::endl; */ + IndexLauncher launcher(SOFTMAX_INF_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + void Softmax::inference_task(Task const *task, std::vector const ®ions, Context ctx, @@ -385,6 +385,65 @@ void Softmax::inference_task(Task const *task, } } +FutureMap Softmax::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + /* std::cout << "Softmax op machine_view: " << *(MachineView const *)mv + << std::endl; */ + IndexLauncher launcher(SOFTMAX_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +void Softmax::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == regions.size()); + assert(regions.size() == 2); + assert(task->regions.size() == 2); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_tokens == 0) { + return; + } + Domain in_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + SoftmaxMeta *m = *((SoftmaxMeta **)task->local_args); + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + peft_bwd_kernel_wrapper(m, bc, input_grad, output_grad); +} + bool Softmax::get_int_parameter(PMParameter para, int *value) const { switch (para) { case PM_SOFTMAX_DIM: diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index 425b8eeda3..1a6e32e582 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -116,15 +116,10 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) { for (int i = 0; i < bc.max_requests_per_batch(); i++) { if (!bc.request_completed[i]) { os << " Request " << i << ":\n"; -<<<<<<< HEAD os << " First token depth in request: " << bc.requestsInfo[i].first_token_depth_in_request << std::endl; os << " First token offset in batch: " << bc.requestsInfo[i].first_token_offset_in_batch << std::endl; -======= - os << " Token start offset: " - << bc.requestsInfo[i].first_token_depth_in_request << std::endl; ->>>>>>> 4c06a0907ec694b21a989a51120e846d0f0cfa74 os << " Number of tokens in batch: " << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 69a7f3786f..e94606718a 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -5627,6 +5627,22 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(SOFTMAX_PEFT_BWD_TASK_ID, + "Softmax PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Softmax PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + // compute Loss { TaskVariantRegistrar 
registrar(LOSS_BWD_TASK_ID, "Loss Backward"); From a44e33dde3a310ffb493fc603927cb40d1dbbc29 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Tue, 17 Oct 2023 23:00:04 -0400 Subject: [PATCH 023/198] add missing functions --- include/flexflow/model.h | 4 +- .../ops/inc_multihead_self_attention.h | 9 + include/flexflow/parallel_ops/allreduce.h | 19 +- src/ops/fused.cu | 6 - src/ops/inc_multihead_self_attention.cc | 124 +++++++++ src/ops/inc_multihead_self_attention.cu | 76 ++++++ src/ops/kernels/softmax.cu | 2 +- src/parallel_ops/allreduce.cc | 250 +++++++++++------- src/parallel_ops/kernels/allreduce_kernels.cu | 51 +++- src/runtime/inference_manager.cc | 3 + src/runtime/model.cc | 60 ++++- 11 files changed, 470 insertions(+), 134 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 54a4cb1d37..ac24e90900 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -177,6 +177,7 @@ enum TaskIDs { INC_MULTIHEAD_SELF_ATTENTION_FWD_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_BWD_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, + INC_MULTIHEAD_SELF_ATTENTION_PEFT_BWD_TASK_ID, SPEC_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, SPEC_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, TREE_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, @@ -241,9 +242,10 @@ enum TaskIDs { PIPELINE_FWD_TASK_ID, PIPELINE_BWD_TASK_ID, ALLREDUCE_INIT_TASK_ID, - ALLREDUCE_INF_TASK_ID, ALLREDUCE_FWD_TASK_ID, ALLREDUCE_BWD_TASK_ID, + ALLREDUCE_INF_TASK_ID, + ALLREDUCE_PEFT_BWD_TASK_ID, FUSED_PARALLELOP_INIT_TASK_ID, FUSED_PARALLELOP_FWD_TASK_ID, FUSED_PARALLELOP_BWD_TASK_ID, diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index c220091174..76569de4cb 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -96,6 +96,11 @@ class IncMultiHeadSelfAttention : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -109,6 +114,10 @@ class IncMultiHeadSelfAttention : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const override; diff --git a/include/flexflow/parallel_ops/allreduce.h b/include/flexflow/parallel_ops/allreduce.h index 045f9b36a0..7e0e4362e2 100644 --- a/include/flexflow/parallel_ops/allreduce.h +++ b/include/flexflow/parallel_ops/allreduce.h @@ -34,12 +34,17 @@ class AllReduce : public ParallelOp { std::vector const &, MachineView const *mv = nullptr) override; void forward(FFModel const &) override; + void backward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, BatchConfigFuture const &bc, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; - void backward(FFModel const &) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &bc, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; bool get_int_parameter(PMParameter, int *) const override; bool append_parallel_op_info( std::vector ¶llel_ops) const override; 
@@ -47,10 +52,6 @@ class AllReduce : public ParallelOp { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); - static void inference_task(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); static void forward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, @@ -59,6 +60,14 @@ class AllReduce : public ParallelOp { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 88eefc7e82..692316c6d4 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -1187,13 +1187,7 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, assert(false && "Fusion currently does not support type"); } } - ioff += fused->op_num_inputs[op]; - woff += fused->op_num_weights[op]; - ooff += fused->op_num_outputs[op]; } - // for (int i = 0; i < fused->numOutputs; i++) - // print_tensor(output_ptr[i], output_domain[i].get_volume(), - // "[Fused:forward:output]"); } /* diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index c8e7ba72f4..5cf4dbdf7c 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -876,6 +876,130 @@ void IncMultiHeadSelfAttention::inference_task( } } +FutureMap IncMultiHeadSelfAttention::peft_bwd( + FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + int idx = 0; + IndexLauncher launcher(INC_MULTIHEAD_SELF_ATTENTION_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(idx++, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region, + ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); + launcher.add_field(idx++, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(idx++, FID_DATA); + if (qkv_bias || final_bias) { + launcher.add_region_requirement( + RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region, + ff.cpu_offload ? 
MAP_TO_ZC_MEMORY : 0)); + launcher.add_field(idx++, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): input + regions[3](I): weight + regions[4](O): output +*/ +void IncMultiHeadSelfAttention::peft_bwd_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == regions.size()); + + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + log_inc_mha.debug("BatchConfig, num_tokens: %d, num_requests: %d", + bc->num_tokens, + bc->num_active_requests()); + if (bc->num_tokens == 0) { + return; + } + + IncMultiHeadSelfAttentionMeta *m = + *((IncMultiHeadSelfAttentionMeta **)task->local_args); + + assert(((*m->qkv_bias || *m->final_bias) ? regions.size() == 4 + : regions.size() == 3)); + + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + biases = helperGetGenericTensorAccessorRO(m->weight_type[1], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); + Domain bias_domain = runtime->get_index_space_domain( + ctx, task->regions[3].region.get_index_space()); + assert(bias_domain.get_dim() == 4); + } + + Domain input_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + Domain weight_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + Domain output_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[2].region.get_index_space()); + + assert(input_grad_domain.get_dim() == 4); + assert(weight_domain.get_dim() == 2); + assert(output_grad_domain.get_dim() == 4); + + assert(task->index_point.get_dim() == 1); + + IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( + m, + bc, + task->index_point.point_data[0], + input_grad, + weight, + output_grad, + biases); +} + void IncMultiHeadSelfAttention::backward(FFModel const &ff) { // IncMultiHeadSelfAttention does not support backward assert(false); diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 6cc0796c85..d92862ba30 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -446,6 +446,18 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); } +template +void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT *input_grad_ptr, + DT const *weight_ptr, + DT const *output_grad_ptr, + DT const *bias_ptr, + cudaStream_t stream) { + assert(false); +} + } // namespace IncMultiHeadAttention } // namespace Kernels @@ -842,6 +854,70 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( } } +/*static*/ +void IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &bias) { + cudaStream_t stream; + 
checkCUDA(get_legion_stream(&stream)); + bool use_bias = *m->qkv_bias || *m->final_bias; + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + // assert(input.data_type == weight.data_type); + assert(input_grad.data_type == output_grad.data_type); + if (use_bias) { + assert(input_grad.data_type == bias.data_type); + } + + if (input_grad.data_type == DT_HALF) { + assert(!m->offload); + half const *bias_ptr = + use_bias ? bias.get_half_ptr() : static_cast(nullptr); + Kernels::IncMultiHeadAttention::peft_bwd_kernel(m, + bc, + shard_id, + input_grad.get_half_ptr(), + weight.get_half_ptr(), + output_grad.get_half_ptr(), + bias_ptr, + stream); + } else if (input_grad.data_type == DT_FLOAT) { + assert(m->offload); + float const *bias_ptr = + use_bias ? bias.get_float_ptr() : static_cast(nullptr); + Kernels::IncMultiHeadAttention::peft_bwd_kernel(m, + bc, + shard_id, + input_grad.get_float_ptr(), + weight.get_float_ptr(), + output_grad.get_float_ptr(), + bias_ptr, + stream); + } else { + assert(false && "Unspported data type"); + } + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("IncMultiHeadSelfAttention PEFT backward time = %.9fms\n", elapsed); + } +} + IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( FFHandler handler, IncMultiHeadSelfAttention const *attn, diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index f43bdfccbc..9ccce40c58 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -24,7 +24,7 @@ using Legion::Domain; SoftmaxMeta::SoftmaxMeta(FFHandler handler, Softmax const *softmax, Domain const &input_domain) - : OpMeta(handler) { + : OpMeta(handler, softmax) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( inputTensor, input_domain, softmax->data_type)); diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc index 027d15c929..62e152b36c 100644 --- a/src/parallel_ops/allreduce.cc +++ b/src/parallel_ops/allreduce.cc @@ -143,6 +143,102 @@ void AllReduce::init(FFModel const &ff) { set_opmeta_from_futuremap(ff, fm); } +void AllReduce::forward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = outputs[0]->parallel_is; + assert(numOutputs == 1); + assert(numInputs == 1); + set_argumentmap_for_forward(ff, argmap); + IndexLauncher launcher(ALLREDUCE_FWD_TASK_ID, + outputs[0]->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +/*static*/ +void AllReduce::forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + + AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); + + 
GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input.data_type == output.data_type); + forward_kernel_wrapper(m, input, output); +} + +void AllReduce::backward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + IndexLauncher launcher(ALLREDUCE_BWD_TASK_ID, + inputs[0]->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + inputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +void AllReduce::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); + + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input_grad.data_type == output_grad.data_type); + backward_kernel_wrapper(m, input_grad, output_grad); +} + void AllReduce::init_inference(FFModel const &ff, std::vector const &batch_inputs, std::vector const &batch_outputs, @@ -221,64 +317,84 @@ FutureMap AllReduce::inference(FFModel const &ff, return runtime->execute_index_space(ctx, launcher); } -void AllReduce::forward(FFModel const &ff) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - parallel_is = outputs[0]->parallel_is; - assert(numOutputs == 1); - assert(numInputs == 1); - set_argumentmap_for_forward(ff, argmap); - IndexLauncher launcher(ALLREDUCE_FWD_TASK_ID, - outputs[0]->parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); +/*static*/ +void AllReduce::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + + AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + 
m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input.data_type == output.data_type); + inference_kernel_wrapper(m, bc, input, output); } -void AllReduce::backward(FFModel const &ff) { +FutureMap AllReduce::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; assert(numOutputs == 1); assert(numInputs == 1); - IndexLauncher launcher(ALLREDUCE_BWD_TASK_ID, - inputs[0]->parallel_is, - TaskArgument(NULL, 0), + assert(batch_inputs[0]->data_type == batch_outputs[0]->data_type); + DataType data_type = batch_inputs[0]->data_type; + size_t machine_view_hash = + mv ? mv->hash() : batch_outputs[0]->machine_view.hash(); + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher(ALLREDUCE_PEFT_BWD_TASK_ID, + batch_outputs[0]->parallel_is, + TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - inputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_WRITE, EXCLUSIVE, - inputs[0]->region_grad)); + batch_inputs[0]->region)); launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - outputs[0]->region_grad)); + batch_outputs[0]->region)); launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); + return runtime->execute_index_space(ctx, launcher); +} + +/*static*/ +void AllReduce::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + + AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input_grad.data_type == output_grad.data_type); + peft_bwd_kernel_wrapper(m, bc, input_grad, output_grad); } bool AllReduce::measure_operator_cost(Simulator *sim, @@ -315,62 +431,6 @@ bool AllReduce::append_parallel_op_info( return true; } -/*static*/ -void AllReduce::inference_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - - AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); - BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - - assert(input.data_type == output.data_type); - inference_kernel_wrapper(m, bc, input, output); -} - -/*static*/ -void AllReduce::forward_task(Task const 
*task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - - AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); - - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - - assert(input.data_type == output.data_type); - forward_kernel_wrapper(m, input, output); -} - -void AllReduce::backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); - - GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( - m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - - assert(input_grad.data_type == output_grad.data_type); - backward_kernel_wrapper(m, input_grad, output_grad); -} - }; // namespace FlexFlow namespace std { diff --git a/src/parallel_ops/kernels/allreduce_kernels.cu b/src/parallel_ops/kernels/allreduce_kernels.cu index 2c000137a1..5861f05d7a 100644 --- a/src/parallel_ops/kernels/allreduce_kernels.cu +++ b/src/parallel_ops/kernels/allreduce_kernels.cu @@ -24,21 +24,18 @@ AllReduceMeta::AllReduceMeta(FFHandler handle, AllReduce const *reduct) namespace Kernels { namespace AllReduce { -void inference_kernel_wrapper(AllReduceMeta const *m, - BatchConfig const *bc, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { +void forward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(input.data_type == output.data_type); assert(input.domain == output.domain); - size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1; - size_t num_elements = bc->num_tokens * hidden_dim_size; #ifdef FF_USE_NCCL ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); checkNCCL(ncclAllReduce(input.ptr, output.ptr, - num_elements, + input.domain.get_volume(), nccl_data_type, ncclSum, m->handle.ncclComm, @@ -48,18 +45,27 @@ void inference_kernel_wrapper(AllReduceMeta const *m, #endif } -void forward_kernel_wrapper(AllReduceMeta const *m, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { +void backward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + assert(false && "To be implemented"); +} + +void inference_kernel_wrapper(AllReduceMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(input.data_type == output.data_type); assert(input.domain == output.domain); + size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1; + size_t num_elements = bc->num_active_tokens() * hidden_dim_size; #ifdef FF_USE_NCCL ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); checkNCCL(ncclAllReduce(input.ptr, output.ptr, - input.domain.get_volume(), + num_elements, nccl_data_type, ncclSum, 
m->handle.ncclComm, @@ -69,10 +75,29 @@ void forward_kernel_wrapper(AllReduceMeta const *m, #endif } -void backward_kernel_wrapper(AllReduceMeta const *m, +void peft_bwd_kernel_wrapper(AllReduceMeta const *m, + BatchConfig const *bc, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output_grad) { - assert(false && "To be implemented"); + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input_grad.data_type == output_grad.data_type); + assert(input_grad.domain == output_grad.domain); + size_t hidden_dim_size = + input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + size_t num_elements = bc->num_active_tokens() * hidden_dim_size; +#ifdef FF_USE_NCCL + ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input_grad.data_type); + checkNCCL(ncclAllReduce(output_grad.ptr, + input_grad.ptr, + num_elements, + nccl_data_type, + ncclSum, + m->handle.ncclComm, + stream)); +#else + assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); +#endif } } // namespace AllReduce diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 5d81fa4664..0f71291ded 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -395,6 +395,9 @@ void InferenceManager::peft_bwd(FFModel *model, if (op->op_type == OP_WEIGHT) { continue; } + if (op->op_type == OP_INPUT) { + continue; + } std::vector inputs(op->numInputs); std::vector outputs(op->numOutputs); for (int i = 0; i < op->numInputs; i++) { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index e94606718a..04a847b023 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -6229,6 +6229,24 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar( + INC_MULTIHEAD_SELF_ATTENTION_PEFT_BWD_TASK_ID, + "IncMultiHeadSelfAttention PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant< + IncMultiHeadSelfAttention::peft_bwd_task>( + registrar, "IncMultiHeadSelfAttention PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // speculative MultiHeadAttention task { TaskVariantRegistrar registrar( @@ -6651,48 +6669,64 @@ void register_flexflow_internal_tasks(Runtime *runtime, } } { - TaskVariantRegistrar registrar(ALLREDUCE_INF_TASK_ID, - "AllReduce Inference"); + TaskVariantRegistrar registrar(ALLREDUCE_FWD_TASK_ID, "AllReduce Forward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( - registrar, "AllReduce Inference Task"); + Runtime::preregister_task_variant( + registrar, "AllReduce Forward Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant(registrar); + runtime->register_task_variant(registrar); } } { - TaskVariantRegistrar registrar(ALLREDUCE_FWD_TASK_ID, "AllReduce Forward"); + TaskVariantRegistrar registrar(ALLREDUCE_BWD_TASK_ID, "AllReduce Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( - registrar, "AllReduce Forward Task"); + Runtime::preregister_task_variant( + registrar, "AllReduce Backward Task"); } else { if (enable_control_replication) { registrar.global_registration = 
false; } - runtime->register_task_variant(registrar); + runtime->register_task_variant(registrar); } } { - TaskVariantRegistrar registrar(ALLREDUCE_BWD_TASK_ID, "AllReduce Backward"); + TaskVariantRegistrar registrar(ALLREDUCE_INF_TASK_ID, + "AllReduce Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( - registrar, "AllReduce Backward Task"); + Runtime::preregister_task_variant( + registrar, "AllReduce Inference Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant(registrar); + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(ALLREDUCE_PEFT_BWD_TASK_ID, + "AllReduce PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "AllReduce PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); } } + // FusedParallelOp { TaskVariantRegistrar registrar(FUSED_PARALLELOP_FWD_TASK_ID, From 4d55b4079dc3612e5f0206f6f0a4161f22230b3d Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 18 Oct 2023 00:50:40 -0400 Subject: [PATCH 024/198] remove OpMeta(FFhandler) constructor --- include/flexflow/op_meta.h | 2 +- include/flexflow/ops/aggregate.h | 4 +- include/flexflow/ops/aggregate_spec.h | 4 +- include/flexflow/ops/cache.h | 4 +- include/flexflow/ops/element_unary.h | 4 +- include/flexflow/ops/experts.h | 15 +---- include/flexflow/ops/groupby.h | 4 +- .../ops/kernels/batch_matmul_kernels.h | 4 +- include/flexflow/ops/kernels/cast_kernels.h | 4 +- include/flexflow/ops/kernels/concat_kernels.h | 4 +- .../flexflow/ops/kernels/conv_2d_kernels.h | 4 +- include/flexflow/ops/kernels/flat_kernels.h | 4 +- .../flexflow/ops/kernels/pool_2d_kernels.h | 4 +- .../flexflow/ops/kernels/reshape_kernels.h | 6 +- .../flexflow/ops/kernels/transpose_kernels.h | 4 +- include/flexflow/ops/topk.h | 4 +- include/flexflow/ops/transpose.h | 2 + .../parallel_ops/kernels/combine_kernels.h | 4 +- .../parallel_ops/kernels/partition_kernels.h | 4 +- include/flexflow/simulator.h | 56 +++++++++---------- src/ops/add_bias_residual_layer_norm.cu | 2 +- src/ops/aggregate.cc | 4 +- src/ops/aggregate.cpp | 9 +-- src/ops/aggregate.cu | 7 ++- src/ops/aggregate_spec.cc | 4 +- src/ops/aggregate_spec.cpp | 7 ++- src/ops/aggregate_spec.cu | 7 ++- src/ops/attention.cpp | 2 +- src/ops/attention.cu | 2 +- src/ops/batch_matmul.cc | 4 +- src/ops/batch_norm.cpp | 2 +- src/ops/batch_norm.cu | 2 +- src/ops/beam_topk.cpp | 2 +- src/ops/beam_topk.cu | 2 +- src/ops/cache.cc | 2 +- src/ops/cache.cpp | 2 +- src/ops/cache.cu | 2 +- src/ops/cast.cc | 2 +- src/ops/concat.cc | 4 +- src/ops/conv_2d.cc | 4 +- src/ops/element_unary.cc | 4 +- src/ops/element_unary.cpp | 3 +- src/ops/element_unary.cu | 3 +- src/ops/experts.cc | 13 +---- src/ops/experts.cpp | 28 +++------- src/ops/experts.cu | 27 +++------ src/ops/flat.cc | 3 +- src/ops/group_by.cc | 4 +- src/ops/group_by.cpp | 6 +- src/ops/group_by.cu | 6 +- src/ops/kernels/batch_matmul.cpp | 4 +- src/ops/kernels/batch_matmul.cu | 4 +- src/ops/kernels/cast_kernels.cpp | 3 +- src/ops/kernels/cast_kernels.cu | 3 +- src/ops/kernels/concat_kernels.cpp | 4 ++ src/ops/kernels/concat_kernels.cu | 4 ++ src/ops/kernels/conv_2d_kernels.cpp | 4 +- src/ops/kernels/conv_2d_kernels.cu | 4 +- 
src/ops/kernels/dropout_kernels.cpp | 2 +- src/ops/kernels/dropout_kernels.cu | 2 +- src/ops/kernels/flat_kernels.cpp | 4 ++ src/ops/kernels/flat_kernels.cu | 4 ++ src/ops/kernels/pool_2d_kernels.cpp | 4 +- src/ops/kernels/pool_2d_kernels.cu | 4 +- src/ops/kernels/reshape_kernels.cpp | 4 +- src/ops/kernels/reshape_kernels.cu | 4 +- src/ops/kernels/transpose_kernels.cpp | 4 ++ src/ops/kernels/transpose_kernels.cu | 4 ++ src/ops/layer_norm.cc | 3 +- src/ops/layer_norm.cpp | 2 +- src/ops/layer_norm.cu | 2 +- src/ops/linear.cc | 5 +- src/ops/mean.cc | 3 +- src/ops/noop.cc | 4 +- src/ops/pool_2d.cc | 4 +- src/ops/reduce.cpp | 2 +- src/ops/reduce.cu | 2 +- src/ops/reshape.cc | 2 +- src/ops/residual_layer_norm.cpp | 2 +- src/ops/residual_layer_norm.cu | 2 +- src/ops/sigmoid_silu_multi.cpp | 2 +- src/ops/sigmoid_silu_multi.cu | 2 +- src/ops/topk.cc | 4 +- src/ops/topk.cpp | 3 +- src/ops/topk.cu | 3 +- src/ops/transpose.cc | 4 +- src/parallel_ops/combine.cc | 2 +- .../kernels/allreduce_kernels.cpp | 2 +- src/parallel_ops/kernels/allreduce_kernels.cu | 2 +- src/parallel_ops/kernels/combine_kernels.cpp | 4 +- src/parallel_ops/kernels/combine_kernels.cu | 4 +- .../kernels/partition_kernels.cpp | 4 +- src/parallel_ops/kernels/partition_kernels.cu | 4 +- .../kernels/reduction_kernels.cpp | 2 +- src/parallel_ops/kernels/reduction_kernels.cu | 2 +- .../kernels/replicate_kernels.cpp | 2 +- src/parallel_ops/kernels/replicate_kernels.cu | 2 +- src/runtime/inference_manager.cc | 17 ++++++ src/runtime/model.cc | 9 ++- src/runtime/simulator.cpp | 22 ++++---- src/runtime/simulator.cu | 26 ++++----- 101 files changed, 305 insertions(+), 226 deletions(-) diff --git a/include/flexflow/op_meta.h b/include/flexflow/op_meta.h index dae3953490..dcf070c975 100644 --- a/include/flexflow/op_meta.h +++ b/include/flexflow/op_meta.h @@ -9,7 +9,7 @@ class Op; class OpMeta { public: - OpMeta(FFHandler _handle); + // OpMeta(FFHandler _handle); OpMeta(FFHandler _handle, Op const *op); public: diff --git a/include/flexflow/ops/aggregate.h b/include/flexflow/ops/aggregate.h index 3ba4f414d1..283e9a4290 100644 --- a/include/flexflow/ops/aggregate.h +++ b/include/flexflow/ops/aggregate.h @@ -11,9 +11,11 @@ namespace FlexFlow { #define AGGREGATE_MAX_BATCH_SIZE 64 #define AGGREGATE_MAX_N 128 +class Aggregate; + class AggregateMeta : public OpMeta { public: - AggregateMeta(FFHandler handle, int n); + AggregateMeta(FFHandler handle, Aggregate const *aggr); ~AggregateMeta(void); float **dev_exp_preds; float **dev_exp_grads; diff --git a/include/flexflow/ops/aggregate_spec.h b/include/flexflow/ops/aggregate_spec.h index 4302dd0733..a9f651b620 100644 --- a/include/flexflow/ops/aggregate_spec.h +++ b/include/flexflow/ops/aggregate_spec.h @@ -11,9 +11,11 @@ namespace FlexFlow { #define AGGREGATE_SPEC_MAX_BATCH_SIZE 32 #define AGGREGATE_SPEC_MAX_N 12 +class AggregateSpec; + class AggregateSpecMeta : public OpMeta { public: - AggregateSpecMeta(FFHandler handle, int n); + AggregateSpecMeta(FFHandler handle, AggregateSpec const *agg); ~AggregateSpecMeta(void); float **dev_region_ptrs; }; diff --git a/include/flexflow/ops/cache.h b/include/flexflow/ops/cache.h index 1fbb1fa059..4f0b94ee5c 100644 --- a/include/flexflow/ops/cache.h +++ b/include/flexflow/ops/cache.h @@ -5,9 +5,11 @@ namespace FlexFlow { +class Cache; + class CacheMeta : public OpMeta { public: - CacheMeta(FFHandler handle); + CacheMeta(FFHandler handle, Cache const *c); float cache_score; }; diff --git a/include/flexflow/ops/element_unary.h b/include/flexflow/ops/element_unary.h 
index ddef59549c..043b5d19a7 100644 --- a/include/flexflow/ops/element_unary.h +++ b/include/flexflow/ops/element_unary.h @@ -12,9 +12,11 @@ namespace FlexFlow { +class ElementUnary; + class ElementUnaryMeta : public OpMeta { public: - ElementUnaryMeta(FFHandler handle); + ElementUnaryMeta(FFHandler handle, ElementUnary const *unary); #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) cudnnTensorDescriptor_t inputTensor, outputTensor; cudnnActivationDescriptor_t actiDesc; diff --git a/include/flexflow/ops/experts.h b/include/flexflow/ops/experts.h index f132003d30..1ed4678a5b 100644 --- a/include/flexflow/ops/experts.h +++ b/include/flexflow/ops/experts.h @@ -6,20 +6,11 @@ namespace FlexFlow { +class Experts; + class ExpertsMeta : public OpMeta { public: - ExpertsMeta(FFHandler handler, - int _num_experts, - int _experts_start_idx, - int _data_dim, - int _out_dim, - int _experts_num_layers, - int _experts_internal_dim_size, - int _effective_batch_size, - int _num_chosen_experts, - float _alpha, - bool _use_bias, - ActiMode _activation); + ExpertsMeta(FFHandler handler, Experts const *e); ~ExpertsMeta(void); // Thrust helper arrays diff --git a/include/flexflow/ops/groupby.h b/include/flexflow/ops/groupby.h index ec6cdfb9ab..73025216cd 100644 --- a/include/flexflow/ops/groupby.h +++ b/include/flexflow/ops/groupby.h @@ -8,9 +8,11 @@ namespace FlexFlow { +class Group_by; + class GroupByMeta : public OpMeta { public: - GroupByMeta(FFHandler handle, int n, float _alpha); + GroupByMeta(FFHandler handle, Group_by const *gb); ~GroupByMeta(void); float alpha; float **dev_region_ptrs; diff --git a/include/flexflow/ops/kernels/batch_matmul_kernels.h b/include/flexflow/ops/kernels/batch_matmul_kernels.h index 4de774ee06..c3923c4d4b 100644 --- a/include/flexflow/ops/kernels/batch_matmul_kernels.h +++ b/include/flexflow/ops/kernels/batch_matmul_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class BatchMatmul; + class BatchMatmulMeta : public OpMeta { public: - BatchMatmulMeta(FFHandler handler); + BatchMatmulMeta(FFHandler handler, BatchMatmul const *bmm); int a_seq_length_dim, b_seq_length_dim; }; diff --git a/include/flexflow/ops/kernels/cast_kernels.h b/include/flexflow/ops/kernels/cast_kernels.h index 3001d913ca..d601601ea2 100644 --- a/include/flexflow/ops/kernels/cast_kernels.h +++ b/include/flexflow/ops/kernels/cast_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Cast; + class CastMeta : public OpMeta { public: - CastMeta(FFHandler handle); + CastMeta(FFHandler handle, Cast const *cast); DataType input_data_type, output_data_type; }; diff --git a/include/flexflow/ops/kernels/concat_kernels.h b/include/flexflow/ops/kernels/concat_kernels.h index 4da6aaf5e2..4562ae871a 100644 --- a/include/flexflow/ops/kernels/concat_kernels.h +++ b/include/flexflow/ops/kernels/concat_kernels.h @@ -8,9 +8,11 @@ namespace FlexFlow { +class Concat; + class ConcatMeta : public OpMeta { public: - ConcatMeta(FFHandler handle) : OpMeta(handle){}; + ConcatMeta(FFHandler handle, Concat const *cc); int legion_axis; }; diff --git a/include/flexflow/ops/kernels/conv_2d_kernels.h b/include/flexflow/ops/kernels/conv_2d_kernels.h index 7b2a0fe135..f83e4687d7 100644 --- a/include/flexflow/ops/kernels/conv_2d_kernels.h +++ b/include/flexflow/ops/kernels/conv_2d_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Conv2D; + class Conv2DMeta : public OpMeta { public: - Conv2DMeta(FFHandler handler); + Conv2DMeta(FFHandler handler, Conv2D const *conv); #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) 
cudnnTensorDescriptor_t inputTensor, biasTensor, outputTensor; cudnnFilterDescriptor_t filterDesc; diff --git a/include/flexflow/ops/kernels/flat_kernels.h b/include/flexflow/ops/kernels/flat_kernels.h index caf817512d..6aa5a13b42 100644 --- a/include/flexflow/ops/kernels/flat_kernels.h +++ b/include/flexflow/ops/kernels/flat_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Flat; + class FlatMeta : public OpMeta { public: - FlatMeta(FFHandler handle) : OpMeta(handle){}; + FlatMeta(FFHandler handle, Flat const *flat); }; namespace Kernels { diff --git a/include/flexflow/ops/kernels/pool_2d_kernels.h b/include/flexflow/ops/kernels/pool_2d_kernels.h index 7f73a8295d..c5a954763e 100644 --- a/include/flexflow/ops/kernels/pool_2d_kernels.h +++ b/include/flexflow/ops/kernels/pool_2d_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Pool2D; + class Pool2DMeta : public OpMeta { public: - Pool2DMeta(FFHandler handle); + Pool2DMeta(FFHandler handle, Pool2D const *pool); ffTensorDescriptor_t inputTensor, outputTensor; ffActivationDescriptor_t actiDesc; ffPoolingDescriptor_t poolDesc; diff --git a/include/flexflow/ops/kernels/reshape_kernels.h b/include/flexflow/ops/kernels/reshape_kernels.h index e6c8c4d569..5b6fa5be19 100644 --- a/include/flexflow/ops/kernels/reshape_kernels.h +++ b/include/flexflow/ops/kernels/reshape_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Reshape; + class ReshapeMeta : public OpMeta { public: - ReshapeMeta(FFHandler handler); + ReshapeMeta(FFHandler handler, Reshape const *reshape); DataType data_type; }; @@ -44,4 +46,4 @@ void backward_kernel(T *input_grad_ptr, } // namespace Kernels } // namespace FlexFlow -#endif // _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H \ No newline at end of file +#endif // _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H diff --git a/include/flexflow/ops/kernels/transpose_kernels.h b/include/flexflow/ops/kernels/transpose_kernels.h index 7ff6163b30..a2c8ff0483 100644 --- a/include/flexflow/ops/kernels/transpose_kernels.h +++ b/include/flexflow/ops/kernels/transpose_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Transpose; + class TransposeMeta : public OpMeta { public: - TransposeMeta(FFHandler handler) : OpMeta(handler){}; + TransposeMeta(FFHandler handler, Transpose const *transpose); int num_dim; int perm[MAX_TENSOR_DIM]; }; diff --git a/include/flexflow/ops/topk.h b/include/flexflow/ops/topk.h index 47144bf6d7..4b67692032 100644 --- a/include/flexflow/ops/topk.h +++ b/include/flexflow/ops/topk.h @@ -8,9 +8,11 @@ namespace FlexFlow { +class TopK; + class TopKMeta : public OpMeta { public: - TopKMeta(FFHandler handle); + TopKMeta(FFHandler handle, TopK const *topk); bool sorted; }; diff --git a/include/flexflow/ops/transpose.h b/include/flexflow/ops/transpose.h index 3e6fb575c0..bca0b83460 100644 --- a/include/flexflow/ops/transpose.h +++ b/include/flexflow/ops/transpose.h @@ -6,6 +6,8 @@ namespace FlexFlow { +class TransposeMeta; + class Transpose : public Op { public: using Params = TransposeParams; diff --git a/include/flexflow/parallel_ops/kernels/combine_kernels.h b/include/flexflow/parallel_ops/kernels/combine_kernels.h index 456013cd81..4b2227b178 100644 --- a/include/flexflow/parallel_ops/kernels/combine_kernels.h +++ b/include/flexflow/parallel_ops/kernels/combine_kernels.h @@ -8,9 +8,11 @@ namespace FlexFlow { +class Combine; + class CombineMeta : public OpMeta { public: - CombineMeta(FFHandler handle); + CombineMeta(FFHandler handle, Combine const *comb); DataType data_type; }; diff --git 
a/include/flexflow/parallel_ops/kernels/partition_kernels.h b/include/flexflow/parallel_ops/kernels/partition_kernels.h index 81b190603a..1e77090d11 100644 --- a/include/flexflow/parallel_ops/kernels/partition_kernels.h +++ b/include/flexflow/parallel_ops/kernels/partition_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Repartition; + class RepartitionMeta : public OpMeta { public: - RepartitionMeta(FFHandler handle); + RepartitionMeta(FFHandler handle, Repartition const *repart); DataType data_type; }; diff --git a/include/flexflow/simulator.h b/include/flexflow/simulator.h index e410f66325..6cda96aa8b 100644 --- a/include/flexflow/simulator.h +++ b/include/flexflow/simulator.h @@ -33,21 +33,21 @@ namespace FlexFlow { #define MOD(a, b) ((a) % (b)) < 0 ? ((a) % (b)) + (b) : ((a) % (b)) -class Conv2DMeta; -class LinearMeta; -class Pool2DMeta; -class ElementUnaryMeta; -class ElementBinaryMeta; -class LayerNormMeta; -// class EmbeddingMeta; -// class SoftmaxMeta; -class BatchMatmulMeta; -// class BatchNormMeta; -class ConcatMeta; -// class DropoutMeta; -class TransposeMeta; -class Op; -class FFModel; +// class Conv2DMeta; +// class LinearMeta; +// class Pool2DMeta; +// class ElementUnaryMeta; +// class ElementBinaryMeta; +// class LayerNormMeta; +// class EmbeddingMeta; +// class SoftmaxMeta; +// class BatchMatmulMeta; +// class BatchNormMeta; +// class ConcatMeta; +// class DropoutMeta; +// class TransposeMeta; +// class Op; +// class FFModel; /** * @brief Costs of an operator. @@ -751,19 +751,19 @@ class Simulator { strict_hash_to_operator_cost; public: - Conv2DMeta *conv2d_meta; - LinearMeta *linear_meta; - Pool2DMeta *pool2d_meta; - ElementUnaryMeta *ele_unary_meta; - LayerNormMeta *layernorm_meta; - // ElementBinaryMeta *ele_binary_meta; - // EmbeddingMeta *embedding_meta; - // SoftmaxMeta *softmax_meta; - BatchMatmulMeta *batch_matmul_meta; - // BatchNormMeta *batch_norm_meta; - ConcatMeta *concat_meta; - // DropoutMeta *dropout_meta; - TransposeMeta *transpose_meta; + // Conv2DMeta *conv2d_meta; + // LinearMeta *linear_meta; + // Pool2DMeta *pool2d_meta; + // ElementUnaryMeta *ele_unary_meta; + // LayerNormMeta *layernorm_meta; + // ElementBinaryMeta *ele_binary_meta; + // EmbeddingMeta *embedding_meta; + // SoftmaxMeta *softmax_meta; + // BatchMatmulMeta *batch_matmul_meta; + // BatchNormMeta *batch_norm_meta; + // ConcatMeta *concat_meta; + // DropoutMeta *dropout_meta; + // TransposeMeta *transpose_meta; int segment_size; int max_num_segments; // simulation could be slow if the number of segments // are too large diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu index ceb1a6514e..07f1f2af6b 100644 --- a/src/ops/add_bias_residual_layer_norm.cu +++ b/src/ops/add_bias_residual_layer_norm.cu @@ -27,7 +27,7 @@ AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta( FFHandler handle, AddBiasResidualLayerNorm const *ln, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ln) { elementwise_affine = ln->elementwise_affine; use_bias = ln->use_bias; effective_batch_size = ln->effective_batch_size; diff --git a/src/ops/aggregate.cc b/src/ops/aggregate.cc index 67810d3f5b..b021a50ee1 100644 --- a/src/ops/aggregate.cc +++ b/src/ops/aggregate.cc @@ -233,7 +233,7 @@ OpMeta *Aggregate::init_task(Task const *task, Runtime *runtime) { Aggregate *agg = (Aggregate *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - AggregateMeta *m = new AggregateMeta(handle, agg->n); + AggregateMeta *m = new 
AggregateMeta(handle, agg); m->profiling = agg->profiling; m->inference_debugging = agg->inference_debugging; std::strcpy(m->op_name, agg->name); @@ -592,7 +592,7 @@ bool Aggregate::measure_operator_cost(Simulator *sim, return false; } - AggregateMeta *m = new AggregateMeta(sim->handler, n); + AggregateMeta *m = new AggregateMeta(sim->handler, this); // allocate sim->free_all(); diff --git a/src/ops/aggregate.cpp b/src/ops/aggregate.cpp index d5ebdb0c22..5a508cfac4 100644 --- a/src/ops/aggregate.cpp +++ b/src/ops/aggregate.cpp @@ -281,13 +281,14 @@ void Aggregate::backward_kernel_wrapper(AggregateMeta const *m, out_dim); } -AggregateMeta::AggregateMeta(FFHandler handler, int n) : OpMeta(handler) { - checkCUDA(hipMalloc(&dev_exp_preds, n * sizeof(float *))); - checkCUDA(hipMalloc(&dev_exp_grads, n * sizeof(float *))); +AggregateMeta::AggregateMeta(FFHandler handler, Aggregate const *aggr) + : OpMeta(handler, aggr) { + checkCUDA(hipMalloc(&dev_exp_preds, aggr->n * sizeof(float *))); + checkCUDA(hipMalloc(&dev_exp_grads, aggr->n * sizeof(float *))); } AggregateMeta::~AggregateMeta(void) { checkCUDA(hipFree(&dev_exp_preds)); checkCUDA(hipFree(&dev_exp_grads)); } -}; // namespace FlexFlow \ No newline at end of file +}; // namespace FlexFlow diff --git a/src/ops/aggregate.cu b/src/ops/aggregate.cu index 38e141b252..9704302092 100644 --- a/src/ops/aggregate.cu +++ b/src/ops/aggregate.cu @@ -307,9 +307,10 @@ void Aggregate::backward_kernel_wrapper(AggregateMeta const *m, } } -AggregateMeta::AggregateMeta(FFHandler handler, int n) : OpMeta(handler) { - checkCUDA(cudaMalloc(&dev_exp_preds, n * sizeof(float *))); - checkCUDA(cudaMalloc(&dev_exp_grads, n * sizeof(float *))); +AggregateMeta::AggregateMeta(FFHandler handler, Aggregate const *aggr) + : OpMeta(handler, aggr) { + checkCUDA(cudaMalloc(&dev_exp_preds, aggr->n * sizeof(float *))); + checkCUDA(cudaMalloc(&dev_exp_grads, aggr->n * sizeof(float *))); } AggregateMeta::~AggregateMeta(void) { checkCUDA(cudaFree(&dev_exp_preds)); diff --git a/src/ops/aggregate_spec.cc b/src/ops/aggregate_spec.cc index 19b2edc14a..32bd56e215 100644 --- a/src/ops/aggregate_spec.cc +++ b/src/ops/aggregate_spec.cc @@ -207,7 +207,7 @@ OpMeta *AggregateSpec::init_task(Task const *task, Runtime *runtime) { AggregateSpec *agg = (AggregateSpec *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - AggregateSpecMeta *m = new AggregateSpecMeta(handle, agg->n); + AggregateSpecMeta *m = new AggregateSpecMeta(handle, agg); m->profiling = agg->profiling; m->inference_debugging = agg->inference_debugging; std::strcpy(m->op_name, agg->name); @@ -540,7 +540,7 @@ bool AggregateSpec::measure_operator_cost(Simulator *sim, return false; } - AggregateSpecMeta *m = new AggregateSpecMeta(sim->handler, n); + AggregateSpecMeta *m = new AggregateSpecMeta(sim->handler, this); // allocate sim->free_all(); diff --git a/src/ops/aggregate_spec.cpp b/src/ops/aggregate_spec.cpp index 314e20a59c..a676fa81c3 100644 --- a/src/ops/aggregate_spec.cpp +++ b/src/ops/aggregate_spec.cpp @@ -290,9 +290,10 @@ void AggregateSpec::backward_kernel_wrapper(AggregateSpecMeta const *m, out_dim); } -AggregateSpecMeta::AggregateSpecMeta(FFHandler handler, int n) - : OpMeta(handler) { - checkCUDA(hipMalloc(&dev_region_ptrs, n * sizeof(float *))); +AggregateSpecMeta::AggregateSpecMeta(FFHandler handler, + AggregateSpec const *aggr) + : OpMeta(handler, aggr) { + checkCUDA(hipMalloc(&dev_region_ptrs, aggr->n * sizeof(float *))); } AggregateSpecMeta::~AggregateSpecMeta(void) { 
checkCUDA(hipFree(&dev_region_ptrs)); diff --git a/src/ops/aggregate_spec.cu b/src/ops/aggregate_spec.cu index 8d50d45d21..ac5a372efc 100644 --- a/src/ops/aggregate_spec.cu +++ b/src/ops/aggregate_spec.cu @@ -287,9 +287,10 @@ void AggregateSpec::backward_kernel_wrapper(AggregateSpecMeta const *m, out_dim); } -AggregateSpecMeta::AggregateSpecMeta(FFHandler handler, int n) - : OpMeta(handler) { - checkCUDA(cudaMalloc(&dev_region_ptrs, n * sizeof(float *))); +AggregateSpecMeta::AggregateSpecMeta(FFHandler handler, + AggregateSpec const *aggr) + : OpMeta(handler, aggr) { + checkCUDA(cudaMalloc(&dev_region_ptrs, aggr->n * sizeof(float *))); } AggregateSpecMeta::~AggregateSpecMeta(void) { checkCUDA(cudaFree(&dev_region_ptrs)); diff --git a/src/ops/attention.cpp b/src/ops/attention.cpp index ee7f87a7fb..10655a4a1a 100644 --- a/src/ops/attention.cpp +++ b/src/ops/attention.cpp @@ -156,7 +156,7 @@ MultiHeadAttentionMeta::MultiHeadAttentionMeta(FFHandler handler, Memory gpu_mem, int num_samples, int num_heads) - : OpMeta(handler) { + : OpMeta(handler, attn) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(miopenSetStream(handler.dnn, stream)); diff --git a/src/ops/attention.cu b/src/ops/attention.cu index 9b8b90da70..59834b1300 100644 --- a/src/ops/attention.cu +++ b/src/ops/attention.cu @@ -194,7 +194,7 @@ MultiHeadAttentionMeta::MultiHeadAttentionMeta(FFHandler handler, Memory gpu_mem, int num_samples, int num_heads) - : OpMeta(handler) { + : OpMeta(handler, attn) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(cudnnSetStream(handler.dnn, stream)); diff --git a/src/ops/batch_matmul.cc b/src/ops/batch_matmul.cc index f4b06877e5..77b7be2ba8 100644 --- a/src/ops/batch_matmul.cc +++ b/src/ops/batch_matmul.cc @@ -272,7 +272,7 @@ OpMeta *BatchMatmul::init_task(Task const *task, Runtime *runtime) { BatchMatmul const *bmm = (BatchMatmul *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - BatchMatmulMeta *m = new BatchMatmulMeta(handle); + BatchMatmulMeta *m = new BatchMatmulMeta(handle, bmm); m->profiling = bmm->profiling; m->inference_debugging = bmm->inference_debugging; m->a_seq_length_dim = bmm->a_seq_length_dim; @@ -609,7 +609,7 @@ bool BatchMatmul::measure_operator_cost(Simulator *sim, batch *= sub_input0.dims[i].size; } - BatchMatmulMeta *meta = sim->batch_matmul_meta; + BatchMatmulMeta *meta = new BatchMatmulMeta(sim->handler, this); // allocate tensors in simulator sim->free_all(); diff --git a/src/ops/batch_norm.cpp b/src/ops/batch_norm.cpp index 106e5ebad2..933be29197 100644 --- a/src/ops/batch_norm.cpp +++ b/src/ops/batch_norm.cpp @@ -287,7 +287,7 @@ BatchNormMeta::BatchNormMeta(FFHandler handler, int output_c, int output_h, int output_w) - : OpMeta(handler) { + : OpMeta(handler, bn) { checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&biasTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); diff --git a/src/ops/batch_norm.cu b/src/ops/batch_norm.cu index b77e9d489f..ffbdef9f01 100644 --- a/src/ops/batch_norm.cu +++ b/src/ops/batch_norm.cu @@ -273,7 +273,7 @@ BatchNormMeta::BatchNormMeta(FFHandler handler, int output_c, int output_h, int output_w) - : OpMeta(handler) { + : OpMeta(handler, bn) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&biasTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); diff --git a/src/ops/beam_topk.cpp b/src/ops/beam_topk.cpp index 18534455a0..a570e6ff17 100644 --- 
a/src/ops/beam_topk.cpp +++ b/src/ops/beam_topk.cpp @@ -681,7 +681,7 @@ void BeamTopK::forward_kernel_wrapper(BeamTopKMeta const *m, BeamTopKMeta::BeamTopKMeta(FFHandler handler, Op const *op, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handler) { + : OpMeta(handler, op) { DataType data_type = op->inputs[0]->data_type; int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); int max_requests_per_batch = BatchConfig::max_requests_per_batch(); diff --git a/src/ops/beam_topk.cu b/src/ops/beam_topk.cu index 72ab7862a6..a79070c346 100644 --- a/src/ops/beam_topk.cu +++ b/src/ops/beam_topk.cu @@ -714,7 +714,7 @@ void BeamTopK::forward_kernel_wrapper(BeamTopKMeta const *m, BeamTopKMeta::BeamTopKMeta(FFHandler handler, Op const *op, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handler) { + : OpMeta(handler, op) { DataType data_type = op->inputs[0]->data_type; int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); int max_requests_per_batch = BatchConfig::max_requests_per_batch(); diff --git a/src/ops/cache.cc b/src/ops/cache.cc index 691e45b559..33b862ae85 100644 --- a/src/ops/cache.cc +++ b/src/ops/cache.cc @@ -165,7 +165,7 @@ OpMeta *Cache::init_task(Task const *task, Runtime *runtime) { Cache *c = (Cache *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - CacheMeta *m = new CacheMeta(handle); + CacheMeta *m = new CacheMeta(handle, c); m->cache_score = 0.0f; m->profiling = c->profiling; m->inference_debugging = c->inference_debugging; diff --git a/src/ops/cache.cpp b/src/ops/cache.cpp index 95c5995f9e..a9512c2c59 100644 --- a/src/ops/cache.cpp +++ b/src/ops/cache.cpp @@ -75,7 +75,7 @@ float Cache::cache_update(Task const *task, return cache_score; } -CacheMeta::CacheMeta(FFHandler handler) : OpMeta(handler) {} +CacheMeta::CacheMeta(FFHandler handler, Cache const *c) : OpMeta(handler, c) {} template void Cache::cache_forward(Task const *task, diff --git a/src/ops/cache.cu b/src/ops/cache.cu index a113e57a1c..2f95e59669 100644 --- a/src/ops/cache.cu +++ b/src/ops/cache.cu @@ -74,7 +74,7 @@ float Cache::cache_update(Task const *task, return cache_score; } -CacheMeta::CacheMeta(FFHandler handler) : OpMeta(handler) {} +CacheMeta::CacheMeta(FFHandler handler, Cache const *c) : OpMeta(handler, c) {} template void Cache::cache_forward(Task const *task, diff --git a/src/ops/cast.cc b/src/ops/cast.cc index 2a845cb303..f182f16e00 100644 --- a/src/ops/cast.cc +++ b/src/ops/cast.cc @@ -190,7 +190,7 @@ OpMeta *Cast::init_task(Task const *task, Runtime *runtime) { Cast *cast = (Cast *)task->args; FFHandler handler = *((FFHandler const *)task->local_args); - CastMeta *m = new CastMeta(handler); + CastMeta *m = new CastMeta(handler, cast); m->input_data_type = cast->inputs[0]->data_type; m->output_data_type = cast->outputs[0]->data_type; std::strcpy(m->op_name, cast->name); diff --git a/src/ops/concat.cc b/src/ops/concat.cc index 80935e387b..89e5e299c7 100644 --- a/src/ops/concat.cc +++ b/src/ops/concat.cc @@ -197,7 +197,7 @@ OpMeta *Concat::init_task(Task const *task, Runtime *runtime) { Concat *cc = (Concat *)task->args; FFHandler handler = *((FFHandler const *)task->local_args); - ConcatMeta *m = new ConcatMeta(handler); + ConcatMeta *m = new ConcatMeta(handler, cc); // Note that our internal axis index ordering is opposite to other frameworks init_meta(m, cc->legion_axis); m->profiling = cc->profiling; @@ -365,7 +365,7 @@ bool Concat::measure_operator_cost(Simulator *sim, } } - ConcatMeta *m = sim->concat_meta; + ConcatMeta *m = new ConcatMeta(sim->handler, this); 
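  // With OpMeta now requiring the owning Op, the simulator constructs a fresh
  // ConcatMeta per cost measurement instead of reusing the cached
  // sim->concat_meta member, which this commit comments out in simulator.h.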
init_meta(m, this->legion_axis); sim->free_all(); diff --git a/src/ops/conv_2d.cc b/src/ops/conv_2d.cc index 7c524c81de..e48fa9d794 100644 --- a/src/ops/conv_2d.cc +++ b/src/ops/conv_2d.cc @@ -588,7 +588,7 @@ OpMeta *Conv2D::init_task(Task const *task, // regions[4], task->regions[4], FID_DATA, ctx, runtime, // false/*readOutput*/); - Conv2DMeta *m = new Conv2DMeta(handle); + Conv2DMeta *m = new Conv2DMeta(handle, conv); m->relu = conv->activation == AC_MODE_RELU; m->use_bias = conv->use_bias; m->profiling = conv->profiling; @@ -1113,7 +1113,7 @@ bool Conv2D::measure_operator_cost(Simulator *sim, int pad_h = ((output_h - 1) * stride_h + kernel_h - input_h + 1) / 2; int pad_w = ((output_w - 1) * stride_w + kernel_w - input_w + 1) / 2; - Conv2DMeta *m = sim->conv2d_meta; + Conv2DMeta *m = new Conv2DMeta(sim->handler, this); m->relu = activation == AC_MODE_RELU; // require input_c is divisible by groups diff --git a/src/ops/element_unary.cc b/src/ops/element_unary.cc index 9fb2e6dc1f..844aeb6de3 100644 --- a/src/ops/element_unary.cc +++ b/src/ops/element_unary.cc @@ -354,7 +354,7 @@ OpMeta *ElementUnary::init_task(Task const *task, Runtime *runtime) { ElementUnary *eu = (ElementUnary *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - ElementUnaryMeta *m = new ElementUnaryMeta(handle); + ElementUnaryMeta *m = new ElementUnaryMeta(handle, eu); m->op_type = eu->op_type; m->data_type = eu->outputs[0]->data_type; // Input and output should have the same data type @@ -735,7 +735,7 @@ bool ElementUnary::measure_operator_cost(Simulator *sim, if (!inputs[0]->get_sub_tensor(mv, sub_input)) { return false; } - ElementUnaryMeta *m = sim->ele_unary_meta; + ElementUnaryMeta *m = new ElementUnaryMeta(sim->handler, this); m->op_type = op_type; if (use_cudnn(m->op_type)) { Domain input_domain, output_domain; diff --git a/src/ops/element_unary.cpp b/src/ops/element_unary.cpp index e20200420f..435abdfe11 100644 --- a/src/ops/element_unary.cpp +++ b/src/ops/element_unary.cpp @@ -282,7 +282,8 @@ void ElementUnary::backward_kernel_wrapper(ElementUnaryMeta const *m, stream); } -ElementUnaryMeta::ElementUnaryMeta(FFHandler handler) : OpMeta(handler) { +ElementUnaryMeta::ElementUnaryMeta(FFHandler handler, ElementUnary const *unary) + : OpMeta(handler, unary) { checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); checkCUDNN(miopenCreateActivationDescriptor(&actiDesc)); diff --git a/src/ops/element_unary.cu b/src/ops/element_unary.cu index c7f5e90f4c..15e6852388 100644 --- a/src/ops/element_unary.cu +++ b/src/ops/element_unary.cu @@ -291,7 +291,8 @@ void ElementUnary::backward_kernel_wrapper(ElementUnaryMeta const *m, stream); } -ElementUnaryMeta::ElementUnaryMeta(FFHandler handler) : OpMeta(handler) { +ElementUnaryMeta::ElementUnaryMeta(FFHandler handler, ElementUnary const *unary) + : OpMeta(handler, unary) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); checkCUDNN(cudnnCreateActivationDescriptor(&actiDesc)); diff --git a/src/ops/experts.cc b/src/ops/experts.cc index a1761f069d..963df195f7 100644 --- a/src/ops/experts.cc +++ b/src/ops/experts.cc @@ -582,18 +582,7 @@ OpMeta *Experts::init_task(Task const *task, Runtime *runtime) { Experts const *exp = (Experts *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - ExpertsMeta *m = new ExpertsMeta(handle, - exp->num_experts, - exp->experts_start_idx, - exp->data_dim, - exp->out_dim, - 
exp->experts_num_layers, - exp->experts_internal_dim_size, - exp->effective_batch_size, - exp->num_chosen_experts, - exp->alpha, - exp->use_bias, - exp->activation); + ExpertsMeta *m = new ExpertsMeta(handle, exp); m->profiling = exp->profiling; m->inference_debugging = exp->inference_debugging; std::strcpy(m->op_name, exp->name); diff --git a/src/ops/experts.cpp b/src/ops/experts.cpp index 48536defd9..502be878a9 100644 --- a/src/ops/experts.cpp +++ b/src/ops/experts.cpp @@ -35,25 +35,15 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, handle_unimplemented_hip_kernel(OP_EXPERTS); } -ExpertsMeta::ExpertsMeta(FFHandler handler, - int _num_experts, - int _experts_start_idx, - int _data_dim, - int _out_dim, - int _experts_num_layers, - int _experts_internal_dim_size, - int _effective_batch_size, - int _num_chosen_experts, - float _alpha, - bool _use_bias, - ActiMode _activation) - : OpMeta(handler), num_experts(_num_experts), - experts_start_idx(_experts_start_idx), data_dim(_data_dim), - out_dim(_out_dim), experts_num_layers(_experts_num_layers), - experts_internal_dim_size(_experts_internal_dim_size), - effective_batch_size(_effective_batch_size), - num_chosen_experts(_num_chosen_experts), alpha(_alpha), - use_bias(_use_bias), activation(_activation) {} +ExpertsMeta::ExpertsMeta(FFHandler handler, Experts const *e) + : OpMeta(handler, e), num_experts(e->num_experts), + experts_start_idx(e->experts_start_idx), data_dim(e->data_dim), + out_dim(e->out_dim), experts_num_layers(e->experts_num_layers), + experts_internal_dim_size(e->experts_internal_dim_size), + effective_batch_size(e->effective_batch_size), + num_chosen_experts(e->num_chosen_experts), alpha(e->alpha), + use_bias(e->use_bias), activation(e->activation) {} + ExpertsMeta::~ExpertsMeta(void) {} }; // namespace FlexFlow diff --git a/src/ops/experts.cu b/src/ops/experts.cu index 4e3ef6f12c..6f0bd8afbb 100644 --- a/src/ops/experts.cu +++ b/src/ops/experts.cu @@ -1233,25 +1233,14 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, } } -ExpertsMeta::ExpertsMeta(FFHandler handler, - int _num_experts, - int _experts_start_idx, - int _data_dim, - int _out_dim, - int _experts_num_layers, - int _experts_internal_dim_size, - int _effective_batch_size, - int _num_chosen_experts, - float _alpha, - bool _use_bias, - ActiMode _activation) - : OpMeta(handler), num_experts(_num_experts), - experts_start_idx(_experts_start_idx), data_dim(_data_dim), - out_dim(_out_dim), experts_num_layers(_experts_num_layers), - experts_internal_dim_size(_experts_internal_dim_size), - effective_batch_size(_effective_batch_size), - num_chosen_experts(_num_chosen_experts), alpha(_alpha), - use_bias(_use_bias), activation(_activation) { +ExpertsMeta::ExpertsMeta(FFHandler handler, Experts const *e) + : OpMeta(handler, e), num_experts(e->num_experts), + experts_start_idx(e->experts_start_idx), data_dim(e->data_dim), + out_dim(e->out_dim), experts_num_layers(e->experts_num_layers), + experts_internal_dim_size(e->experts_internal_dim_size), + effective_batch_size(e->effective_batch_size), + num_chosen_experts(e->num_chosen_experts), alpha(e->alpha), + use_bias(e->use_bias), activation(e->activation) { expert_capacity = ceil(alpha * num_chosen_experts / num_experts * effective_batch_size); diff --git a/src/ops/flat.cc b/src/ops/flat.cc index 669c457709..37a86cde2a 100644 --- a/src/ops/flat.cc +++ b/src/ops/flat.cc @@ -186,7 +186,8 @@ OpMeta *Flat::init_task(Task const *task, Context ctx, Runtime *runtime) { FFHandler handler = *((FFHandler const 
*)task->local_args); - FlatMeta *m = new FlatMeta(handler); + Flat *flat = (Flat *)task->args; + FlatMeta *m = new FlatMeta(handler, flat); return m; } diff --git a/src/ops/group_by.cc b/src/ops/group_by.cc index 50871983f5..75960e7dcd 100644 --- a/src/ops/group_by.cc +++ b/src/ops/group_by.cc @@ -264,7 +264,7 @@ OpMeta *Group_by::init_task(Task const *task, Runtime *runtime) { Group_by *gb = (Group_by *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - GroupByMeta *m = new GroupByMeta(handle, gb->n, gb->alpha); + GroupByMeta *m = new GroupByMeta(handle, gb); m->profiling = gb->profiling; m->inference_debugging = gb->inference_debugging; std::strcpy(m->op_name, gb->name); @@ -565,7 +565,7 @@ bool Group_by::measure_operator_cost(Simulator *sim, } } - GroupByMeta *m = new GroupByMeta(sim->handler, n, alpha); + GroupByMeta *m = new GroupByMeta(sim->handler, this); // allocate sim->free_all(); diff --git a/src/ops/group_by.cpp b/src/ops/group_by.cpp index 761c35f182..9ca6f77898 100644 --- a/src/ops/group_by.cpp +++ b/src/ops/group_by.cpp @@ -188,9 +188,9 @@ void Group_by::backward_kernel_wrapper(GroupByMeta const *m, data_dim); } -GroupByMeta::GroupByMeta(FFHandler handler, int n, float _alpha) - : OpMeta(handler), alpha(_alpha) { - checkCUDA(hipMalloc(&dev_region_ptrs, n * sizeof(float *))); +GroupByMeta::GroupByMeta(FFHandler handler, Group_by const *gb) + : OpMeta(handler, gb), alpha(gb->alpha) { + checkCUDA(hipMalloc(&dev_region_ptrs, gb->n * sizeof(float *))); } GroupByMeta::~GroupByMeta(void) { checkCUDA(hipFree(&dev_region_ptrs)); diff --git a/src/ops/group_by.cu b/src/ops/group_by.cu index 0ed09e20b3..43bcb900df 100644 --- a/src/ops/group_by.cu +++ b/src/ops/group_by.cu @@ -198,9 +198,9 @@ void Group_by::backward_kernel_wrapper(GroupByMeta const *m, } } -GroupByMeta::GroupByMeta(FFHandler handler, int n, float _alpha) - : OpMeta(handler), alpha(_alpha) { - checkCUDA(cudaMalloc(&dev_region_ptrs, n * sizeof(float *))); +GroupByMeta::GroupByMeta(FFHandler handler, Group_by const *gb) + : OpMeta(handler, gb), alpha(gb->alpha) { + checkCUDA(cudaMalloc(&dev_region_ptrs, gb->n * sizeof(float *))); } GroupByMeta::~GroupByMeta(void) { checkCUDA(cudaFree(&dev_region_ptrs)); diff --git a/src/ops/kernels/batch_matmul.cpp b/src/ops/kernels/batch_matmul.cpp index 7145af2108..8eeede65c7 100644 --- a/src/ops/kernels/batch_matmul.cpp +++ b/src/ops/kernels/batch_matmul.cpp @@ -13,13 +13,15 @@ * limitations under the License. */ +#include "flexflow/ops/batch_matmul.h" #include "flexflow/ops/kernels/batch_matmul_kernels.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { -BatchMatmulMeta::BatchMatmulMeta(FFHandler handler) : OpMeta(handler) {} +BatchMatmulMeta::BatchMatmulMeta(FFHandler handler, BatchMatmul const *bmm) + : OpMeta(handler, bmm) {} namespace Kernels { namespace BatchMatmul { diff --git a/src/ops/kernels/batch_matmul.cu b/src/ops/kernels/batch_matmul.cu index ac280db1a4..97f13fa5a8 100644 --- a/src/ops/kernels/batch_matmul.cu +++ b/src/ops/kernels/batch_matmul.cu @@ -13,12 +13,14 @@ * limitations under the License. 
*/ +#include "flexflow/ops/batch_matmul.h" #include "flexflow/ops/kernels/batch_matmul_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -BatchMatmulMeta::BatchMatmulMeta(FFHandler handler) : OpMeta(handler) {} +BatchMatmulMeta::BatchMatmulMeta(FFHandler handler, BatchMatmul const *bmm) + : OpMeta(handler, bmm) {} namespace Kernels { namespace BatchMatmul { diff --git a/src/ops/kernels/cast_kernels.cpp b/src/ops/kernels/cast_kernels.cpp index 16b9b4cec0..1e561959f1 100644 --- a/src/ops/kernels/cast_kernels.cpp +++ b/src/ops/kernels/cast_kernels.cpp @@ -14,12 +14,13 @@ */ #include "flexflow/ops/kernels/cast_kernels.h" +#include "flexflow/ops/cast.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { -CastMeta::CastMeta(FFHandler handle) : OpMeta(handle) {} +CastMeta::CastMeta(FFHandler handle, Cast const *cast) : OpMeta(handle, cast) {} namespace Kernels { namespace Cast { diff --git a/src/ops/kernels/cast_kernels.cu b/src/ops/kernels/cast_kernels.cu index a96f37dbbd..fdce63b9f1 100644 --- a/src/ops/kernels/cast_kernels.cu +++ b/src/ops/kernels/cast_kernels.cu @@ -13,12 +13,13 @@ * limitations under the License. */ +#include "flexflow/ops/cast.h" #include "flexflow/ops/kernels/cast_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -CastMeta::CastMeta(FFHandler handle) : OpMeta(handle) {} +CastMeta::CastMeta(FFHandler handle, Cast const *cast) : OpMeta(handle, cast) {} namespace Kernels { namespace Cast { diff --git a/src/ops/kernels/concat_kernels.cpp b/src/ops/kernels/concat_kernels.cpp index bf5d46b9cc..6c05e0143c 100644 --- a/src/ops/kernels/concat_kernels.cpp +++ b/src/ops/kernels/concat_kernels.cpp @@ -14,6 +14,7 @@ */ #include "flexflow/ops/kernels/concat_kernels.h" +#include "flexflow/ops/concat.h" #include "flexflow/utils/hip_helper.h" #include @@ -23,6 +24,9 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Rect; +ConcatMeta::ConcatMeta(FFHandler handler, Concat const *cc) + : OpMeta(handler, cc) {} + namespace Kernels { namespace Concat { diff --git a/src/ops/kernels/concat_kernels.cu b/src/ops/kernels/concat_kernels.cu index f625560625..2569c36b21 100644 --- a/src/ops/kernels/concat_kernels.cu +++ b/src/ops/kernels/concat_kernels.cu @@ -13,6 +13,7 @@ * limitations under the License. 
*/ +#include "flexflow/ops/concat.h" #include "flexflow/ops/kernels/concat_kernels.h" #include "flexflow/utils/cuda_helper.h" @@ -22,6 +23,9 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Rect; +ConcatMeta::ConcatMeta(FFHandler handler, Concat const *cc) + : OpMeta(handler, cc) {} + namespace Kernels { namespace Concat { diff --git a/src/ops/kernels/conv_2d_kernels.cpp b/src/ops/kernels/conv_2d_kernels.cpp index 7d2fa20c49..b7406f641d 100644 --- a/src/ops/kernels/conv_2d_kernels.cpp +++ b/src/ops/kernels/conv_2d_kernels.cpp @@ -14,12 +14,14 @@ */ #include "flexflow/ops/kernels/conv_2d_kernels.h" +#include "flexflow/ops/conv_2d.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { -Conv2DMeta::Conv2DMeta(FFHandler handler) : OpMeta(handler) { +Conv2DMeta::Conv2DMeta(FFHandler handler, Conv2D const *conv) + : OpMeta(handler, conv) { checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&biasTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); diff --git a/src/ops/kernels/conv_2d_kernels.cu b/src/ops/kernels/conv_2d_kernels.cu index 6c0fd85496..65dc38f142 100644 --- a/src/ops/kernels/conv_2d_kernels.cu +++ b/src/ops/kernels/conv_2d_kernels.cu @@ -1,9 +1,11 @@ +#include "flexflow/ops/conv_2d.h" #include "flexflow/ops/kernels/conv_2d_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -Conv2DMeta::Conv2DMeta(FFHandler handler) : OpMeta(handler) { +Conv2DMeta::Conv2DMeta(FFHandler handler, Conv2D const *conv) + : OpMeta(handler, conv) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&biasTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); diff --git a/src/ops/kernels/dropout_kernels.cpp b/src/ops/kernels/dropout_kernels.cpp index 14225f0bce..c8b1887fd4 100644 --- a/src/ops/kernels/dropout_kernels.cpp +++ b/src/ops/kernels/dropout_kernels.cpp @@ -28,7 +28,7 @@ DropoutMeta::DropoutMeta(FFHandler handler, Dropout const *dropout, Memory gpu_mem, Domain const &output_domain) - : OpMeta(handler) { + : OpMeta(handler, dropout) { profiling = dropout->profiling; inference_debugging = dropout->inference_debugging; checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); diff --git a/src/ops/kernels/dropout_kernels.cu b/src/ops/kernels/dropout_kernels.cu index e142bba83b..d65b951f51 100644 --- a/src/ops/kernels/dropout_kernels.cu +++ b/src/ops/kernels/dropout_kernels.cu @@ -27,7 +27,7 @@ DropoutMeta::DropoutMeta(FFHandler handler, Dropout const *dropout, Memory gpu_mem, Domain const &output_domain) - : OpMeta(handler) { + : OpMeta(handler, dropout) { profiling = dropout->profiling; inference_debugging = dropout->inference_debugging; checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); diff --git a/src/ops/kernels/flat_kernels.cpp b/src/ops/kernels/flat_kernels.cpp index be48854fc0..6815ce7492 100644 --- a/src/ops/kernels/flat_kernels.cpp +++ b/src/ops/kernels/flat_kernels.cpp @@ -14,11 +14,15 @@ */ #include "flexflow/ops/kernels/flat_kernels.h" +#include "flexflow/ops/flat.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { +FlatMeta::FlatMeta(FFHandler handler, Flat const *flat) + : OpMeta(handler, flat) {} + namespace Kernels { namespace Flat { diff --git a/src/ops/kernels/flat_kernels.cu b/src/ops/kernels/flat_kernels.cu index 3836c02c94..fc0c0270c1 100644 --- a/src/ops/kernels/flat_kernels.cu +++ b/src/ops/kernels/flat_kernels.cu @@ -13,11 +13,15 @@ * limitations under the License. 
*/ +#include "flexflow/ops/flat.h" #include "flexflow/ops/kernels/flat_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { +FlatMeta::FlatMeta(FFHandler handler, Flat const *flat) + : OpMeta(handler, flat) {} + namespace Kernels { namespace Flat { diff --git a/src/ops/kernels/pool_2d_kernels.cpp b/src/ops/kernels/pool_2d_kernels.cpp index 8af85612ca..b3f20a35dd 100644 --- a/src/ops/kernels/pool_2d_kernels.cpp +++ b/src/ops/kernels/pool_2d_kernels.cpp @@ -14,11 +14,13 @@ */ #include "flexflow/ops/kernels/pool_2d_kernels.h" +#include "flexflow/ops/pool_2d.h" #include "flexflow/utils/hip_helper.h" namespace FlexFlow { -Pool2DMeta::Pool2DMeta(FFHandler handler) : OpMeta(handler) { +Pool2DMeta::Pool2DMeta(FFHandler handler, Pool2D const *pool) + : OpMeta(handler, pool) { checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); checkCUDNN(miopenCreatePoolingDescriptor(&poolDesc)); diff --git a/src/ops/kernels/pool_2d_kernels.cu b/src/ops/kernels/pool_2d_kernels.cu index b418d20cd3..c236f049ba 100644 --- a/src/ops/kernels/pool_2d_kernels.cu +++ b/src/ops/kernels/pool_2d_kernels.cu @@ -14,11 +14,13 @@ */ #include "flexflow/ops/kernels/pool_2d_kernels.h" +#include "flexflow/ops/pool_2d.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -Pool2DMeta::Pool2DMeta(FFHandler handler) : OpMeta(handler) { +Pool2DMeta::Pool2DMeta(FFHandler handler, Pool2D const *pool) + : OpMeta(handler, pool) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); checkCUDNN(cudnnCreatePoolingDescriptor(&poolDesc)); diff --git a/src/ops/kernels/reshape_kernels.cpp b/src/ops/kernels/reshape_kernels.cpp index b17d95bfea..47f407fd82 100644 --- a/src/ops/kernels/reshape_kernels.cpp +++ b/src/ops/kernels/reshape_kernels.cpp @@ -14,12 +14,14 @@ */ #include "flexflow/ops/kernels/reshape_kernels.h" +#include "flexflow/ops/reshape.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { -ReshapeMeta::ReshapeMeta(FFHandler handler) : OpMeta(handler) {} +ReshapeMeta::ReshapeMeta(FFHandler handler, Reshape const *reshape) + : OpMeta(handler, reshape) {} namespace Kernels { namespace Reshape { diff --git a/src/ops/kernels/reshape_kernels.cu b/src/ops/kernels/reshape_kernels.cu index 9786f63815..0a2b01ae52 100644 --- a/src/ops/kernels/reshape_kernels.cu +++ b/src/ops/kernels/reshape_kernels.cu @@ -14,11 +14,13 @@ */ #include "flexflow/ops/kernels/reshape_kernels.h" +#include "flexflow/ops/reshape.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -ReshapeMeta::ReshapeMeta(FFHandler handler) : OpMeta(handler) {} +ReshapeMeta::ReshapeMeta(FFHandler handler, Reshape const *reshape) + : OpMeta(handler, reshape) {} namespace Kernels { namespace Reshape { diff --git a/src/ops/kernels/transpose_kernels.cpp b/src/ops/kernels/transpose_kernels.cpp index 49a7d827f5..199e1cd0c1 100644 --- a/src/ops/kernels/transpose_kernels.cpp +++ b/src/ops/kernels/transpose_kernels.cpp @@ -14,6 +14,7 @@ */ #include "flexflow/ops/kernels/transpose_kernels.h" +#include "flexflow/ops/transpose.h" #include "flexflow/utils/hip_helper.h" #include @@ -22,6 +23,9 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Domain; +TransposeMeta::TransposeMeta(FFHandler handler, Transpose const *transpose) + : OpMeta(handler, transpose) {} + struct TransposeStrides { int num_dim; int in_strides[MAX_TENSOR_DIM], out_strides[MAX_TENSOR_DIM], diff --git a/src/ops/kernels/transpose_kernels.cu 
b/src/ops/kernels/transpose_kernels.cu index b401ff0ba1..18a6e405af 100644 --- a/src/ops/kernels/transpose_kernels.cu +++ b/src/ops/kernels/transpose_kernels.cu @@ -14,6 +14,7 @@ */ #include "flexflow/ops/kernels/transpose_kernels.h" +#include "flexflow/ops/transpose.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { @@ -21,6 +22,9 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Domain; +TransposeMeta::TransposeMeta(FFHandler handler, Transpose const *transpose) + : OpMeta(handler, transpose) {} + struct TransposeStrides { int num_dim; int in_strides[MAX_TENSOR_DIM], out_strides[MAX_TENSOR_DIM], diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index e9f8feae2b..40c575532f 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -882,7 +882,8 @@ bool LayerNorm::measure_operator_cost(Simulator *sim, } Domain input_domain = sub_input.get_domain(); Domain output_domain = sub_output.get_domain(); - LayerNormMeta *m = sim->layernorm_meta; + MemoryAllocator gpu_mem_allocator(sim->memory); + LayerNormMeta *m = new LayerNormMeta(sim->handler, this, gpu_mem_allocator); sim->free_all(); float *in_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); diff --git a/src/ops/layer_norm.cpp b/src/ops/layer_norm.cpp index 9beb655d1d..2736dbf507 100644 --- a/src/ops/layer_norm.cpp +++ b/src/ops/layer_norm.cpp @@ -27,7 +27,7 @@ constexpr int kColwiseReduceTileSize = 32; LayerNormMeta::LayerNormMeta(FFHandler handle, LayerNorm const *ln, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ln) { elementwise_affine = ln->elementwise_affine; effective_batch_size = ln->effective_batch_size; effective_num_elements = ln->effective_num_elements; diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index e242904775..b105ef0ea8 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -27,7 +27,7 @@ constexpr int kColwiseReduceTileSize = 32; LayerNormMeta::LayerNormMeta(FFHandler handle, LayerNorm const *ln, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ln) { elementwise_affine = ln->elementwise_affine; use_bias = ln->use_bias; effective_batch_size = ln->effective_batch_size; diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 05529a46ec..f8181570ce 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -1219,7 +1219,10 @@ bool Linear::measure_operator_cost(Simulator *sim, int input_n = sub_input.get_volume() / input_c; int output_c = sub_output.dims[0].size; int output_n = sub_output.get_volume() / output_c; - LinearMeta *m = sim->linear_meta; + + MemoryAllocator gpu_mem_allocator(sim->memory); + LinearMeta *m = new LinearMeta( + sim->handler, output_n, this, gpu_mem_allocator, input_c * output_c); m->activation = activation; m->kernel_reg_type = kernel_reg_type; m->kernel_reg_lambda = kernel_reg_lambda; diff --git a/src/ops/mean.cc b/src/ops/mean.cc index b2ec94fdf8..0d41276735 100644 --- a/src/ops/mean.cc +++ b/src/ops/mean.cc @@ -87,8 +87,7 @@ OpMeta *Mean::init_task(Task const *task, Context ctx, Runtime *runtime) { FFHandler handler = *((FFHandler const *)task->local_args); - OpMeta *m = new OpMeta(handler); - return m; + return nullptr; } void Mean::forward(FFModel const &ff) {} diff --git a/src/ops/noop.cc b/src/ops/noop.cc index da2d4922e3..dabdf835dd 100644 --- a/src/ops/noop.cc +++ b/src/ops/noop.cc @@ -91,8 +91,8 @@ OpMeta *NoOp::init_task(Task const *task, Context ctx, Runtime *runtime) { FFHandler handle = *((FFHandler const *)task->local_args); - OpMeta *m = new 
OpMeta(handle); - return m; + // OpMeta *m = new OpMeta(handle); + return nullptr; } void NoOp::init_inference(FFModel const &ff, diff --git a/src/ops/pool_2d.cc b/src/ops/pool_2d.cc index e358448ddf..46722bd943 100644 --- a/src/ops/pool_2d.cc +++ b/src/ops/pool_2d.cc @@ -315,7 +315,7 @@ OpMeta *Pool2D::init_task(Task const *task, assert(task->regions.size() == 2); Pool2D const *pool = (Pool2D *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - Pool2DMeta *m = new Pool2DMeta(handle); + Pool2DMeta *m = new Pool2DMeta(handle, pool); m->profiling = pool->profiling; m->inference_debugging = pool->inference_debugging; std::strcpy(m->op_name, pool->name); @@ -543,7 +543,7 @@ bool Pool2D::measure_operator_cost(Simulator *sim, int output_n = sub_output.dims[3].size; int pad_h = ((output_h - 1) * stride_h + kernel_h - input_h + 1) / 2; int pad_w = ((output_w - 1) * stride_w + kernel_w - input_w + 1) / 2; - Pool2DMeta *m = sim->pool2d_meta; + Pool2DMeta *m = new Pool2DMeta(sim->handler, this); init_kernel(m, input_w, diff --git a/src/ops/reduce.cpp b/src/ops/reduce.cpp index c062955ed6..fe122b13eb 100644 --- a/src/ops/reduce.cpp +++ b/src/ops/reduce.cpp @@ -25,7 +25,7 @@ using Legion::Domain; ReduceMeta::ReduceMeta(FFHandler handler, Reduce const *rd, Domain const &input_domain) - : OpMeta(handler) { + : OpMeta(handler, rd) { checkCUDNN(miopenCreateReduceTensorDescriptor(&reduceDesc)); checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); diff --git a/src/ops/reduce.cu b/src/ops/reduce.cu index 65efd90e9b..1352787a12 100644 --- a/src/ops/reduce.cu +++ b/src/ops/reduce.cu @@ -24,7 +24,7 @@ using Legion::Domain; ReduceMeta::ReduceMeta(FFHandler handler, Reduce const *rd, Domain const &input_domain) - : OpMeta(handler) { + : OpMeta(handler, rd) { checkCUDNN(cudnnCreateReduceTensorDescriptor(&reduceDesc)); checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); diff --git a/src/ops/reshape.cc b/src/ops/reshape.cc index 45da190680..04aea12c5f 100644 --- a/src/ops/reshape.cc +++ b/src/ops/reshape.cc @@ -180,7 +180,7 @@ OpMeta *Reshape::init_task(Task const *task, Runtime *runtime) { Reshape const *reshape = (Reshape *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - ReshapeMeta *m = new ReshapeMeta(handle); + ReshapeMeta *m = new ReshapeMeta(handle, reshape); std::strcpy(m->op_name, reshape->name); m->layer_guid = reshape->layer_guid; m->data_type = reshape->outputs[0]->data_type; diff --git a/src/ops/residual_layer_norm.cpp b/src/ops/residual_layer_norm.cpp index f1b7a537b0..72370ab979 100644 --- a/src/ops/residual_layer_norm.cpp +++ b/src/ops/residual_layer_norm.cpp @@ -27,7 +27,7 @@ constexpr int kCUDANumThreads = 256; ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, ResidualLayerNorm const *ln, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ln) { elementwise_affine = ln->elementwise_affine; use_bias = ln->use_bias; use_two_residuals = ln->use_two_residuals; diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index e5ebdce6ed..ea77f01f53 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -26,7 +26,7 @@ constexpr int kCUDANumThreads = 256; ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, ResidualLayerNorm const *ln, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ln) { elementwise_affine = 
ln->elementwise_affine; use_bias = ln->use_bias; use_two_residuals = ln->use_two_residuals; diff --git a/src/ops/sigmoid_silu_multi.cpp b/src/ops/sigmoid_silu_multi.cpp index 7b7f30a288..0f48bf8126 100644 --- a/src/ops/sigmoid_silu_multi.cpp +++ b/src/ops/sigmoid_silu_multi.cpp @@ -23,7 +23,7 @@ namespace FlexFlow { SigmoidSiluMultiMeta::SigmoidSiluMultiMeta(FFHandler handle, SigmoidSiluMulti const *ssm, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ssm) { profiling = ssm->profiling; inference_debugging = ssm->inference_debugging; } diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu index 590b641b5a..ea63dd5508 100644 --- a/src/ops/sigmoid_silu_multi.cu +++ b/src/ops/sigmoid_silu_multi.cu @@ -22,7 +22,7 @@ namespace FlexFlow { SigmoidSiluMultiMeta::SigmoidSiluMultiMeta(FFHandler handle, SigmoidSiluMulti const *ssm, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ssm) { profiling = ssm->profiling; inference_debugging = ssm->inference_debugging; } diff --git a/src/ops/topk.cc b/src/ops/topk.cc index b38ff85f90..48da6bf341 100644 --- a/src/ops/topk.cc +++ b/src/ops/topk.cc @@ -223,7 +223,7 @@ OpMeta *TopK::init_task(Task const *task, Runtime *runtime) { TopK *topk = (TopK *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - TopKMeta *m = new TopKMeta(handle); + TopKMeta *m = new TopKMeta(handle, topk); m->profiling = topk->profiling; m->inference_debugging = topk->inference_debugging; m->sorted = topk->sorted; @@ -464,7 +464,7 @@ bool TopK::measure_operator_cost(Simulator *sim, return false; } - TopKMeta *m = new TopKMeta(sim->handler); + TopKMeta *m = new TopKMeta(sim->handler, this); m->sorted = sorted; // allocate diff --git a/src/ops/topk.cpp b/src/ops/topk.cpp index b6e898b654..303c6e85e9 100644 --- a/src/ops/topk.cpp +++ b/src/ops/topk.cpp @@ -513,6 +513,7 @@ void TopK::backward_kernel_wrapper(TopKMeta const *m, // TODO: missing profiling here } -TopKMeta::TopKMeta(FFHandler handler) : OpMeta(handler) {} +TopKMeta::TopKMeta(FFHandler handler, TopK const *topk) + : OpMeta(handler, topk) {} }; // namespace FlexFlow diff --git a/src/ops/topk.cu b/src/ops/topk.cu index cc87ee8a42..cfb2bf6448 100644 --- a/src/ops/topk.cu +++ b/src/ops/topk.cu @@ -509,6 +509,7 @@ void TopK::backward_kernel_wrapper(TopKMeta const *m, } } -TopKMeta::TopKMeta(FFHandler handler) : OpMeta(handler) {} +TopKMeta::TopKMeta(FFHandler handler, TopK const *topk) + : OpMeta(handler, topk) {} }; // namespace FlexFlow diff --git a/src/ops/transpose.cc b/src/ops/transpose.cc index 500b7867af..bea10c9d2a 100644 --- a/src/ops/transpose.cc +++ b/src/ops/transpose.cc @@ -190,7 +190,7 @@ OpMeta *Transpose::init_task(Task const *task, Domain out_domain = runtime->get_index_space_domain( ctx, task->regions[1].region.get_index_space()); - TransposeMeta *m = new TransposeMeta(handle); + TransposeMeta *m = new TransposeMeta(handle, transpose); transpose->init_meta(m, in_domain, out_domain); m->profiling = transpose->profiling; m->inference_debugging = transpose->inference_debugging; @@ -317,7 +317,7 @@ bool Transpose::measure_operator_cost(Simulator *sim, return false; } - TransposeMeta *m = sim->transpose_meta; + TransposeMeta *m = new TransposeMeta(sim->handler, this); this->init_meta(m, sub_input.get_domain(), sub_output.get_domain()); sim->free_all(); diff --git a/src/parallel_ops/combine.cc b/src/parallel_ops/combine.cc index 7c266c5392..8411b42602 100644 --- a/src/parallel_ops/combine.cc +++ b/src/parallel_ops/combine.cc @@ -99,7 
+99,7 @@ OpMeta *Combine::init_task(Task const *task, Runtime *runtime) { Combine *cmb = (Combine *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - CombineMeta *m = new CombineMeta(handle); + CombineMeta *m = new CombineMeta(handle, cmb); m->input_type[0] = cmb->inputs[0]->data_type; m->output_type[0] = cmb->outputs[0]->data_type; assert(m->input_type[0] == m->output_type[0]); diff --git a/src/parallel_ops/kernels/allreduce_kernels.cpp b/src/parallel_ops/kernels/allreduce_kernels.cpp index 8d7e20e395..fbb11fc705 100644 --- a/src/parallel_ops/kernels/allreduce_kernels.cpp +++ b/src/parallel_ops/kernels/allreduce_kernels.cpp @@ -20,7 +20,7 @@ namespace FlexFlow { AllReduceMeta::AllReduceMeta(FFHandler handle, AllReduce const *reduct) - : OpMeta(handle) {} + : OpMeta(handle, reduct) {} namespace Kernels { namespace AllReduce { diff --git a/src/parallel_ops/kernels/allreduce_kernels.cu b/src/parallel_ops/kernels/allreduce_kernels.cu index 5861f05d7a..1801ac8784 100644 --- a/src/parallel_ops/kernels/allreduce_kernels.cu +++ b/src/parallel_ops/kernels/allreduce_kernels.cu @@ -19,7 +19,7 @@ namespace FlexFlow { AllReduceMeta::AllReduceMeta(FFHandler handle, AllReduce const *reduct) - : OpMeta(handle) {} + : OpMeta(handle, reduct) {} namespace Kernels { namespace AllReduce { diff --git a/src/parallel_ops/kernels/combine_kernels.cpp b/src/parallel_ops/kernels/combine_kernels.cpp index d6e9568223..2a29be1ad4 100644 --- a/src/parallel_ops/kernels/combine_kernels.cpp +++ b/src/parallel_ops/kernels/combine_kernels.cpp @@ -14,12 +14,14 @@ */ #include "flexflow/parallel_ops/kernels/combine_kernels.h" +#include "flexflow/parallel_ops/combine.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { -CombineMeta::CombineMeta(FFHandler handler) : OpMeta(handler) {} +CombineMeta::CombineMeta(FFHandler handler, Combine const *comb) + : OpMeta(handler, comb) {} namespace Kernels { namespace Combine { diff --git a/src/parallel_ops/kernels/combine_kernels.cu b/src/parallel_ops/kernels/combine_kernels.cu index 1ab79a7944..5809e2d4f3 100644 --- a/src/parallel_ops/kernels/combine_kernels.cu +++ b/src/parallel_ops/kernels/combine_kernels.cu @@ -13,12 +13,14 @@ * limitations under the License. 
*/ +#include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/kernels/combine_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -CombineMeta::CombineMeta(FFHandler handler) : OpMeta(handler) {} +CombineMeta::CombineMeta(FFHandler handler, Combine const *comb) + : OpMeta(handler, comb) {} namespace Kernels { namespace Combine { diff --git a/src/parallel_ops/kernels/partition_kernels.cpp b/src/parallel_ops/kernels/partition_kernels.cpp index cfd76c0f18..bd1c96d4c7 100644 --- a/src/parallel_ops/kernels/partition_kernels.cpp +++ b/src/parallel_ops/kernels/partition_kernels.cpp @@ -14,12 +14,14 @@ */ #include "flexflow/parallel_ops/kernels/partition_kernels.h" +#include "flexflow/parallel_ops/partition.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { -RepartitionMeta::RepartitionMeta(FFHandler handler) : OpMeta(handler) {} +RepartitionMeta::RepartitionMeta(FFHandler handler, Repartition const *repart) + : OpMeta(handler, repart) {} namespace Kernels { namespace Repartition { diff --git a/src/parallel_ops/kernels/partition_kernels.cu b/src/parallel_ops/kernels/partition_kernels.cu index 08008f1035..3a39b39fe4 100644 --- a/src/parallel_ops/kernels/partition_kernels.cu +++ b/src/parallel_ops/kernels/partition_kernels.cu @@ -14,11 +14,13 @@ */ #include "flexflow/parallel_ops/kernels/partition_kernels.h" +#include "flexflow/parallel_ops/partition.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -RepartitionMeta::RepartitionMeta(FFHandler handler) : OpMeta(handler) {} +RepartitionMeta::RepartitionMeta(FFHandler handler, Repartition const *repart) + : OpMeta(handler, repart) {} namespace Kernels { namespace Repartition { diff --git a/src/parallel_ops/kernels/reduction_kernels.cpp b/src/parallel_ops/kernels/reduction_kernels.cpp index 2a3fe5cca1..1f3e8e0962 100644 --- a/src/parallel_ops/kernels/reduction_kernels.cpp +++ b/src/parallel_ops/kernels/reduction_kernels.cpp @@ -20,7 +20,7 @@ namespace FlexFlow { ReductionMeta::ReductionMeta(FFHandler handle, Reduction const *reduct) - : OpMeta(handle) {} + : OpMeta(handle, reduct) {} namespace Kernels { namespace Reduction { diff --git a/src/parallel_ops/kernels/reduction_kernels.cu b/src/parallel_ops/kernels/reduction_kernels.cu index 34ae8007da..df7630976b 100644 --- a/src/parallel_ops/kernels/reduction_kernels.cu +++ b/src/parallel_ops/kernels/reduction_kernels.cu @@ -19,7 +19,7 @@ namespace FlexFlow { ReductionMeta::ReductionMeta(FFHandler handle, Reduction const *reduct) - : OpMeta(handle) {} + : OpMeta(handle, reduct) {} namespace Kernels { namespace Reduction { diff --git a/src/parallel_ops/kernels/replicate_kernels.cpp b/src/parallel_ops/kernels/replicate_kernels.cpp index 1647f014be..f49e0d4eb0 100644 --- a/src/parallel_ops/kernels/replicate_kernels.cpp +++ b/src/parallel_ops/kernels/replicate_kernels.cpp @@ -20,7 +20,7 @@ namespace FlexFlow { ReplicateMeta::ReplicateMeta(FFHandler handle, Replicate const *repl) - : OpMeta(handle) {} + : OpMeta(handle, repl) {} namespace Kernels { namespace Replicate { diff --git a/src/parallel_ops/kernels/replicate_kernels.cu b/src/parallel_ops/kernels/replicate_kernels.cu index 35bc109bd3..0b5c434aa6 100644 --- a/src/parallel_ops/kernels/replicate_kernels.cu +++ b/src/parallel_ops/kernels/replicate_kernels.cu @@ -19,7 +19,7 @@ namespace FlexFlow { ReplicateMeta::ReplicateMeta(FFHandler handle, Replicate const *repl) - : OpMeta(handle) {} + : OpMeta(handle, repl) {} namespace Kernels { namespace Replicate { diff --git 
a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 0f71291ded..81a72a5c12 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -600,6 +600,23 @@ void FFModel::compile_inference() { assert(op->outputs[i]->parallel_tensor_guid != 0); } } + + // Check whether we need to reset input grads + // We use a parallel tensor's region as the key + std::set reset_inputs; + for (int l = operators.size() - 1; l >= 0; l--) { + Op *op = operators[l]; + for (int i = 0; i < op->numInputs; i++) { + assert(op->inputs[i]->region != LogicalRegion::NO_REGION); + if (reset_inputs.find(op->inputs[i]->region) != reset_inputs.end()) { + // We should not reset input grads since other operators have already + // saved gradients into the region + op->reset_input_grads[i] = false; + } else { + reset_inputs.insert(op->inputs[i]->region); + } + } + } // Perform fusion optimizations if (config.perform_fusion) { fprintf(stderr, "Applying fusion optimizations during compilation...\n"); diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 04a847b023..82cf538f93 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1465,6 +1465,7 @@ bool Op::get_weight_parameter(TNParameter tnp, return true; } +#ifdef DEADCODE OpMeta::OpMeta(FFHandler _handle) : handle(_handle), profiling(false), inference_debugging(false) { for (int i = 0; i < MAX_NUM_INPUTS; i++) { @@ -1482,8 +1483,14 @@ OpMeta::OpMeta(FFHandler _handle) } decoding_step = 0; } +#endif -OpMeta::OpMeta(FFHandler _handle, Op const *op) : OpMeta(_handle) { +OpMeta::OpMeta(FFHandler _handle, Op const *op) + : profiling(op->profiling), inference_debugging(op->inference_debugging) { + for (int i = 0; i < op->numInputs; i++) { + trainable_inputs[i] = op->trainable_inputs[i]; + reset_input_grads[i] = op->reset_input_grads[i]; + } for (int i = 0; i < op->numInputs; i++) { input_type[i] = op->inputs[i]->data_type; } diff --git a/src/runtime/simulator.cpp b/src/runtime/simulator.cpp index 0daf151d2c..56931e0dc7 100644 --- a/src/runtime/simulator.cpp +++ b/src/runtime/simulator.cpp @@ -82,17 +82,17 @@ Simulator::Simulator(FFModel const *model, checkCUDA(hipEventCreate(&start_event)); checkCUDA(hipEventCreate(&end_event)); - conv2d_meta = new Conv2DMeta(handler); - // linear_meta = new LinearMeta(handler, 4096); - pool2d_meta = new Pool2DMeta(handler); - ele_unary_meta = new ElementUnaryMeta(handler); - // ele_binary_meta = new ElementBinaryMeta(handler); - // embedding_meta = new EmbeddingMeta(handler); - // softmax_meta = new SoftmaxMeta(handler); - batch_matmul_meta = new BatchMatmulMeta(handler); - concat_meta = new ConcatMeta(handler); - // dropout_meta = new DropoutMeta(handler); - transpose_meta = new TransposeMeta(handler); + // conv2d_meta = new Conv2DMeta(handler); + // linear_meta = new LinearMeta(handler, 4096); + // pool2d_meta = new Pool2DMeta(handler); + // ele_unary_meta = new ElementUnaryMeta(handler); + // ele_binary_meta = new ElementBinaryMeta(handler); + // embedding_meta = new EmbeddingMeta(handler); + // softmax_meta = new SoftmaxMeta(handler); + // batch_matmul_meta = new BatchMatmulMeta(handler); + // concat_meta = new ConcatMeta(handler); + // dropout_meta = new DropoutMeta(handler); + // transpose_meta = new TransposeMeta(handler); this->machine = machine; segment_size = model->config.simulator_segment_size; max_num_segments = model->config.simulator_max_num_segments; diff --git a/src/runtime/simulator.cu b/src/runtime/simulator.cu index b44ce1690a..056781f73d 100644 --- 
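Note on the reset-input-grads pass added to FFModel::compile_inference above: operators are walked in reverse execution order and each input's logical region is used as a deduplication key, so the last operator to consume a given region keeps the default reset_input_grads = true (its backward kernel may overwrite the gradient buffer), while every earlier consumer of the same region is switched to false and must accumulate into the buffer. The standalone sketch below restates that logic; the plain-int keys and the surrounding structs are simplified stand-ins for Legion::LogicalRegion and Op, not part of the patch.

// Sketch only: mirrors the reverse pass in compile_inference, with ints
// standing in for Legion::LogicalRegion handles.
#include <set>
#include <vector>

struct OpSketch {
  std::vector<int> input_regions;      // one entry per operator input
  std::vector<bool> reset_input_grads; // filled in by the pass
};

void mark_reset_input_grads(std::vector<OpSketch> &ops) {
  std::set<int> regions_seen;
  for (int l = (int)ops.size() - 1; l >= 0; l--) {
    OpSketch &op = ops[l];
    op.reset_input_grads.assign(op.input_regions.size(), true);
    for (size_t i = 0; i < op.input_regions.size(); i++) {
      if (regions_seen.count(op.input_regions[i])) {
        // A later operator already writes gradients into this region,
        // so this one must accumulate instead of resetting.
        op.reset_input_grads[i] = false;
      } else {
        regions_seen.insert(op.input_regions[i]);
      }
    }
  }
}

These per-input flags are what the new OpMeta(FFHandler, Op const *) constructor in the model.cc hunk above copies into each operator's metadata, alongside trainable_inputs.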
a/src/runtime/simulator.cu +++ b/src/runtime/simulator.cu @@ -81,17 +81,17 @@ Simulator::Simulator(FFModel const *model, cudaEventCreate(&start_event); cudaEventCreate(&end_event); - conv2d_meta = new Conv2DMeta(handler); + // conv2d_meta = new Conv2DMeta(handler); // linear_meta = new LinearMeta(handler, 4096); - pool2d_meta = new Pool2DMeta(handler); - ele_unary_meta = new ElementUnaryMeta(handler); + // pool2d_meta = new Pool2DMeta(handler); + // ele_unary_meta = new ElementUnaryMeta(handler); // ele_binary_meta = new ElementBinaryMeta(handler); // embedding_meta = new EmbeddingMeta(handler); // softmax_meta = new SoftmaxMeta(handler); - batch_matmul_meta = new BatchMatmulMeta(handler); - concat_meta = new ConcatMeta(handler); + // batch_matmul_meta = new BatchMatmulMeta(handler); + // concat_meta = new ConcatMeta(handler); // dropout_meta = new DropoutMeta(handler); - transpose_meta = new TransposeMeta(handler); + // transpose_meta = new TransposeMeta(handler); this->machine = machine; segment_size = model->config.simulator_segment_size; max_num_segments = model->config.simulator_max_num_segments; @@ -103,13 +103,13 @@ Simulator::~Simulator(void) { simulatorInst.destroy(); cudaEventDestroy(start_event); cudaEventDestroy(end_event); - delete conv2d_meta; - delete pool2d_meta; - delete ele_unary_meta; - delete batch_matmul_meta; - delete concat_meta; - delete transpose_meta; - delete task_manager; + // delete conv2d_meta; + // delete pool2d_meta; + // delete ele_unary_meta; + // delete batch_matmul_meta; + // delete concat_meta; + // delete transpose_meta; + // delete task_manager; } __host__ void From eb14798b929083dd8e68a44af15132b69f00fef5 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 18 Oct 2023 12:51:54 -0400 Subject: [PATCH 025/198] residual rms norm backward --- include/flexflow/model.h | 2 + .../ops/kernels/residual_rms_norm_kernels.h | 10 + include/flexflow/ops/residual_rms_norm.h | 4 + src/ops/kernels/residual_rms_norm_kernels.cu | 181 ++++++++++++++++++ src/ops/residual_rms_norm.cc | 131 ++++++++++++- 5 files changed, 327 insertions(+), 1 deletion(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index ac24e90900..30d125a542 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -171,6 +171,8 @@ enum TaskIDs { RMSNORM_PEFT_BWD_TASK_ID, RESIDUAL_RMSNORM_INIT_TASK_ID, RESIDUAL_RMSNORM_INF_TASK_ID, + RESIDUAL_RMSNORM_BWD_TASK_ID, + RESIDUAL_RMSNORM_PEFT_BWD_TASK_ID, BEAM_TOPK_INIT_TASK_ID, BEAM_TOPK_INF_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, diff --git a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h index 0eef4ca72b..26a5686f0b 100644 --- a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h @@ -48,6 +48,16 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &residual_output, GenericTensorAccessorW const &output); +void backward_kernel_wrapper( + ResidualRMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &residual_output_rms_input, + GenericTensorAccessorR const &residual_input0, + GenericTensorAccessorW const &residual_input0_grad, + GenericTensorAccessorR const &residual_input1, + GenericTensorAccessorW const &residual_input1_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &weight_grad); } // namespace ResidualRMSNorm } // namespace 
Kernels } // namespace FlexFlow diff --git a/include/flexflow/ops/residual_rms_norm.h b/include/flexflow/ops/residual_rms_norm.h index 0d92a236e8..11750c1f6d 100644 --- a/include/flexflow/ops/residual_rms_norm.h +++ b/include/flexflow/ops/residual_rms_norm.h @@ -74,6 +74,10 @@ class ResidualRMSNorm : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index 17ac14449b..75dee4808c 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -79,6 +79,23 @@ __inline__ __device__ T WarpReduceSum(T val) { return val; } +template +__inline__ __device__ T BlockReduceSum(T val, T *shared) { + int const lid = threadIdx.x % C10_WARP_SIZE; + int const wid = threadIdx.x / C10_WARP_SIZE; + val = WarpReduceSum(val); + __syncthreads(); + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? shared[lid] : T(0); + if (wid == 0) { + val = WarpReduceSum(val); + } + return val; +} + template __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { int const lid = threadIdx.x % C10_WARP_SIZE; @@ -219,6 +236,170 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, } } +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T const *rrms, T *c2) { + __shared__ T ds_storage[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + T ds = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + int const index = i * N + j; + ds += dY[index] * X[index] * gamma[j]; + } + ds = BlockReduceSum(ds, ds_storage); + if (threadIdx.x == 0) { + c2[i] = -ds * (rrms[i] * rrms[i] * rrms[i]) / static_cast((int)N); + } +} + +template +__global__ void RMSNormBackwardCUDAKernel(int64_t N, + T const *dY, + T const *X, + T const *gamma, + T const *c1, + T const *c2, + T *dX1, + T *dX2) { + const int64_t i = blockIdx.x; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + T dX_val = c1[i] * dY[index] * gamma[j] + c2[i] * X[index]; + dX1[index] += dX_val; + dX2[index] += dX_val; + } +} + +// Assume the batch size will not be very large, direct implementation is the +// most efficient one. 
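For reference, the terms computed by ComputeInternalGradientsCUDAKernel and RMSNormBackwardCUDAKernel above, and by GammaBackwardCUDAKernel just below, follow from the standard RMSNorm derivative. Writing x_{ij} for the RMS input of row i (the residual sum saved as the residual output), r_i for the cached reciprocal RMS held in m->rms_ptr (assuming the usual convention r_i = (\varepsilon + \tfrac{1}{N}\sum_k x_{ik}^2)^{-1/2}), and y_{ij} = \gamma_j x_{ij} r_i:

\[
\frac{\partial L}{\partial x_{ij}}
  = \underbrace{r_i}_{c_1[i]}\,\gamma_j\,\frac{\partial L}{\partial y_{ij}}
  + \underbrace{\left(-\frac{r_i^{3}}{N}\sum_{k}\frac{\partial L}{\partial y_{ik}}\,\gamma_k\,x_{ik}\right)}_{c_2[i]} x_{ij},
\qquad
\frac{\partial L}{\partial \gamma_j} = \sum_i \frac{\partial L}{\partial y_{ij}}\, x_{ij}\, r_i .
\]

Because the residual output is an elementwise sum of the two residual inputs, the same \partial L/\partial x flows to both of them, which is why RMSNormBackwardCUDAKernel accumulates the same dX_val into dX1 and dX2, and why the cleanup commit later in this series can drop the raw residual-input regions from the backward task.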
+template +__global__ void GammaBackwardCUDAKernel( + int64_t M, int64_t N, T const *dY, T const *X, T const *rrms, T *dg) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T sum1 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dY[index] * X[index] * rrms[i]; + } + dg[j] = sum1; + } +} + +template +void backward_kernel(ResidualRMSNormMeta const *m, + T const *output_grad_ptr, + T const *residual_output_rms_input_ptr, + T const *residual_input0_ptr, + T *residual_input0_grad_ptr, + T const *residual_input1_ptr, + T *residual_input1_grad_ptr, + T const *weight_ptr, + T *weight_grad_ptr, + cudaStream_t stream) { + const int64_t M = m->batch_size; + const int64_t N = m->num_elements; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + + RMSNormBackwardCUDAKernel + <<>>(N, + output_grad_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + residual_input0_grad_ptr, + residual_input1_grad_ptr); + const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; + GammaBackwardCUDAKernel + <<>>(M, + N, + output_grad_ptr, + residual_output_rms_input_ptr, + static_cast(m->rms_ptr), + weight_grad_ptr); +} + +/* + regions[0](I): RMS output_grad + regions[1](I): Residual output / RMS input + regions[2](I): Residual input 0 + regions[3](I/O): Residual input 0 grad + regions[4](I): Residual input 1 + regions[5](I/O): Residual input 1 grad + regions[6](I): weight + regions[7](I/O): weight_grad +*/ +void backward_kernel_wrapper( + ResidualRMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &residual_output_rms_input, + GenericTensorAccessorR const &residual_input0, + GenericTensorAccessorW const &residual_input0_grad, + GenericTensorAccessorR const &residual_input1, + GenericTensorAccessorW const &residual_input1_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &weight_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + assert(output_grad.data_type == residual_output_rms_input.data_type); + assert(residual_output_rms_input.data_type == residual_input0.data_type); + assert(residual_input0.data_type == residual_input0_grad.data_type); + assert(residual_input0_grad.data_type == residual_input1.data_type); + assert(residual_input1.data_type == residual_input1_grad.data_type); + assert(residual_input1_grad.data_type == weight.data_type); + assert(weight.data_type == weight_grad.data_type); + + if (output_grad.data_type == DT_HALF) { + backward_kernel(m, + output_grad.get_half_ptr(), + residual_output_rms_input.get_half_ptr(), + residual_input0.get_half_ptr(), + residual_input0_grad.get_half_ptr(), + residual_input1.get_half_ptr(), + residual_input1_grad.get_half_ptr(), + weight.get_half_ptr(), + weight_grad.get_half_ptr(), + stream); + } else if (output_grad.data_type == DT_FLOAT) { + backward_kernel(m, + output_grad.get_float_ptr(), + residual_output_rms_input.get_float_ptr(), + residual_input0.get_float_ptr(), + residual_input0_grad.get_float_ptr(), + residual_input1.get_float_ptr(), + residual_input1_grad.get_float_ptr(), + weight.get_float_ptr(), + weight_grad.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported 
data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[ResidualRMSNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + } // namespace ResidualRMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index b447a2a3b5..d382f05394 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -485,8 +485,137 @@ Node ResidualRMSNorm::deserialize(FFModel &ff, } void ResidualRMSNorm::backward(FFModel const &ff) { - assert(false); + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_backward(ff, argmap); + IndexLauncher launcher(RESIDUAL_RMSNORM_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + // regions[0](I): RMS output_grad + launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + // regions[1](I): residual output / RMS input + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(1, FID_DATA); + // regions[2](I): residual input 0 + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(2, FID_DATA); + // regions[3](I/O): residual input grad 0 + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(3, FID_DATA); + // regions[4](I): residual input 1 + launcher.add_region_requirement(RegionRequirement(inputs[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[1]->region)); + launcher.add_field(4, FID_DATA); + // regions[5](I/O): residual input grad 1 + launcher.add_region_requirement(RegionRequirement(inputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[1]->region_grad)); + launcher.add_field(5, FID_DATA); + // regions[3](I): gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(6, FID_DATA); + // regions[4](I/O): gamma_grad + launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[0]->region_grad)); + launcher.add_field(7, FID_DATA); + + runtime->execute_index_space(ctx, launcher); } + +/* + regions[0](I): RMS output_grad + regions[1](I): Residual output / RMS input + regions[2](I): Residual input 0 + regions[3](I/O): Residual input 0 grad + regions[4](I): Residual input 1 + regions[5](I/O): Residual input 1 grad + regions[6](I): weight + regions[7](I/O): weight_grad +*/ +void ResidualRMSNorm::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == 8); + assert(regions.size() == 8); + ResidualRMSNormMeta const *m = *((ResidualRMSNormMeta **)task->local_args); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], 
task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW residual_output_rms_input = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[1], + task->regions[1], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR residual_input0 = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorW residual_input0_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR residual_input1 = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime); + GenericTensorAccessorW residual_input1_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[5], + task->regions[5], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[6], task->regions[6], FID_DATA, ctx, runtime); + GenericTensorAccessorW weight_grad = helperGetGenericTensorAccessorRW( + m->weight_type[0], regions[7], task->regions[7], FID_DATA, ctx, runtime); + backward_kernel_wrapper(m, + output_grad, + residual_output_rms_input, + residual_input0, + residual_input0_grad, + residual_input1, + residual_input1_grad, + weight, + weight_grad); +} + Op *ResidualRMSNorm::materialize(FFModel &ff, ParallelTensor inputs[], int num_inputs) const { From e7fa9cee3b97b6aa7519338b1237398a0c7d2fa1 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 18 Oct 2023 16:48:30 -0400 Subject: [PATCH 026/198] cleanup --- .../ops/kernels/residual_rms_norm_kernels.h | 2 - src/ops/kernels/residual_rms_norm_kernels.cu | 14 +---- src/ops/residual_rms_norm.cc | 62 ++++++------------- 3 files changed, 22 insertions(+), 56 deletions(-) diff --git a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h index 26a5686f0b..75dcfc945f 100644 --- a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h @@ -52,9 +52,7 @@ void backward_kernel_wrapper( ResidualRMSNormMeta const *m, GenericTensorAccessorR const &output_grad, GenericTensorAccessorR const &residual_output_rms_input, - GenericTensorAccessorR const &residual_input0, GenericTensorAccessorW const &residual_input0_grad, - GenericTensorAccessorR const &residual_input1, GenericTensorAccessorW const &residual_input1_grad, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &weight_grad); diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index 75dee4808c..2fc4cc95c2 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -290,9 +290,7 @@ template void backward_kernel(ResidualRMSNormMeta const *m, T const *output_grad_ptr, T const *residual_output_rms_input_ptr, - T const *residual_input0_ptr, T *residual_input0_grad_ptr, - T const *residual_input1_ptr, T *residual_input1_grad_ptr, T const *weight_ptr, T *weight_grad_ptr, @@ -341,9 +339,7 @@ void backward_kernel_wrapper( ResidualRMSNormMeta const *m, GenericTensorAccessorR const &output_grad, GenericTensorAccessorR const &residual_output_rms_input, - GenericTensorAccessorR const &residual_input0, GenericTensorAccessorW const &residual_input0_grad, - GenericTensorAccessorR const &residual_input1, GenericTensorAccessorW const &residual_input1_grad, GenericTensorAccessorR const &weight, GenericTensorAccessorW 
const &weight_grad) { @@ -356,10 +352,8 @@ void backward_kernel_wrapper( cudaEventRecord(t_start, stream); } assert(output_grad.data_type == residual_output_rms_input.data_type); - assert(residual_output_rms_input.data_type == residual_input0.data_type); - assert(residual_input0.data_type == residual_input0_grad.data_type); - assert(residual_input0_grad.data_type == residual_input1.data_type); - assert(residual_input1.data_type == residual_input1_grad.data_type); + assert(residual_output_rms_input.data_type == residual_input0_grad.data_type); + assert(residual_input0_grad.data_type == residual_input1_grad.data_type); assert(residual_input1_grad.data_type == weight.data_type); assert(weight.data_type == weight_grad.data_type); @@ -367,9 +361,7 @@ void backward_kernel_wrapper( backward_kernel(m, output_grad.get_half_ptr(), residual_output_rms_input.get_half_ptr(), - residual_input0.get_half_ptr(), residual_input0_grad.get_half_ptr(), - residual_input1.get_half_ptr(), residual_input1_grad.get_half_ptr(), weight.get_half_ptr(), weight_grad.get_half_ptr(), @@ -378,9 +370,7 @@ void backward_kernel_wrapper( backward_kernel(m, output_grad.get_float_ptr(), residual_output_rms_input.get_float_ptr(), - residual_input0.get_float_ptr(), residual_input0_grad.get_float_ptr(), - residual_input1.get_float_ptr(), residual_input1_grad.get_float_ptr(), weight.get_float_ptr(), weight_grad.get_float_ptr(), diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index d382f05394..1e0b652163 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -511,48 +511,34 @@ void ResidualRMSNorm::backward(FFModel const &ff) { EXCLUSIVE, outputs[0]->region)); launcher.add_field(1, FID_DATA); - // regions[2](I): residual input 0 - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(2, FID_DATA); - // regions[3](I/O): residual input grad 0 + // regions[2](I/O): residual input grad 0 launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, 0 /*projection id*/, READ_WRITE, EXCLUSIVE, inputs[0]->region_grad)); - launcher.add_field(3, FID_DATA); - // regions[4](I): residual input 1 - launcher.add_region_requirement(RegionRequirement(inputs[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[1]->region)); - launcher.add_field(4, FID_DATA); - // regions[5](I/O): residual input grad 1 + launcher.add_field(2, FID_DATA); + // regions[3](I/O): residual input grad 1 launcher.add_region_requirement(RegionRequirement(inputs[1]->part_grad, 0 /*projection id*/, READ_WRITE, EXCLUSIVE, inputs[1]->region_grad)); - launcher.add_field(5, FID_DATA); - // regions[3](I): gamma + launcher.add_field(3, FID_DATA); + // regions[4](I): gamma launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(6, FID_DATA); - // regions[4](I/O): gamma_grad + launcher.add_field(4, FID_DATA); + // regions[5](I/O): gamma_grad launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, 0 /*projection id*/, READ_WRITE, EXCLUSIVE, weights[0]->region_grad)); - launcher.add_field(7, FID_DATA); + launcher.add_field(5, FID_DATA); runtime->execute_index_space(ctx, launcher); } @@ -560,19 +546,17 @@ void ResidualRMSNorm::backward(FFModel const &ff) { /* regions[0](I): RMS output_grad regions[1](I): Residual output / RMS input - regions[2](I): Residual input 0 - regions[3](I/O): Residual input 
0 grad - regions[4](I): Residual input 1 - regions[5](I/O): Residual input 1 grad - regions[6](I): weight - regions[7](I/O): weight_grad + regions[2](I/O): Residual input 0 grad + regions[3](I/O): Residual input 1 grad + regions[4](I): weight + regions[5](I/O): weight_grad */ void ResidualRMSNorm::backward_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - assert(task->regions.size() == 8); - assert(regions.size() == 8); + assert(task->regions.size() == 6); + assert(regions.size() == 6); ResidualRMSNormMeta const *m = *((ResidualRMSNormMeta **)task->local_args); GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); @@ -583,34 +567,28 @@ void ResidualRMSNorm::backward_task(Task const *task, FID_DATA, ctx, runtime); - GenericTensorAccessorR residual_input0 = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); GenericTensorAccessorW residual_input0_grad = helperGetGenericTensorAccessorRW(m->input_type[0], - regions[3], - task->regions[3], + regions[2], + task->regions[2], FID_DATA, ctx, runtime); - GenericTensorAccessorR residual_input1 = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime); GenericTensorAccessorW residual_input1_grad = helperGetGenericTensorAccessorRW(m->input_type[0], - regions[5], - task->regions[5], + regions[3], + task->regions[3], FID_DATA, ctx, runtime); GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[6], task->regions[6], FID_DATA, ctx, runtime); + m->weight_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime); GenericTensorAccessorW weight_grad = helperGetGenericTensorAccessorRW( - m->weight_type[0], regions[7], task->regions[7], FID_DATA, ctx, runtime); + m->weight_type[0], regions[5], task->regions[5], FID_DATA, ctx, runtime); backward_kernel_wrapper(m, output_grad, residual_output_rms_input, - residual_input0, residual_input0_grad, - residual_input1, residual_input1_grad, weight, weight_grad); From 5f7f71082b24b324412e6456fee031d8fa94d223 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 18 Oct 2023 18:03:05 -0400 Subject: [PATCH 027/198] bug fix --- src/runtime/request_manager.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index bdb87df051..c0573a50a3 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -1214,7 +1214,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = request.tokens.size() - 1; - new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.num_tokens++; new_bc.requestsInfo[i].num_tokens_in_batch++; From 7b2bd0874b67f87a4e3c724f93c3054f6475770f Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 19 Oct 2023 00:16:18 -0400 Subject: [PATCH 028/198] finished peft bwd for residual rms norm --- .../ops/kernels/residual_rms_norm_kernels.h | 15 ++ include/flexflow/ops/residual_rms_norm.h | 9 + src/ops/kernels/residual_rms_norm_kernels.cu | 203 +++++++++++++++++- src/ops/layer_norm.cc | 2 +- src/ops/layer_norm.cu | 73 ++++--- src/ops/residual_rms_norm.cc | 90 +++++++- src/runtime/model.cc | 30 +++ 7 files changed, 379 insertions(+), 43 deletions(-) diff --git a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h 
b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h index 75dcfc945f..4fbe34f83f 100644 --- a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_OPS_KERNELS_RESIDUAL_RMSNORM_KERNELS_H #include "flexflow/accessor.h" +#include "flexflow/batch_config.h" #include "flexflow/device.h" #include "flexflow/fftype.h" #include "flexflow/op_meta.h" @@ -38,6 +39,8 @@ class ResidualRMSNormMeta : public OpMeta { int batch_size; int num_elements; Realm::RegionInstance reserveInst; + // PEFT related fields + void *input_activation; }; namespace Kernels { @@ -48,6 +51,13 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &residual_output, GenericTensorAccessorW const &output); +void inference_kernel_wrapper(ResidualRMSNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input1, + GenericTensorAccessorR const &input2, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &residual_output, + GenericTensorAccessorW const &output); void backward_kernel_wrapper( ResidualRMSNormMeta const *m, GenericTensorAccessorR const &output_grad, @@ -56,6 +66,11 @@ void backward_kernel_wrapper( GenericTensorAccessorW const &residual_input1_grad, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &weight_grad); +void peft_bwd_kernel_wrapper(ResidualRMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &residual_input0_grad, + GenericTensorAccessorW const &residual_input1_grad, + GenericTensorAccessorR const &weight); } // namespace ResidualRMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/include/flexflow/ops/residual_rms_norm.h b/include/flexflow/ops/residual_rms_norm.h index 11750c1f6d..de6e6ea506 100644 --- a/include/flexflow/ops/residual_rms_norm.h +++ b/include/flexflow/ops/residual_rms_norm.h @@ -44,6 +44,11 @@ class ResidualRMSNorm : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -78,6 +83,10 @@ class ResidualRMSNorm : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index 2fc4cc95c2..53804c0b1b 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -236,6 +236,116 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, } } +void inference_kernel_wrapper(ResidualRMSNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input1, + GenericTensorAccessorR const &input2, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &residual_output, + GenericTensorAccessorW const &output) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); 
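The inference_kernel_wrapper introduced in this hunk differs from forward_kernel_wrapper only in that it receives the BatchConfig and, for the at most one request with peft_bwd set, stashes the RMS input (the residual sum) into m->input_activation so peft_bwd_kernel can recompute the input gradients later without mapping the residual-output region. The helper below is not part of the patch; it condenses that stashing pattern (the same code appears inline here and in LayerNorm::inference_kernel_wrapper), with the token offset advanced for every active request; all names are taken from the patch except the helper itself.

// Hypothetical helper sketching the PEFT activation-stashing pattern.
#include "flexflow/batch_config.h"
#include "flexflow/ops/kernels/residual_rms_norm_kernels.h"
#include "flexflow/utils/cuda_helper.h"

namespace FlexFlow {

template <typename DT>
void stash_peft_input_sketch(ResidualRMSNormMeta *m,
                             BatchConfig const *bc,
                             DT const *rms_input, // residual_output values
                             int in_dim,
                             cudaStream_t stream) {
  int token_offset = 0;
  for (int i = 0; i < bc->max_requests_per_batch(); i++) {
    if (bc->request_completed[i]) {
      continue;
    }
    int num_tokens = bc->requestsInfo[i].num_tokens_in_batch;
    if (bc->requestsInfo[i].peft_model_id != PEFTModelID::NO_ID &&
        bc->requestsInfo[i].peft_bwd) {
      // allocator and field names as in the patch; lifetime management
      // of the scratch space is not shown here
      size_t bytes = sizeof(DT) * (size_t)num_tokens * in_dim;
      m->input_activation =
          m->handle.peft_activation_allocator->allocate_instance_untyped(
              bytes);
      checkCUDA(cudaMemcpyAsync(m->input_activation,
                                rms_input + (size_t)token_offset * in_dim,
                                bytes,
                                cudaMemcpyDeviceToDevice,
                                stream));
    }
    token_offset += num_tokens;
  }
}

} // namespace FlexFlow

Because the stashed activation replaces the region read, ResidualRMSNorm::peft_bwd later in this patch maps only four regions: the output gradient, the two residual-input gradients, and gamma.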
+ } + + assert(input1.data_type == input2.data_type); + assert(output.data_type == input1.data_type); + assert(weight.data_type == output.data_type); + assert(residual_output.data_type == output.data_type); + + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + // FIXME: use the new approach to computing token offset + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = allocator->allocate_instance_untyped( + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim); + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync(m->input_activation, + residual_output.get_float_ptr() + + tokens_previous_requests * in_dim, + data_type_size(m->input_type[0]) * + num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(cudaMemcpyAsync(m->input_activation, + residual_output.get_half_ptr() + + tokens_previous_requests * in_dim, + data_type_size(m->input_type[0]) * + num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + if (output.data_type == DT_HALF) { + forward_kernel(m, + input1.get_half_ptr(), + input2.get_half_ptr(), + weight.get_half_ptr(), + residual_output.get_half_ptr(), + output.get_half_ptr(), + stream); + } else if (output.data_type == DT_FLOAT) { + forward_kernel(m, + input1.get_float_ptr(), + input2.get_float_ptr(), + weight.get_float_ptr(), + residual_output.get_float_ptr(), + output.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[ResidualRMSNorm] forward time (CF) = %.2fms\n", elapsed); + } +} + template __global__ void ComputeInternalGradientsCUDAKernel( int64_t N, T const *dY, T const *X, T const *gamma, T const *rrms, T *c2) { @@ -325,15 +435,44 @@ void backward_kernel(ResidualRMSNormMeta const *m, weight_grad_ptr); } +template +void peft_bwd_kernel(ResidualRMSNormMeta const *m, + T const *output_grad_ptr, + T *residual_input0_grad_ptr, + T *residual_input1_grad_ptr, + T const *weight_ptr, + cudaStream_t stream) { + const int64_t M = m->batch_size; + const int64_t N = m->num_elements; + T const *residual_output_rms_input_ptr = + static_cast(m->input_activation); + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + residual_output_rms_input_ptr, 
+ weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + + RMSNormBackwardCUDAKernel + <<>>(N, + output_grad_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + residual_input0_grad_ptr, + residual_input1_grad_ptr); +} + /* regions[0](I): RMS output_grad regions[1](I): Residual output / RMS input - regions[2](I): Residual input 0 - regions[3](I/O): Residual input 0 grad - regions[4](I): Residual input 1 - regions[5](I/O): Residual input 1 grad - regions[6](I): weight - regions[7](I/O): weight_grad + regions[2](I/O): Residual input 0 grad + regions[3](I/O): Residual input 1 grad + regions[4](I): weight + regions[5](I/O): weight_grad */ void backward_kernel_wrapper( ResidualRMSNormMeta const *m, @@ -390,6 +529,58 @@ void backward_kernel_wrapper( } } +/* + regions[0](I): RMS output_grad + regions[1](I/O): Residual input 0 grad + regions[2](I/O): Residual input 1 grad + regions[3](I): weight +*/ +void peft_bwd_kernel_wrapper(ResidualRMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &residual_input0_grad, + GenericTensorAccessorW const &residual_input1_grad, + GenericTensorAccessorR const &weight) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + assert(output_grad.data_type == residual_input0_grad.data_type); + assert(residual_input0_grad.data_type == residual_input1_grad.data_type); + assert(residual_input1_grad.data_type == weight.data_type); + + if (output_grad.data_type == DT_HALF) { + peft_bwd_kernel(m, + output_grad.get_half_ptr(), + residual_input0_grad.get_half_ptr(), + residual_input1_grad.get_half_ptr(), + weight.get_half_ptr(), + stream); + } else if (output_grad.data_type == DT_FLOAT) { + peft_bwd_kernel(m, + output_grad.get_float_ptr(), + residual_input0_grad.get_float_ptr(), + residual_input1_grad.get_float_ptr(), + weight.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[ResidualRMSNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + } // namespace ResidualRMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 40c575532f..0a467f0984 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -668,7 +668,7 @@ Legion::FutureMap Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); + machine_view_hash); // regions[0](I): output_grad launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, 0 /*projection id*/, diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index b105ef0ea8..6e12c53230 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -261,53 +261,56 @@ void LayerNorm::inference_kernel_wrapper(LayerNormMeta *m, // save input activation if needed for PEFT if (bc->num_active_peft_tokens() > 0) { - // check that at most one dimension after the first is > 1. 
TODO(goliaro): - // support case where this condition does not hold - int non_unit_dims_encountered = 0; - for (int i = 1; i < input.domain.get_dim(); i++) { - int dim_i = input.domain.hi()[i] - input.domain.lo()[i] + 1; - if (dim_i > 1) { - non_unit_dims_encountered++; + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; } } - assert(non_unit_dims_encountered <= 1); - - // allocate space for all peft tokens - MemoryAllocator *allocator = m->handle.peft_activation_allocator; - int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; - m->input_activation = allocator->allocate_instance_untyped( - data_type_size(m->input_type[0]) * bc->num_active_peft_tokens() * - in_dim); + assert(num_peft_requests <= 1); int tokens_previous_requests = 0; for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } - // Skip non-PEFT requests and PEFT forward-only requests - if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || - !bc->requestsInfo[i].peft_bwd) { + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + // FIXME: use the new approach to computing token offset tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - - if (m->input_type[0] == DT_FLOAT) { - checkCUDA(cudaMemcpyAsync( - m->input_activation, - input.get_float_ptr() + tokens_previous_requests * in_dim, - data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, - cudaMemcpyDeviceToDevice, - stream)); - } else if (m->input_type[0] == DT_HALF) { - checkCUDA(cudaMemcpyAsync( - m->input_activation, - input.get_half_ptr() + tokens_previous_requests * in_dim, - data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, - cudaMemcpyDeviceToDevice, - stream)); - } else { - assert(false && "unsupport datatype in layernorm"); + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = allocator->allocate_instance_untyped( + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim); + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + input.get_float_ptr() + tokens_previous_requests * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + input.get_half_ptr() + tokens_previous_requests * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } } } } diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index 1e0b652163..07137726d1 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -444,7 +444,8 @@ void ResidualRMSNorm::inference_task(Task const *task, m->output_type[1], regions[3], task->regions[3], FID_DATA, ctx, runtime); GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( m->weight_type[0], regions[4], task->regions[4], FID_DATA, ctx, 
runtime); - forward_kernel_wrapper(m, input1, input2, weight, residual_output, output); + inference_kernel_wrapper( + m, bc, input1, input2, weight, residual_output, output); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; @@ -594,6 +595,93 @@ void ResidualRMSNorm::backward_task(Task const *task, weight_grad); } +Legion::FutureMap + ResidualRMSNorm::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(RESIDUAL_RMSNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + // regions[0](I): RMS output_grad + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(0, FID_DATA); + // regions[2](I/O): residual input grad 0 + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(1, FID_DATA); + // regions[3](I/O): residual input grad 1 + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(2, FID_DATA); + // regions[4](I): gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(3, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): RMS output_grad + regions[1](I/O): Residual input 0 grad + regions[2](I/O): Residual input 1 grad + regions[3](I): weight +*/ +void ResidualRMSNorm::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == 4); + assert(regions.size() == 4); + ResidualRMSNormMeta const *m = *((ResidualRMSNormMeta **)task->local_args); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW residual_input0_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[1], + task->regions[1], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual_input1_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); + peft_bwd_kernel_wrapper( + m, output_grad, residual_input0_grad, residual_input1_grad, weight); +} + Op *ResidualRMSNorm::materialize(FFModel &ff, ParallelTensor inputs[], int num_inputs) const { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 82cf538f93..a1b5b07d8d 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -5433,6 +5433,36 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar 
registrar(RESIDUAL_RMSNORM_BWD_TASK_ID, + "Residual RMS Norm Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "RMS Norm Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(RESIDUAL_RMSNORM_PEFT_BWD_TASK_ID, + "Residual RMS Norm PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "RMS Norm PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } { TaskVariantRegistrar registrar(LAYERNORM_PEFT_BWD_TASK_ID, "layernorm_peft_bwd_task"); From d2f177d36af88254ae9f40df0098cf07a49aa222 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 19 Oct 2023 15:38:40 -0400 Subject: [PATCH 029/198] sigmoid_silu_multi backward and peft_bwd --- include/flexflow/model.h | 2 + include/flexflow/ops/sigmoid_silu_multi.h | 32 ++- src/ops/fused.cu | 3 +- src/ops/sigmoid_silu_multi.cc | 170 +++++++++++++++- src/ops/sigmoid_silu_multi.cu | 232 +++++++++++++++++++++- src/runtime/model.cc | 32 +++ 6 files changed, 466 insertions(+), 5 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 30d125a542..4e863952cc 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -113,6 +113,8 @@ enum TaskIDs { ADD_BIAS_RESIDUAL_LAYERNORM_INF_TASK_ID, SIGMOID_SILU_MULTI_INIT_TASK_ID, SIGMOID_SILU_MULTI_INF_TASK_ID, + SIGMOID_SILU_MULTI_BWD_TASK_ID, + SIGMOID_SILU_MULTI_PEFT_BWD_TASK_ID, LINEAR_INIT_TASK_ID, LINEAR_INIT_PARA_TASK_ID, LINEAR_INF_TASK_ID, diff --git a/include/flexflow/ops/sigmoid_silu_multi.h b/include/flexflow/ops/sigmoid_silu_multi.h index 604438260a..28e3bfed3e 100644 --- a/include/flexflow/ops/sigmoid_silu_multi.h +++ b/include/flexflow/ops/sigmoid_silu_multi.h @@ -1,5 +1,6 @@ #pragma once +#include "flexflow/batch_config.h" #include "flexflow/inference.h" #include "flexflow/model.h" #include "flexflow/utils/memory_allocator.h" @@ -27,6 +28,11 @@ class SigmoidSiluMulti : public Op { MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void backward(FFModel const &) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; Legion::FutureMap inference(FFModel const &, BatchConfigFuture const &, std::vector const &, @@ -55,6 +61,14 @@ class SigmoidSiluMulti : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; @@ -65,10 +79,24 @@ class SigmoidSiluMulti : public Op { T const *input2_ptr, T *output_ptr, ffStream_t stream); - static void inference_kernel_wrapper(SigmoidSiluMultiMeta const *m, + static void inference_kernel_wrapper(SigmoidSiluMultiMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input1, 
GenericTensorAccessorR const &input2, GenericTensorAccessorW const &output); + static void + backward_kernel_wrapper(SigmoidSiluMultiMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input1, + GenericTensorAccessorR const &input2, + GenericTensorAccessorW const &input1_grad, + GenericTensorAccessorW const &input2_grad); + static void + peft_bwd_kernel_wrapper(SigmoidSiluMultiMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input1_grad, + GenericTensorAccessorW const &input2_grad); }; class SigmoidSiluMultiMeta : public OpMeta { @@ -80,6 +108,8 @@ class SigmoidSiluMultiMeta : public OpMeta { public: Realm::RegionInstance reserveInst; + // PEFT related fields + void *input_activation; }; }; // namespace FlexFlow diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 692316c6d4..b9ce88e02c 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -617,8 +617,9 @@ __host__ void case OP_SIGMOID_SILU_MULTI: { assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_outputs[op] == 1); - SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta *)metas->meta[op]; + SigmoidSiluMultiMeta *m = (SigmoidSiluMultiMeta *)metas->meta[op]; SigmoidSiluMulti::inference_kernel_wrapper(m, + bc, my_input_accessor[0], my_input_accessor[1], my_output_accessor[0]); diff --git a/src/ops/sigmoid_silu_multi.cc b/src/ops/sigmoid_silu_multi.cc index 3b2ed7cef4..e36eb36d31 100644 --- a/src/ops/sigmoid_silu_multi.cc +++ b/src/ops/sigmoid_silu_multi.cc @@ -254,7 +254,173 @@ void SigmoidSiluMulti::forward(FFModel const &ff) { } void SigmoidSiluMulti::backward(FFModel const &ff) { - assert(false); + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_backward(ff, argmap); + IndexLauncher launcher(SIGMOID_SILU_MULTI_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + // output grad + launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + // input 1 + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(1, FID_DATA); + // input 2 + launcher.add_region_requirement(RegionRequirement(inputs[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[1]->region)); + launcher.add_field(2, FID_DATA); + // input 1 grad + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(3, FID_DATA); + // input 2 grad + launcher.add_region_requirement(RegionRequirement(inputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[1]->region_grad)); + launcher.add_field(4, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): output grad + regions[1](I): input 1 + regions[2](I): input 2 + regions[3](I/O): input 1 grad + regions[4](I/O): input 2 grad +*/ +void SigmoidSiluMulti::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + + assert(task->regions.size() == regions.size()); + assert(regions.size() == 5); + + SigmoidSiluMultiMeta *m = *((SigmoidSiluMultiMeta **)task->local_args); + + GenericTensorAccessorR output_grad 
= helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR input1 = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorR input2 = helperGetGenericTensorAccessorRO( + m->input_type[1], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorW input1_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); + GenericTensorAccessorW input2_grad = helperGetGenericTensorAccessorRW( + m->input_type[1], regions[4], task->regions[4], FID_DATA, ctx, runtime); + + SigmoidSiluMulti::backward_kernel_wrapper( + m, output_grad, input1, input2, input1_grad, input2_grad); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + SigmoidSiluMulti::save_inference_tensors_to_file( + m, + shard_id, + nullptr, + {output_grad, input1, input2}, + {}, + {input1_grad, input2_grad}); + } +} + +FutureMap + SigmoidSiluMulti::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(SIGMOID_SILU_MULTI_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + // output grad + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(0, FID_DATA); + // input 1 grad + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(1, FID_DATA); + // input 2 grad + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(2, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): output grad + regions[3](I/O): input 1 grad + regions[4](I/O): input 2 grad +*/ +void SigmoidSiluMulti::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + + assert(task->regions.size() == regions.size()); + assert(regions.size() == 3); + + SigmoidSiluMultiMeta *m = *((SigmoidSiluMultiMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() <= 0) { + return; + } + + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW input1_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW input2_grad = helperGetGenericTensorAccessorRW( + m->input_type[1], regions[2], task->regions[2], FID_DATA, ctx, runtime); + + SigmoidSiluMulti::peft_bwd_kernel_wrapper( + m, bc, output_grad, input1_grad, input2_grad); } 
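For reference, the forward op whose gradients these tasks compute is out = silu(input1) * input2, with silu(x) = x * sigmoid(x). The sketch below is a minimal host-side reference for the analytic gradients, useful for cross-checking the CUDA/HIP SigmoidSiluMultiBackwardKernel; the function name and float-only signature are illustrative and not part of this patch.

#include <cmath>
#include <cstddef>

// Accumulates d(out)/d(input1) and d(out)/d(input2) for out = silu(x1) * x2.
static void sigmoid_silu_multi_backward_reference(std::size_t n,
                                                  float const *out_grad,
                                                  float const *x1,
                                                  float const *x2,
                                                  float *x1_grad,
                                                  float *x2_grad) {
  for (std::size_t i = 0; i < n; i++) {
    float const s = 1.0f / (1.0f + std::exp(-x1[i])); // sigmoid(x1)
    // d out / d x2 = silu(x1) = x1 * sigmoid(x1)
    x2_grad[i] += out_grad[i] * x1[i] * s;
    // d out / d x1 = x2 * silu'(x1) = x2 * s * (1 + x1 * (1 - s))
    x1_grad[i] += out_grad[i] * x2[i] * s * (1.0f + x1[i] * (1.0f - s));
  }
}

In the peft_bwd path, x1 and x2 correspond to the two halves of the cached m->input_activation buffer saved by inference_kernel_wrapper.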
FutureMap SigmoidSiluMulti::inference( @@ -347,7 +513,7 @@ void SigmoidSiluMulti::inference_task( assert(input1_domain == input2_domain); assert(input1_domain == output_domain); - SigmoidSiluMulti::inference_kernel_wrapper(m, input1, input2, output); + SigmoidSiluMulti::inference_kernel_wrapper(m, bc, input1, input2, output); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu index ea63dd5508..597f7ecdab 100644 --- a/src/ops/sigmoid_silu_multi.cu +++ b/src/ops/sigmoid_silu_multi.cu @@ -45,9 +45,34 @@ __global__ void SigmoidSiluMultiKernel(int num_elements, } } +template +__global__ void SigmoidSiluMultiBackwardKernel(int num_elements, + T const *output_grad_ptr, + T const *input1_ptr, + T const *input2_ptr, + T *input1_grad_ptr, + T *input2_grad_ptr) { + CUDA_KERNEL_LOOP(i, num_elements) { + float sigmoid_val = static_cast(input1_ptr[i]); + sigmoid_val = 1.0f / (1.0f + exp(-sigmoid_val)); + + T ss_grad_val = output_grad_ptr[i] * input2_ptr[i]; + input2_grad_ptr[i] += output_grad_ptr[i] * input1_ptr[i] * T(sigmoid_val); + + input1_grad_ptr[i] += ss_grad_val * T(sigmoid_val); + T sig_grad = ss_grad_val * input1_ptr[i]; + + float x1_grad_val = static_cast(sig_grad); + x1_grad_val = exp(-x1_grad_val) / + ((1.0f + exp(-sigmoid_val)) * (1.0f + exp(-sigmoid_val))); + input1_grad_ptr[i] += T(x1_grad_val); + } +} + /*static*/ void SigmoidSiluMulti::inference_kernel_wrapper( - SigmoidSiluMultiMeta const *m, + SigmoidSiluMultiMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input1, GenericTensorAccessorR const &input2, GenericTensorAccessorW const &output) { @@ -64,6 +89,77 @@ void SigmoidSiluMulti::inference_kernel_wrapper( cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } + + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + // FIXME: use the new approach to computing token offset + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + size_t input_tensor_size = + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim; + m->input_activation = + allocator->allocate_instance_untyped(2 * input_tensor_size); + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync(m->input_activation, + input1.get_float_ptr() + + tokens_previous_requests * in_dim, + input_tensor_size, + cudaMemcpyDeviceToDevice, + stream)); + checkCUDA(cudaMemcpyAsync( + (void *)((char *)m->input_activation + input_tensor_size), + input2.get_float_ptr() + tokens_previous_requests * in_dim, + input_tensor_size, 
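/* Layout note: m->input_activation holds 2 * input_tensor_size bytes, with
   input1's PEFT tokens in the first half and input2's in the second half;
   peft_bwd_kernel_wrapper later reads the two halves back at these same
   offsets (base and base + num_peft_tokens * in_dim elements). */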
+ cudaMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(cudaMemcpyAsync(m->input_activation, + input1.get_half_ptr() + + tokens_previous_requests * in_dim, + input_tensor_size, + cudaMemcpyDeviceToDevice, + stream)); + checkCUDA(cudaMemcpyAsync( + (void *)((char *)m->input_activation + input_tensor_size), + input2.get_half_ptr() + tokens_previous_requests * in_dim, + input_tensor_size, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + if (m->input_type[0] == DT_FLOAT) { SigmoidSiluMultiKernel<<profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + if (m->input_type[0] == DT_FLOAT) { + SigmoidSiluMultiBackwardKernel<<>>(output_grad.domain.get_volume(), + output_grad.get_float_ptr(), + input1.get_float_ptr(), + input2.get_float_ptr(), + input1_grad.get_float_ptr(), + input1_grad.get_float_ptr()); + } else if (m->input_type[0] == DT_HALF) { + SigmoidSiluMultiBackwardKernel<<>>(output_grad.domain.get_volume(), + output_grad.get_half_ptr(), + input1.get_half_ptr(), + input2.get_half_ptr(), + input1_grad.get_half_ptr(), + input2_grad.get_half_ptr()); + } else { + assert(false && "unsupport datatype in SigmoidSiluMulti"); + } + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[SigmoidSiluMulti] backward time (CF) = %.9fms\n", elapsed); + } +} + +/*static*/ +void SigmoidSiluMulti::peft_bwd_kernel_wrapper( + SigmoidSiluMultiMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input1_grad, + GenericTensorAccessorW const &input2_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + int num_elements = output_grad.domain.get_volume(); + assert(input1_grad.domain.get_volume() == num_elements); + assert(input2_grad.domain.get_volume() == num_elements); + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + int num_peft_requests = 0; + int num_peft_tokens = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + } + } + assert(num_peft_requests == 1); + assert(num_peft_tokens >= 1); + int in_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + + if (m->input_type[0] == DT_FLOAT) { + SigmoidSiluMultiBackwardKernel<<>>( + output_grad.domain.get_volume(), + output_grad.get_float_ptr(), + static_cast(m->input_activation), + static_cast(m->input_activation) + + num_peft_tokens * in_dim, + input1_grad.get_float_ptr(), + input1_grad.get_float_ptr()); + } else if (m->input_type[0] == DT_HALF) { + SigmoidSiluMultiBackwardKernel<<>>( + output_grad.domain.get_volume(), + output_grad.get_half_ptr(), + static_cast(m->input_activation), + static_cast(m->input_activation) + + num_peft_tokens * in_dim, + input1_grad.get_half_ptr(), + input2_grad.get_half_ptr()); + } else { + assert(false && "unsupport datatype in SigmoidSiluMulti"); + } + if (m->profiling) { + cudaEventRecord(t_end, stream); + 
checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[SigmoidSiluMulti] peft_bwd time (CF) = %.9fms\n", elapsed); + } +} + }; // namespace FlexFlow diff --git a/src/runtime/model.cc b/src/runtime/model.cc index a1b5b07d8d..3ab1049f4a 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -5328,6 +5328,38 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(SIGMOID_SILU_MULTI_BWD_TASK_ID, + "SigmoidSiluMulti Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "SigmoidSiluMulti Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + { + TaskVariantRegistrar registrar(SIGMOID_SILU_MULTI_PEFT_BWD_TASK_ID, + "SigmoidSiluMulti PEFT Bwd"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "SigmoidSiluMulti PEFT Bwd Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // rms norm task { TaskVariantRegistrar registrar(RMSNORM_INIT_TASK_ID, "rmsnorm_init_task"); From 8b1f76b2c03652d1fb0d977ef488df2613d6f79b Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 19 Oct 2023 15:54:00 -0400 Subject: [PATCH 030/198] hip_rocm update --- src/ops/sigmoid_silu_multi.cpp | 263 ++++++++++++++++++++++++++++++--- 1 file changed, 242 insertions(+), 21 deletions(-) diff --git a/src/ops/sigmoid_silu_multi.cpp b/src/ops/sigmoid_silu_multi.cpp index 0f48bf8126..ccd622ff17 100644 --- a/src/ops/sigmoid_silu_multi.cpp +++ b/src/ops/sigmoid_silu_multi.cpp @@ -34,36 +34,46 @@ SigmoidSiluMultiMeta::~SigmoidSiluMultiMeta(void) { } } -__device__ __forceinline__ float sigmoid_float(float x) { - return 1.0 / (1.0 + expf(-x)); -} - -__device__ __forceinline__ half sigmoid_half(half x) { - return (half)1.0 / ((half)1.0 + hexp(-x)); -} - -__global__ void SigmoidSiluMultiKernelFloat(int num_elements, - float const *input1_ptr, - float const *input2_ptr, - float *output_ptr) { +template +__global__ void SigmoidSiluMultiKernel(int num_elements, + T const *input1_ptr, + T const *input2_ptr, + T *output_ptr) { CUDA_KERNEL_LOOP(i, num_elements) { - output_ptr[i] = - input1_ptr[i] * sigmoid_float(input1_ptr[i]) * input2_ptr[i]; + float sigmoid_val = static_cast(input1_ptr[i]); + sigmoid_val = 1.0f / (1.0f + exp(-sigmoid_val)); + output_ptr[i] = input1_ptr[i] * T(sigmoid_val) * input2_ptr[i]; } } -__global__ void SigmoidSiluMultiKernelHalf(int num_elements, - half const *input1_ptr, - half const *input2_ptr, - half *output_ptr) { +template +__global__ void SigmoidSiluMultiBackwardKernel(int num_elements, + T const *output_grad_ptr, + T const *input1_ptr, + T const *input2_ptr, + T *input1_grad_ptr, + T *input2_grad_ptr) { CUDA_KERNEL_LOOP(i, num_elements) { - output_ptr[i] = input1_ptr[i] * sigmoid_half(input1_ptr[i]) * input2_ptr[i]; + float sigmoid_val = static_cast(input1_ptr[i]); + sigmoid_val = 1.0f / (1.0f + exp(-sigmoid_val)); + + T ss_grad_val = output_grad_ptr[i] * input2_ptr[i]; + input2_grad_ptr[i] += output_grad_ptr[i] * input1_ptr[i] * T(sigmoid_val); + + input1_grad_ptr[i] 
+= ss_grad_val * T(sigmoid_val); + T sig_grad = ss_grad_val * input1_ptr[i]; + + float x1_grad_val = static_cast(sig_grad); + x1_grad_val = exp(-x1_grad_val) / + ((1.0f + exp(-sigmoid_val)) * (1.0f + exp(-sigmoid_val))); + input1_grad_ptr[i] += T(x1_grad_val); } } /*static*/ void SigmoidSiluMulti::inference_kernel_wrapper( SigmoidSiluMultiMeta const *m, + BatchConfig const *bc, GenericTensorAccessorR const &input1, GenericTensorAccessorR const &input2, GenericTensorAccessorW const &output) { @@ -81,8 +91,78 @@ void SigmoidSiluMulti::inference_kernel_wrapper( checkCUDA(hipEventRecord(t_start, stream)); } + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + // FIXME: use the new approach to computing token offset + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + size_t input_tensor_size = + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim; + m->input_activation = + allocator->allocate_instance_untyped(2 * input_tensor_size); + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(hipMemcpyAsync(m->input_activation, + input1.get_float_ptr() + + tokens_previous_requests * in_dim, + input_tensor_size, + hipMemcpyDeviceToDevice, + stream)); + checkCUDA(hipMemcpyAsync( + (void *)((char *)m->input_activation + input_tensor_size), + input2.get_float_ptr() + tokens_previous_requests * in_dim, + input_tensor_size, + hipMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(hipMemcpyAsync(m->input_activation, + input1.get_half_ptr() + + tokens_previous_requests * in_dim, + input_tensor_size, + hipMemcpyDeviceToDevice, + stream)); + checkCUDA(hipMemcpyAsync( + (void *)((char *)m->input_activation + input_tensor_size), + input2.get_half_ptr() + tokens_previous_requests * in_dim, + input_tensor_size, + hipMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + if (m->input_type[0] == DT_FLOAT) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiKernelFloat), + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiKernel), GET_BLOCKS(num_elements), min(CUDA_NUM_THREADS, num_elements), 0, @@ -92,7 +172,7 @@ void SigmoidSiluMulti::inference_kernel_wrapper( input2.get_float_ptr(), output.get_float_ptr()); } else if (m->input_type[0] == DT_HALF) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiKernelHalf), + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiKernel), GET_BLOCKS(num_elements), min(CUDA_NUM_THREADS, num_elements), 0, @@ -116,4 +196,145 @@ void SigmoidSiluMulti::inference_kernel_wrapper( } } +/*static*/ +void 
SigmoidSiluMulti::backward_kernel_wrapper( + SigmoidSiluMultiMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input1, + GenericTensorAccessorR const &input2, + GenericTensorAccessorW const &input1_grad, + GenericTensorAccessorW const &input2_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + int num_elements = output_grad.domain.get_volume(); + assert(input1.domain.get_volume() == num_elements); + assert(input2.domain.get_volume() == num_elements); + assert(input1_grad.domain.get_volume() == num_elements); + assert(input2_grad.domain.get_volume() == num_elements); + + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + if (m->input_type[0] == DT_FLOAT) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiBackwardKernel), + GET_BLOCKS(num_elements), + min(CUDA_NUM_THREADS, num_elements), + 0, + stream, + output_grad.domain.get_volume(), + output_grad.get_float_ptr(), + input1.get_float_ptr(), + input2.get_float_ptr(), + input1_grad.get_float_ptr(), + input1_grad.get_float_ptr()); + } else if (m->input_type[0] == DT_HALF) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiBackwardKernel), + GET_BLOCKS(num_elements), + min(CUDA_NUM_THREADS, num_elements), + 0, + stream, + output_grad.domain.get_volume(), + output_grad.get_half_ptr(), + input1.get_half_ptr(), + input2.get_half_ptr(), + input1_grad.get_half_ptr(), + input2_grad.get_half_ptr()); + } else { + assert(false && "unsupport datatype in SigmoidSiluMulti"); + } + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[SigmoidSiluMulti] backward time (CF) = %.9fms\n", elapsed); + } +} + +/*static*/ +void SigmoidSiluMulti::peft_bwd_kernel_wrapper( + SigmoidSiluMultiMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input1_grad, + GenericTensorAccessorW const &input2_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + int num_elements = output_grad.domain.get_volume(); + assert(input1_grad.domain.get_volume() == num_elements); + assert(input2_grad.domain.get_volume() == num_elements); + + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + int num_peft_requests = 0; + int num_peft_tokens = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + } + } + assert(num_peft_requests == 1); + assert(num_peft_tokens >= 1); + int in_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + + if (m->input_type[0] == DT_FLOAT) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiBackwardKernel), + GET_BLOCKS(num_elements), + min(CUDA_NUM_THREADS, num_elements), + 0, + stream, + output_grad.domain.get_volume(), + output_grad.get_float_ptr(), + static_cast(m->input_activation), + static_cast(m->input_activation) + + num_peft_tokens * in_dim, + 
input1_grad.get_float_ptr(), + input1_grad.get_float_ptr()); + } else if (m->input_type[0] == DT_HALF) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiBackwardKernel), + GET_BLOCKS(num_elements), + min(CUDA_NUM_THREADS, num_elements), + 0, + stream, + output_grad.domain.get_volume(), + output_grad.get_half_ptr(), + static_cast(m->input_activation), + static_cast(m->input_activation) + + num_peft_tokens * in_dim, + input1_grad.get_half_ptr(), + input2_grad.get_half_ptr()); + } else { + assert(false && "unsupport datatype in SigmoidSiluMulti"); + } + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[SigmoidSiluMulti] peft_bwd time (CF) = %.9fms\n", elapsed); + } +} + }; // namespace FlexFlow From 84c391bfd0f9c026c43e933c0c5915a84d43119f Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Fri, 20 Oct 2023 00:31:35 -0400 Subject: [PATCH 031/198] support peft_bwd for fused layers --- config/config.linux | 2 +- src/ops/fused.cu | 31 ++++++++++++++----------------- 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/config/config.linux b/config/config.linux index dbf3d3dd01..3686237538 100755 --- a/config/config.linux +++ b/config/config.linux @@ -13,7 +13,7 @@ #INSTALL_DIR= # set build type -BUILD_TYPE=${BUILD_TYPE:-Debug} +BUILD_TYPE=${BUILD_TYPE:-Release} INFERENCE_TESTS=${INFERENCE_TESTS:-OFF} LIBTORCH_PATH=${LIBTORCH_PATH:-"$(realpath ../..)/libtorch"} diff --git a/src/ops/fused.cu b/src/ops/fused.cu index b9ce88e02c..eaf1831beb 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -1007,18 +1007,16 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, } case OP_RESIDUAL_RMS_NORM: { // TODO: implement me - assert(false); assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_weights[op] == 1); assert(fused->op_num_outputs[op] == 2); - // ResidualRMSNormMeta const *m = (ResidualRMSNormMeta - // *)metas->meta[op]; - // Kernels::ResidualRMSNorm::forward_kernel_wrapper(m, - // my_input_accessor[0], - // my_input_accessor[1], - // my_weight_accessor[0], - // my_output_accessor[0], - // my_output_accessor[1]); + ResidualRMSNormMeta const *m = (ResidualRMSNormMeta*)metas->meta[op]; + Kernels::ResidualRMSNorm::peft_bwd_kernel_wrapper( + m, + my_output_grad_accessor[0], + my_input_grad_accessor[0], + my_input_grad_accessor[1], + my_weight_accessor[0]); break; } case OP_INC_MULTIHEAD_SELF_ATTENTION: { @@ -1152,14 +1150,13 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, case OP_SIGMOID_SILU_MULTI: { assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_outputs[op] == 1); - // SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta - // *)metas->meta[op]; - // TODO: implement me - assert(false); - // SigmoidSiluMulti::inference_kernel_wrapper(m, - // my_input_accessor[0], - // my_input_accessor[1], - // my_output_accessor[0]); + SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta*)metas->meta[op]; + SigmoidSiluMulti::peft_bwd_kernel_wrapper( + m, + bc, + my_output_grad_accessor[0], + my_input_grad_accessor[0], + my_input_grad_accessor[1]); break; } case OP_SOFTMAX: { From 1cc723e3dfa0d9448a9d10224fb0927264b82292 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Fri, 20 Oct 2023 00:32:11 -0400 Subject: [PATCH 032/198] format --- src/ops/fused.cu | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/ops/fused.cu 
b/src/ops/fused.cu index eaf1831beb..3030b23830 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -1010,7 +1010,7 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_weights[op] == 1); assert(fused->op_num_outputs[op] == 2); - ResidualRMSNormMeta const *m = (ResidualRMSNormMeta*)metas->meta[op]; + ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op]; Kernels::ResidualRMSNorm::peft_bwd_kernel_wrapper( m, my_output_grad_accessor[0], @@ -1150,13 +1150,12 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, case OP_SIGMOID_SILU_MULTI: { assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_outputs[op] == 1); - SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta*)metas->meta[op]; - SigmoidSiluMulti::peft_bwd_kernel_wrapper( - m, - bc, - my_output_grad_accessor[0], - my_input_grad_accessor[0], - my_input_grad_accessor[1]); + SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta *)metas->meta[op]; + SigmoidSiluMulti::peft_bwd_kernel_wrapper(m, + bc, + my_output_grad_accessor[0], + my_input_grad_accessor[0], + my_input_grad_accessor[1]); break; } case OP_SOFTMAX: { From f1d5dc0ba0e66c5fc3a263a590dee6ccbe33f253 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 21 Oct 2023 06:21:36 -0400 Subject: [PATCH 033/198] residual layer norm bwd / peft_bwd --- include/flexflow/model.h | 4 + include/flexflow/ops/residual_layer_norm.h | 36 +- src/ops/fused.cu | 4 +- src/ops/residual_layer_norm.cc | 293 ++++++++++- src/ops/residual_layer_norm.cu | 556 ++++++++++++++++++++- src/ops/residual_rms_norm.cc | 8 +- src/runtime/model.cc | 32 ++ 7 files changed, 923 insertions(+), 10 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 4e863952cc..5d986c1329 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -109,8 +109,12 @@ enum TaskIDs { LAYERNORM_PEFT_BWD_TASK_ID, RESIDUAL_LAYERNORM_INIT_TASK_ID, RESIDUAL_LAYERNORM_INF_TASK_ID, + RESIDUAL_LAYERNORM_BWD_TASK_ID, + RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, ADD_BIAS_RESIDUAL_LAYERNORM_INIT_TASK_ID, ADD_BIAS_RESIDUAL_LAYERNORM_INF_TASK_ID, + ADD_BIAS_RESIDUAL_LAYERNORM_BWD_TASK_ID, + ADD_BIAS_RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, SIGMOID_SILU_MULTI_INIT_TASK_ID, SIGMOID_SILU_MULTI_INF_TASK_ID, SIGMOID_SILU_MULTI_BWD_TASK_ID, diff --git a/include/flexflow/ops/residual_layer_norm.h b/include/flexflow/ops/residual_layer_norm.h index 0e9be82125..35ddb171d4 100644 --- a/include/flexflow/ops/residual_layer_norm.h +++ b/include/flexflow/ops/residual_layer_norm.h @@ -40,6 +40,11 @@ class ResidualLayerNorm : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -65,6 +70,14 @@ class ResidualLayerNorm : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; @@ -78,7 +91,8 @@ class ResidualLayerNorm : public Op { T const *gamma_ptr, T const *beta_ptr, ffStream_t 
stream); - static void inference_kernel_wrapper(ResidualLayerNormMeta const *m, + static void inference_kernel_wrapper(ResidualLayerNormMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input, GenericTensorAccessorR const &residual1, GenericTensorAccessorR const &residual2, @@ -86,6 +100,24 @@ class ResidualLayerNorm : public Op { GenericTensorAccessorW &output, GenericTensorAccessorR const &gamma, GenericTensorAccessorR const &beta); + static void + backward_kernel_wrapper(ResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &added_output, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorW const &residual1_grad, + GenericTensorAccessorW const &residual2_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad); + + static void + peft_bwd_kernel_wrapper(ResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorW const &residual1_grad, + GenericTensorAccessorW const &residual2_grad, + GenericTensorAccessorR const &gamma); public: bool elementwise_affine, use_bias, use_two_residuals; @@ -107,6 +139,8 @@ class ResidualLayerNormMeta : public OpMeta { float eps; void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; Realm::RegionInstance reserveInst; + // PEFT related fields + void *input_activation; }; }; // namespace FlexFlow diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 3030b23830..255136099a 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -538,8 +538,7 @@ __host__ void } case OP_RESIDUAL_LAYERNORM: { assert(fused->op_num_outputs[op] == 2); - ResidualLayerNormMeta const *m = - (ResidualLayerNormMeta *)metas->meta[op]; + ResidualLayerNormMeta *m = (ResidualLayerNormMeta *)metas->meta[op]; if (m->use_two_residuals) { assert(fused->op_num_inputs[op] == 3); } else { @@ -566,6 +565,7 @@ __host__ void } } ResidualLayerNorm::inference_kernel_wrapper(m, + bc, my_input_accessor[0], my_input_accessor[1], residual2, diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index 7de40fb389..ce82ec6702 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -516,7 +516,296 @@ void ResidualLayerNorm::forward(FFModel const &ff) { } void ResidualLayerNorm::backward(FFModel const &ff) { - assert(false); + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_backward(ff, argmap); + IndexLauncher launcher(RESIDUAL_LAYERNORM_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + int field_id = 0; + // output_grad + launcher.add_region_requirement(RegionRequirement(outputs[1]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // added output + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(field_id++, FID_DATA); + // input grad + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // residual grad 1 + launcher.add_region_requirement(RegionRequirement(inputs[1]->part_grad, + 0 /*projection 
id*/, + READ_WRITE, + EXCLUSIVE, + inputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + if (use_two_residuals) { + // residual grad 2 + launcher.add_region_requirement(RegionRequirement(inputs[2]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[2]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + } + if (elementwise_affine) { + // gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(field_id++, FID_DATA); + // gamma_grad + launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[0]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + if (use_bias) { + // beta_grad + launcher.add_region_requirement( + RegionRequirement(weights[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + } + } + runtime->execute_index_space(ctx, launcher); +} + +void ResidualLayerNorm::backward_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == regions.size()); + ResidualLayerNormMeta const *m = + *((ResidualLayerNormMeta **)task->local_args); + assert(regions.size() == + 4 + m->use_two_residuals + + (m->elementwise_affine ? (m->use_bias ? 3 : 2) : 0)); + + int region_idx = 0, task_region_idx = 0; + + GenericTensorAccessorR output_grad = + helperGetGenericTensorAccessorRO(m->output_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR added_output = + helperGetGenericTensorAccessorRO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW input_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual1_grad = + helperGetGenericTensorAccessorRW(m->input_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual2_grad; + if (m->use_two_residuals) { + GenericTensorAccessorW residual2_grad = + helperGetGenericTensorAccessorRW(m->input_type[2], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + GenericTensorAccessorR gamma; + GenericTensorAccessorW gamma_grad, beta_grad; + if (m->elementwise_affine) { + assert(m->use_bias == (regions.size() == 6)); + gamma = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + gamma_grad = + helperGetGenericTensorAccessorRW(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + if (m->use_bias) { + beta_grad = + helperGetGenericTensorAccessorRW(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + } + ResidualLayerNorm::backward_kernel_wrapper(m, + output_grad, + added_output, + input_grad, + residual1_grad, + residual2_grad, + gamma, + gamma_grad, + beta_grad); +} + +Legion::FutureMap ResidualLayerNorm::peft_bwd( + FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap 
argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + int field_id = 0; + // output_grad + launcher.add_region_requirement(RegionRequirement(outputs[1]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // input grad + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // residual grad 1 + launcher.add_region_requirement(RegionRequirement(inputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + if (use_two_residuals) { + // residual grad 2 + launcher.add_region_requirement(RegionRequirement(inputs[2]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[2]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + } + if (elementwise_affine) { + // gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(field_id++, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} + +void ResidualLayerNorm::peft_bwd_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == regions.size()); + ResidualLayerNormMeta const *m = + *((ResidualLayerNormMeta **)task->local_args); + assert(regions.size() == + 4 + m->use_two_residuals + + (m->elementwise_affine ? (m->use_bias ? 
3 : 2) : 0)); + + int region_idx = 0, task_region_idx = 0; + + GenericTensorAccessorR output_grad = + helperGetGenericTensorAccessorRO(m->output_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW input_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual1_grad = + helperGetGenericTensorAccessorRW(m->input_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual2_grad; + if (m->use_two_residuals) { + GenericTensorAccessorW residual2_grad = + helperGetGenericTensorAccessorRW(m->input_type[2], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + GenericTensorAccessorR gamma; + if (m->elementwise_affine) { + assert(m->use_bias == (regions.size() == 6)); + gamma = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + ResidualLayerNorm::peft_bwd_kernel_wrapper( + m, output_grad, input_grad, residual1_grad, residual2_grad, gamma); } Op *ResidualLayerNorm::materialize(FFModel &ff, @@ -734,7 +1023,7 @@ void ResidualLayerNorm::inference_task( m->effective_num_elements * m->effective_batch_size); ResidualLayerNorm::inference_kernel_wrapper( - m, input, residual1, residual2, added_output, output, gamma, beta); + m, bc, input, residual1, residual2, added_output, output, gamma, beta); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index ea77f01f53..4bfac1887f 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -22,6 +22,7 @@ namespace FlexFlow { #define C10_WARP_SIZE 32 constexpr int kCUDABlockReduceNumThreads = 512; constexpr int kCUDANumThreads = 256; +constexpr int kColwiseReduceTileSize = 32; ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, ResidualLayerNorm const *ln, @@ -73,6 +74,23 @@ __inline__ __device__ T WarpReduceSum(T val) { return val; } +template +__inline__ __device__ T BlockReduceSum(T val, T *shared) { + int const lid = threadIdx.x % C10_WARP_SIZE; + int const wid = threadIdx.x / C10_WARP_SIZE; + val = WarpReduceSum(val); + __syncthreads(); + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? 
shared[lid] : T(0); + if (wid == 0) { + val = WarpReduceSum(val); + } + return val; +} + template __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { int const lid = threadIdx.x % C10_WARP_SIZE; @@ -186,7 +204,8 @@ void ResidualLayerNorm::inference_kernel(ResidualLayerNormMeta const *m, /*static*/ void ResidualLayerNorm::inference_kernel_wrapper( - ResidualLayerNormMeta const *m, + ResidualLayerNormMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input, GenericTensorAccessorR const &residual1, GenericTensorAccessorR const &residual2, @@ -203,6 +222,63 @@ void ResidualLayerNorm::inference_kernel_wrapper( cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + // FIXME: use the new approach to computing token offset + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int in_dim = + added_output.domain.hi()[0] - added_output.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = allocator->allocate_instance_untyped( + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim); + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + added_output.get_float_ptr() + tokens_previous_requests * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + added_output.get_half_ptr() + tokens_previous_requests * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + if (m->input_type[0] == DT_FLOAT) { ResidualLayerNorm::inference_kernel( m, @@ -240,4 +316,482 @@ void ResidualLayerNorm::inference_kernel_wrapper( } } +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T *ds, T *db) { + using T_ACC = T; + __shared__ T_ACC ds_shared[C10_WARP_SIZE]; + __shared__ T_ACC db_shared[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + const T_ACC gamma_v = + gamma == nullptr ? 
T_ACC(1) : static_cast(gamma[j]); + sum1 += + static_cast(dY[index]) * static_cast(X[index]) * gamma_v; + sum2 += static_cast(dY[index]) * gamma_v; + } + sum1 = BlockReduceSum(sum1, ds_shared); + sum2 = BlockReduceSum(sum2, db_shared); + if (threadIdx.x == 0) { + ds[i] = sum1; + db[i] = sum2; + } +} + +template +__global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, + int64_t N, + T const *mean, + T const *rstd, + T const *ds, + T const *db, + T *c1, + T *c2) { + using T_ACC = T; + const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < M) { + const T_ACC s = T_ACC(1) / static_cast((int)N); + const T_ACC a = (db[index] * static_cast(mean[index]) - ds[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * s; + c1[index] = a; + c2[index] = -(a * static_cast(mean[index]) + + db[index] * static_cast(rstd[index]) * s); + } +} + +template +__global__ void GammaBetaBackwardSimpleCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index]) * + (static_cast(X[index]) - + static_cast(mean[i])) * + static_cast(rstd[i]); + sum2 += db == nullptr ? T_ACC(0) : static_cast(dY[index]); + } + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } +} + +template +__global__ void GammaBetaBackwardCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + __shared__ T_ACC g_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + __shared__ T_ACC b_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + T_ACC dg_sum1 = 0; + T_ACC dg_sum2 = 0; + T_ACC db_sum1 = 0; + T_ACC db_sum2 = 0; + if (j < N) { + for (int64_t i = threadIdx.y; i < M; i += blockDim.y * 2) { + const int64_t i1 = i; + const int64_t i2 = i + blockDim.y; + const int64_t index1 = i1 * N + j; + const int64_t index2 = i2 * N + j; + dg_sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index1]) * + (static_cast(X[index1]) - + static_cast(mean[i1])) * + static_cast(rstd[i1]); + db_sum1 += db == nullptr ? T_ACC(0) : static_cast(dY[index1]); + if (i2 < M) { + dg_sum2 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index2]) * + (static_cast(X[index2]) - + static_cast(mean[i2])) * + static_cast(rstd[i2]); + db_sum2 += db == nullptr ? 
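// Host reference for what ComputeGradientFusedParamsCUDAKernel computes per
// row: with ds = sum_j(dY*X*gamma) and db = sum_j(dY*gamma), the two
// coefficients fold the mean/rstd corrections of the layer-norm backward
// into a "c1*X + c2" form. Function name is illustrative only.
inline void fused_params_ref(float mean, float rstd, float ds, float db,
                             int N, float &c1, float &c2) {
  float s = 1.0f / (float)N;
  float a = (db * mean - ds) * rstd * rstd * rstd * s;
  c1 = a;
  c2 = -(a * mean + db * rstd * s);
}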
T_ACC(0) : static_cast(dY[index2]); + } + } + } + g_shared[threadIdx.y][threadIdx.x] = dg_sum1; + g_shared[threadIdx.y + blockDim.y][threadIdx.x] = dg_sum2; + b_shared[threadIdx.y][threadIdx.x] = db_sum1; + b_shared[threadIdx.y + blockDim.y][threadIdx.x] = db_sum2; + __syncthreads(); + T_ACC sum1 = g_shared[threadIdx.x][threadIdx.y]; + T_ACC sum2 = b_shared[threadIdx.x][threadIdx.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } + sum1 = g_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum2 = b_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y + blockDim.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } +} + +template +__device__ __inline__ void compute_gI(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual1, + T *dX_residual2, + int const N, + T *buf) { + auto const i1 = blockIdx.x; + const T mean_val = mean[i1]; + const T rstd_val = rstd[i1]; + T stats_x1{0}, stats_x2{0}; + constexpr int unroll = 4; + auto l = unroll * threadIdx.x; + T const *X_i = X + i1 * N; + T const *dY_i = dY + i1 * N; + T *dX_i = dX + i1 * N; + T *dX_residual1_i = dX_residual1 + i1 * N; + T *dX_residual2_i = + (dX_residual2 != nullptr) ? dX_residual2 + i1 * N : nullptr; + // vectorized reads don't improve perf, so use regular unrolling + + for (; l + unroll - 1 < N; l += blockDim.x * unroll) { +#pragma unroll + for (int k = 0; k < unroll; k++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l + k]) : T(1); + const T c_h = static_cast(X_i[l + k]); + const T c_loss = static_cast(dY_i[l + k]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + } + for (; l < N; l++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + const T c_h = static_cast(X_i[l]); + const T c_loss = static_cast(dY_i[l]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + + stats_x1 = BlockReduceSum(stats_x1, buf); + stats_x2 = BlockReduceSum(stats_x2, buf); + if (threadIdx.x == 0) { + buf[0] = stats_x1; + buf[1] = stats_x2; + } + __syncthreads(); + stats_x1 = buf[0]; + stats_x2 = buf[1]; + T fH = N; + T term1 = (T(1) / fH) * rstd_val; + + for (int l = threadIdx.x; l < N; l += blockDim.x) { + const T x = X_i[l]; + const T dy = dY_i[l]; + T gamma_val = (gamma != nullptr) ? 
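// Host reference for the column-wise reductions that the two
// GammaBetaBackward kernels above perform, useful as a correctness check:
//   dgamma[j] = sum_i dY[i,j] * (X[i,j] - mean[i]) * rstd[i]
//   dbeta[j]  = sum_i dY[i,j]
// Names are illustrative; the tiled kernel only exists to keep this
// reduction coalesced when M is large.
#include <cstdint>
#include <vector>

void gamma_beta_grad_ref(int64_t M, int64_t N,
                         std::vector<float> const &dY,
                         std::vector<float> const &X,
                         std::vector<float> const &mean,
                         std::vector<float> const &rstd,
                         std::vector<float> &dgamma,
                         std::vector<float> &dbeta) {
  for (int64_t j = 0; j < N; ++j) {
    float g = 0.f, b = 0.f;
    for (int64_t i = 0; i < M; ++i) {
      float dy = dY[i * N + j];
      g += dy * (X[i * N + j] - mean[i]) * rstd[i];
      b += dy;
    }
    dgamma[j] = g;
    dbeta[j] = b;
  }
}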
static_cast(gamma[l]) : T(1); + T f_grad_input = fH * gamma_val * dy; + f_grad_input -= (x - mean_val) * rstd_val * stats_x2; + f_grad_input -= stats_x1; + f_grad_input *= term1; + dX_i[l] += f_grad_input; + dX_residual1_i[l] += f_grad_input; + if (dX_residual2 != nullptr) { + dX_residual2_i[l] += f_grad_input; + } + } +} + +template +__global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual1, + T *dX_residual2, + int const N) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T *buf = reinterpret_cast(&s_data1); + + compute_gI(dY, X, mean, rstd, gamma, dX, dX_residual1, dX_residual2, N, buf); +} + +/*static*/ +template +void backward_kernel(ResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T const *added_output_ptr, + T *input_grad_ptr, + T *residual1_grad_ptr, + T *residual2_grad_ptr, + T const *gamma_ptr, + T *gamma_grad_ptr, + T *beta_grad_ptr, + cudaStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + added_output_ptr, + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); + const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; + ComputeGradientFusedParamsCUDAKernel + <<>>(M, + N, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual1_grad_ptr, + residual2_grad_ptr, + N); + + if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { + if (M < 512) { + // For small batch size, do colwise reduce directly + const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; + GammaBetaBackwardSimpleCUDAKernel + <<>>(M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } else { + const int64_t B = + (N + kColwiseReduceTileSize - 1) / kColwiseReduceTileSize; + constexpr int kThreadX = kColwiseReduceTileSize; + constexpr int kThreadY = kColwiseReduceTileSize / 2; + GammaBetaBackwardCUDAKernel + <<>>( + M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } + } +} + +/*static*/ +void ResidualLayerNorm::backward_kernel_wrapper( + ResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &added_output, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorW const &residual1_grad, + GenericTensorAccessorW const &residual2_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + if (m->output_type[0] == DT_FLOAT) { + backward_kernel( + m, + output_grad.get_float_ptr(), + added_output.get_float_ptr(), + input_grad.get_float_ptr(), + 
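// Host reference for the per-row input gradient that compute_gI /
// layer_norm_grad_input_kernel above accumulate (dgamma/dbeta omitted).
// It uses the same two statistics, s1 = sum(dY*gamma) and
// s2 = sum(dY*gamma*(X-mean)*rstd); the function name is illustrative.
#include <cstddef>
#include <vector>

void layernorm_row_dx_ref(std::vector<float> const &dy,
                          std::vector<float> const &x,
                          std::vector<float> const &gamma, // empty => no affine
                          float mean, float rstd,
                          std::vector<float> &dx_accum) {
  size_t N = x.size();
  float s1 = 0.f, s2 = 0.f;
  for (size_t i = 0; i < N; ++i) {
    float g = gamma.empty() ? 1.f : gamma[i];
    s1 += dy[i] * g;
    s2 += dy[i] * g * (x[i] - mean) * rstd;
  }
  float inv_n = 1.f / (float)N;
  for (size_t i = 0; i < N; ++i) {
    float g = gamma.empty() ? 1.f : gamma[i];
    float grad =
        ((float)N * g * dy[i] - (x[i] - mean) * rstd * s2 - s1) * inv_n * rstd;
    dx_accum[i] += grad; // the kernels accumulate (+=) into existing grads
  }
}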
residual1_grad.get_float_ptr(), + m->use_two_residuals ? residual2_grad.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_float_ptr() + : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + backward_kernel( + m, + output_grad.get_half_ptr(), + added_output.get_half_ptr(), + input_grad.get_half_ptr(), + residual1_grad.get_half_ptr(), + m->use_two_residuals ? residual2_grad.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_half_ptr() + : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[ResidualLayerNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +/*static*/ +template +void peft_bwd_kernel(ResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T *residual1_grad_ptr, + T *residual2_grad_ptr, + T const *gamma_ptr, + cudaStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + static_cast(m->input_activation), + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); + const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; + ComputeGradientFusedParamsCUDAKernel + <<>>(M, + N, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + static_cast(m->input_activation), + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual1_grad_ptr, + residual2_grad_ptr, + N); +} + +/*static*/ +void ResidualLayerNorm::peft_bwd_kernel_wrapper( + ResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorW const &residual1_grad, + GenericTensorAccessorW const &residual2_grad, + GenericTensorAccessorR const &gamma) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + if (m->output_type[0] == DT_FLOAT) { + peft_bwd_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + residual1_grad.get_float_ptr(), + m->use_two_residuals ? residual2_grad.get_float_ptr() + : nullptr, + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + peft_bwd_kernel(m, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + residual1_grad.get_half_ptr(), + m->use_two_residuals ? residual2_grad.get_half_ptr() + : nullptr, + m->elementwise_affine ? 
gamma.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[ResidualLayerNorm] peft_bwd time (CF) = %.2fms\n", elapsed); + } +} + }; // namespace FlexFlow diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index 07137726d1..e2bc29635a 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -499,11 +499,11 @@ void ResidualRMSNorm::backward(FFModel const &ff) { 0 /*mapper_id*/, outputs[0]->machine_view.hash()); // regions[0](I): RMS output_grad - launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, + launcher.add_region_requirement(RegionRequirement(outputs[1]->part_grad, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - outputs[0]->region_grad)); + outputs[1]->region_grad)); launcher.add_field(0, FID_DATA); // regions[1](I): residual output / RMS input launcher.add_region_requirement(RegionRequirement(outputs[0]->part, @@ -617,11 +617,11 @@ Legion::FutureMap 0 /*mapper_id*/, machine_view_hash); // regions[0](I): RMS output_grad - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - batch_outputs[0]->region)); + batch_outputs[1]->region)); launcher.add_field(0, FID_DATA); // regions[2](I/O): residual input grad 0 launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 3ab1049f4a..8939e9e74d 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -5259,6 +5259,38 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(RESIDUAL_LAYERNORM_BWD_TASK_ID, + "residual_layernorm_bwd_task"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "residual_layernorm_backward_task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + { + TaskVariantRegistrar registrar(RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, + "residual_layernorm_peft_bwd_task"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "residual_layernorm_peft_bwd_task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // AddBiasResidualLayerNorm task { TaskVariantRegistrar registrar(ADD_BIAS_RESIDUAL_LAYERNORM_INIT_TASK_ID, From 3b50e17b7964ac920511df2f12e06a2de6f766ca Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 21 Oct 2023 20:18:25 -0400 Subject: [PATCH 034/198] fix typo --- src/ops/kernels/conv_2d_kernels.cpp | 6 +++--- src/ops/kernels/conv_2d_kernels.cu | 6 +++--- src/ops/kernels/linear_kernels.cpp | 8 ++++---- src/ops/kernels/linear_kernels.cu | 8 ++++---- src/ops/kernels/lora_linear_kernels.cu | 8 ++++---- 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/ops/kernels/conv_2d_kernels.cpp b/src/ops/kernels/conv_2d_kernels.cpp index b7406f641d..85a94ad6be 100644 --- 
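// The model.cc hunks above repeat one registration pattern per new task.
// A sketch of a helper that could factor it out (the helper itself is
// hypothetical; the Legion calls mirror the ones used in the patch):
#include <vector>
#include "legion.h"

template <void (*TASK)(Legion::Task const *,
                       std::vector<Legion::PhysicalRegion> const &,
                       Legion::Context,
                       Legion::Runtime *)>
void register_gpu_leaf_task(Legion::TaskID tid, char const *name,
                            Legion::Runtime *runtime, bool pre_register,
                            bool enable_control_replication) {
  Legion::TaskVariantRegistrar registrar(tid, name);
  registrar.add_constraint(
      Legion::ProcessorConstraint(Legion::Processor::TOC_PROC));
  registrar.set_leaf();
  if (pre_register) {
    Legion::Runtime::preregister_task_variant<TASK>(registrar, name);
  } else {
    if (enable_control_replication) {
      registrar.global_registration = false;
    }
    runtime->register_task_variant<TASK>(registrar);
  }
}
// e.g. register_gpu_leaf_task<ResidualLayerNorm::peft_bwd_task>(
//          RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID,
//          "residual_layernorm_peft_bwd_task", runtime, pre_register,
//          enable_control_replication);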
a/src/ops/kernels/conv_2d_kernels.cpp +++ b/src/ops/kernels/conv_2d_kernels.cpp @@ -328,7 +328,7 @@ void backward_kernel(Conv2DMeta const *m, output_ptr, n * c * h * w); } - // Compute filter gradiant + // Compute filter gradient // NOTE: we use alpha for kernel_grad to accumulate gradients checkCUDNN(miopenConvolutionBackwardWeights(m->handle.dnn, &alpha, @@ -343,7 +343,7 @@ void backward_kernel(Conv2DMeta const *m, kernel_grad_ptr, m->handle.workSpace, m->handle.workSpaceSize)); - // Compute bias gradiant + // Compute bias gradient // NOTE: we use alpha for bias_grad to accumulate gradients if (bias_grad_ptr != NULL) { checkCUDNN(miopenConvolutionBackwardBias(m->handle.dnn, @@ -354,7 +354,7 @@ void backward_kernel(Conv2DMeta const *m, m->biasTensor, bias_grad_ptr)); } - // Compute data gradiant + // Compute data gradient // NOTE: we use alpha for input_grad to accumulate gradients if (input_grad_ptr != NULL) { checkCUDNN(miopenConvolutionBackwardData(m->handle.dnn, diff --git a/src/ops/kernels/conv_2d_kernels.cu b/src/ops/kernels/conv_2d_kernels.cu index 65dc38f142..661acdf732 100644 --- a/src/ops/kernels/conv_2d_kernels.cu +++ b/src/ops/kernels/conv_2d_kernels.cu @@ -311,7 +311,7 @@ void backward_kernel(Conv2DMeta const *m, reluBackward<<>>( output_grad_ptr, output_ptr, n * c * h * w); } - // Compute filter gradiant + // Compute filter gradient // NOTE: we use alpha for kernel_grad to accumulate gradients checkCUDNN(cudnnConvolutionBackwardFilter(m->handle.dnn, &alpha, @@ -326,7 +326,7 @@ void backward_kernel(Conv2DMeta const *m, &alpha, m->filterDesc, kernel_grad_ptr)); - // Compute bias gradiant + // Compute bias gradient // NOTE: we use alpha for bias_grad to accumulate gradients if (bias_grad_ptr != NULL) { checkCUDNN(cudnnConvolutionBackwardBias(m->handle.dnn, @@ -337,7 +337,7 @@ void backward_kernel(Conv2DMeta const *m, m->biasTensor, bias_grad_ptr)); } - // Compute data gradiant + // Compute data gradient // NOTE: we use alpha for input_grad to accumulate gradients if (input_grad_ptr != NULL) { checkCUDNN(cudnnConvolutionBackwardData(m->handle.dnn, diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp index 87b39126c5..e92cc77f3a 100644 --- a/src/ops/kernels/linear_kernels.cpp +++ b/src/ops/kernels/linear_kernels.cpp @@ -393,7 +393,7 @@ void peft_bwd_kernel(LinearMeta const *m, assert(m->activation == AC_MODE_NONE); } - // Compute data gradiant + // Compute data gradient // NOTE: we use alpha=1 for input_grad to accumulate gradients if (input_grad_ptr != NULL) { checkCUDA(hipblasGemmEx(m->handle.blas, @@ -455,7 +455,7 @@ void backward_kernel(LinearMeta const *m, // TODO: only support relu and sigmoid for now assert(m->activation == AC_MODE_NONE); } - // Compute weight gradiant + // Compute weight gradient // NOTE: we use alpha=1 for kernel_grad to accumulate gradients checkCUDA(hipblasGemmEx(m->handle.blas, HIPBLAS_OP_N, @@ -476,7 +476,7 @@ void backward_kernel(LinearMeta const *m, in_dim, compute_type, HIPBLAS_GEMM_DEFAULT)); - // Compute bias gradiant + // Compute bias gradient // NOTE: we use alpha=1 for bias_grad to accumulate gradients // use_bias = True if (bias_grad_ptr != NULL) { @@ -500,7 +500,7 @@ void backward_kernel(LinearMeta const *m, compute_type, HIPBLAS_GEMM_DEFAULT)); } - // Compute data gradiant + // Compute data gradient // NOTE: we use alpha=1 for input_grad to accumulate gradients if (input_grad_ptr != NULL) { checkCUDA(hipblasGemmEx(m->handle.blas, diff --git a/src/ops/kernels/linear_kernels.cu 
b/src/ops/kernels/linear_kernels.cu index edf3cdaf07..0aa6661187 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -479,7 +479,7 @@ void peft_bwd_kernel(LinearMeta const *m, assert(m->activation == AC_MODE_NONE); } - // Compute data gradiant + // Compute data gradient // NOTE: we use alpha=1 for input_grad to accumulate gradients if (input_grad_ptr != NULL) { checkCUDA(cublasGemmEx(m->handle.blas, @@ -542,7 +542,7 @@ void backward_kernel(LinearMeta const *m, // TODO: only support relu and sigmoid for now assert(m->activation == AC_MODE_NONE); } - // Compute weight gradiant + // Compute weight gradient // NOTE: we use alpha=1 for kernel_grad to accumulate gradients checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, @@ -583,7 +583,7 @@ void backward_kernel(LinearMeta const *m, assert(false && "Only L2 regularization is supported"); } - // Compute bias gradiant + // Compute bias gradient // NOTE: we use alpha=1 for bias_grad to accumulate gradients // use_bias = True if (bias_grad_ptr != NULL) { @@ -607,7 +607,7 @@ void backward_kernel(LinearMeta const *m, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } - // Compute data gradiant + // Compute data gradient // NOTE: we use alpha=1 for input_grad to accumulate gradients if (input_grad_ptr != NULL) { checkCUDA(cublasGemmEx(m->handle.blas, diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index eab98a24e7..a3fc071f11 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -295,7 +295,7 @@ void peft_bwd_kernel(LoraLinearMeta *m, LoraLinearWeight weight = m->model_weights[bc->requestsInfo[i].peft_model_id]; int rank = weight.rank; - // Compute w1's gradiant + // Compute w1's gradient // NOTE: we use alpha=1 for w1_grad to accumulate gradients checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, @@ -316,7 +316,7 @@ void peft_bwd_kernel(LoraLinearMeta *m, rank, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // Compute gradiants w.r.t. low_rank activation + // Compute gradients w.r.t. 
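// The four GEMMs in this LoRA peft_bwd path (whose comments are corrected
// above and below) follow the standard LoRA backward data flow. A scalar,
// single-token reference with illustrative names; w0 is the in_dim x rank
// factor, w1 the rank x out_dim factor, low_rank_act = x*w0 was saved in
// the forward pass, and any scaling factors are omitted:
#include <vector>

void lora_backward_ref(std::vector<float> const &x,            // [in_dim]
                       std::vector<float> const &dy,           // [out_dim]
                       std::vector<float> const &w0,           // [in_dim*rank]
                       std::vector<float> const &w1,           // [rank*out_dim]
                       std::vector<float> const &low_rank_act, // [rank]
                       std::vector<float> &dw0, std::vector<float> &dw1,
                       std::vector<float> &dx,
                       int in_dim, int rank, int out_dim) {
  std::vector<float> d_low_rank(rank, 0.f);
  for (int r = 0; r < rank; ++r) {
    for (int o = 0; o < out_dim; ++o) {
      dw1[r * out_dim + o] += low_rank_act[r] * dy[o]; // w1's gradient
      d_low_rank[r] += w1[r * out_dim + o] * dy[o];    // grad of low-rank act
    }
  }
  for (int i = 0; i < in_dim; ++i) {
    for (int r = 0; r < rank; ++r) {
      dw0[i * rank + r] += x[i] * d_low_rank[r]; // w0's gradient
      dx[i] += w0[i * rank + r] * d_low_rank[r]; // input gradient
    }
  }
}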
low_rank activation // and save the results to low_rank_activation // NOTE: we use alpha=1 for input_grad to accumulate gradients checkCUDA(cublasGemmEx(m->handle.blas, @@ -338,7 +338,7 @@ void peft_bwd_kernel(LoraLinearMeta *m, rank, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // Compute w0's gradiant + // Compute w0's gradient // NOTE: we use alpha=1 for kernel_grad to accumulate gradients checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, @@ -359,7 +359,7 @@ void peft_bwd_kernel(LoraLinearMeta *m, in_dim, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // Compute input gradiant + // Compute input gradient // NOTE: we use alpha=1 for input_grad to accumulate gradients if (input_grad_ptr != nullptr) { checkCUDA(cublasGemmEx(m->handle.blas, From bdb590b3cb8e8e132856f75438bf155745480b91 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 22 Oct 2023 00:11:33 -0400 Subject: [PATCH 035/198] add_bias_residual_layer_norm backward and peft_bwd --- .../ops/add_bias_residual_layer_norm.h | 60 +- src/ops/add_bias_residual_layer_norm.cc | 354 +++++++++-- src/ops/add_bias_residual_layer_norm.cpp | 595 +++++++++++++++++- src/ops/add_bias_residual_layer_norm.cu | 564 ++++++++++++++++- src/ops/fused.cpp | 14 +- src/ops/fused.cu | 14 +- src/ops/residual_layer_norm.cc | 18 +- src/runtime/model.cc | 34 + 8 files changed, 1568 insertions(+), 85 deletions(-) diff --git a/include/flexflow/ops/add_bias_residual_layer_norm.h b/include/flexflow/ops/add_bias_residual_layer_norm.h index bb470376c3..5c4a49f998 100644 --- a/include/flexflow/ops/add_bias_residual_layer_norm.h +++ b/include/flexflow/ops/add_bias_residual_layer_norm.h @@ -38,6 +38,11 @@ class AddBiasResidualLayerNorm : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -61,6 +66,14 @@ class AddBiasResidualLayerNorm : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; @@ -76,16 +89,51 @@ class AddBiasResidualLayerNorm : public Op { T const *gamma_ptr, T const *beta_ptr, ffStream_t stream); - static void inference_kernel_wrapper(AddBiasResidualLayerNormMeta const *m, - int attn_bias_dim, - int residual_volume, + static void inference_kernel_wrapper(AddBiasResidualLayerNormMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input, + GenericTensorAccessorR const &attn_bias, + GenericTensorAccessorR const &residual, GenericTensorAccessorW &added_output, GenericTensorAccessorW &output, - GenericTensorAccessorR const &residual, - GenericTensorAccessorR const &attn_bias, GenericTensorAccessorR const &gamma, GenericTensorAccessorR const &beta); + template + static void backward_kernel(AddBiasResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T const *added_output_ptr, + T *input_grad_ptr, + T *residual_grad_ptr, + T *attn_bias_grad_ptr, + T const *gamma_ptr, + T *gamma_grad_ptr, + T *beta_grad_ptr, + ffStream_t stream); + static void + 
backward_kernel_wrapper(AddBiasResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR &added_output, + GenericTensorAccessorW &input_grad, + GenericTensorAccessorW const &residual_grad, + GenericTensorAccessorW const &attn_bias_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad); + template + static void peft_bwd_kernel(AddBiasResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T *residual_grad_ptr, + T *attn_bias_grad_ptr, + T const *gamma_ptr, + ffStream_t stream); + static void + peft_bwd_kernel_wrapper(AddBiasResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW &input_grad, + GenericTensorAccessorW const &residual_grad, + GenericTensorAccessorW const &attn_bias_grad, + GenericTensorAccessorR const &gamma); public: bool elementwise_affine, use_bias; @@ -107,6 +155,8 @@ class AddBiasResidualLayerNormMeta : public OpMeta { float eps; void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; Realm::RegionInstance reserveInst; + // PEFT related fields + void *input_activation; }; }; // namespace FlexFlow diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index 42fbb3016a..5d19dffdbc 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -517,10 +517,6 @@ void AddBiasResidualLayerNorm::forward(FFModel const &ff) { assert(false); } -void AddBiasResidualLayerNorm::backward(FFModel const &ff) { - assert(false); -} - FutureMap AddBiasResidualLayerNorm::inference( FFModel const &ff, BatchConfigFuture const &bc, @@ -546,50 +542,51 @@ FutureMap AddBiasResidualLayerNorm::inference( 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); - // attn output + // input launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(0, FID_DATA); + // attn bias + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(1, FID_DATA); // residual launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, batch_inputs[1]->region)); - launcher.add_field(1, FID_DATA); - // added: attn_output + attn final bias + residual + launcher.add_field(2, FID_DATA); + // added_output: input + attn bias + residual launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[0]->region)); - launcher.add_field(2, FID_DATA); - // layer norm output + launcher.add_field(3, FID_DATA); + // output launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[1]->region)); - launcher.add_field(3, FID_DATA); - // attn final bias - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); launcher.add_field(4, FID_DATA); if (elementwise_affine) { + // gamma launcher.add_region_requirement(RegionRequirement(weights[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[1]->region)); launcher.add_field(5, FID_DATA); - if (use_bias) { + // beta launcher.add_region_requirement(RegionRequirement(weights[2]->part, 0 /*projection id*/, READ_ONLY, @@ 
-602,11 +599,11 @@ FutureMap AddBiasResidualLayerNorm::inference( } /* - regions[0](I): attn output - regions[1](I): residual - regions[2](O): added output (attn output + final attn bias + residual) - regions[3](O): layer norm output - regions[4](I): final attn bias + regions[0](I): input + regions[1](I): attn bias + regions[2](I): residual + regions[3](O): added output + regions[4](O): output regions[5](I): gamma regions[6](I): beta */ @@ -630,26 +627,28 @@ void AddBiasResidualLayerNorm::inference_task( GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR attn_bias = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorR residual = helperGetGenericTensorAccessorRO( - m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); + m->input_type[1], regions[2], task->regions[2], FID_DATA, ctx, runtime); GenericTensorAccessorW added_output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + m->output_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[1], regions[3], task->regions[3], FID_DATA, ctx, runtime); - GenericTensorAccessorR attn_bias = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime); + m->output_type[1], regions[4], task->regions[4], FID_DATA, ctx, runtime); + GenericTensorAccessorR gamma, beta; Domain in_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - Domain residual_domain = runtime->get_index_space_domain( + Domain attn_bias_domain = runtime->get_index_space_domain( ctx, task->regions[1].region.get_index_space()); - Domain added_out_domain = runtime->get_index_space_domain( + Domain residual_domain = runtime->get_index_space_domain( ctx, task->regions[2].region.get_index_space()); - Domain out_domain = runtime->get_index_space_domain( + Domain added_out_domain = runtime->get_index_space_domain( ctx, task->regions[3].region.get_index_space()); - Domain attn_bias_domain = runtime->get_index_space_domain( + Domain out_domain = runtime->get_index_space_domain( ctx, task->regions[4].region.get_index_space()); + Domain gamma_domain, beta_domain; assert(in_domain.get_volume() == out_domain.get_volume()); @@ -707,16 +706,7 @@ void AddBiasResidualLayerNorm::inference_task( } AddBiasResidualLayerNorm::inference_kernel_wrapper( - m, - (int)attn_bias_dim, - (int)residual_domain.get_volume(), - input, - added_output, - output, - residual, - attn_bias, - gamma, - beta); + m, bc, input, attn_bias, residual, added_output, output, gamma, beta); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); @@ -739,6 +729,288 @@ void AddBiasResidualLayerNorm::inference_task( } } +void AddBiasResidualLayerNorm::backward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_backward(ff, argmap); + IndexLauncher launcher(ADD_BIAS_RESIDUAL_LAYERNORM_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + int field_id = 0; + // output_grad + launcher.add_region_requirement(RegionRequirement(outputs[1]->part_grad, + 0 /*projection id*/, + READ_ONLY, + 
EXCLUSIVE, + outputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // added output + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(field_id++, FID_DATA); + // input grad + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // residual grad + launcher.add_region_requirement(RegionRequirement(inputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // attn bias + launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[0]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + if (elementwise_affine) { + // gamma + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(field_id++, FID_DATA); + // gamma_grad + launcher.add_region_requirement(RegionRequirement(weights[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + if (use_bias) { + // beta_grad + launcher.add_region_requirement( + RegionRequirement(weights[2]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[2]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + } + } + runtime->execute_index_space(ctx, launcher); +} + +void AddBiasResidualLayerNorm::backward_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == regions.size()); + AddBiasResidualLayerNormMeta *m = + *((AddBiasResidualLayerNormMeta **)task->local_args); + assert(regions.size() == + 5 + (m->elementwise_affine ? (m->use_bias ? 
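// Summary of the region layout that the backward launcher above sets up and
// that backward_task below unpacks, in add_field order (the enum is an
// illustrative summary, not part of the patch):
enum AddBiasResidualLayerNormBwdRegion {
  BWD_OUTPUT_GRAD = 0, // outputs[1]->region_grad, READ_ONLY
  BWD_ADDED_OUTPUT,    // outputs[0]->region,      READ_ONLY
  BWD_INPUT_GRAD,      // inputs[0]->region_grad,  READ_WRITE
  BWD_RESIDUAL_GRAD,   // inputs[1]->region_grad,  READ_WRITE
  BWD_ATTN_BIAS_GRAD,  // weights[0]->region_grad, READ_WRITE
  BWD_GAMMA,           // weights[1]->region,      only if elementwise_affine
  BWD_GAMMA_GRAD,      // weights[1]->region_grad, only if elementwise_affine
  BWD_BETA_GRAD        // weights[2]->region_grad, only if use_bias
};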
3 : 2) : 0)); + + int region_idx = 0, task_region_idx = 0; + + GenericTensorAccessorR output_grad = + helperGetGenericTensorAccessorRO(m->output_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR added_output = + helperGetGenericTensorAccessorRO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW input_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual_grad = + helperGetGenericTensorAccessorRW(m->input_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW attn_bias_grad = + helperGetGenericTensorAccessorRW(m->input_type[2], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR gamma; + GenericTensorAccessorW gamma_grad, beta_grad; + if (m->elementwise_affine) { + assert(m->use_bias == (regions.size() == 6)); + gamma = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + gamma_grad = + helperGetGenericTensorAccessorRW(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + if (m->use_bias) { + beta_grad = + helperGetGenericTensorAccessorRW(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + } + AddBiasResidualLayerNorm::backward_kernel_wrapper(m, + output_grad, + added_output, + input_grad, + residual_grad, + attn_bias_grad, + gamma, + gamma_grad, + beta_grad); +} + +Legion::FutureMap AddBiasResidualLayerNorm::peft_bwd( + FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(ADD_BIAS_RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + int field_id = 0; + // output_grad + launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[1]->region)); + launcher.add_field(field_id++, FID_DATA); + // input grad + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(field_id++, FID_DATA); + // residual grad + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(field_id++, FID_DATA); + // attn bias grad + launcher.add_region_requirement(RegionRequirement(batch_inputs[2]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[2]->region)); + launcher.add_field(field_id++, FID_DATA); + if (elementwise_affine) { + // gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(field_id++, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} + +void AddBiasResidualLayerNorm::peft_bwd_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == regions.size()); + AddBiasResidualLayerNormMeta const *m = + *((AddBiasResidualLayerNormMeta **)task->local_args); + assert(regions.size() == 4 + m->elementwise_affine); + + int region_idx = 0, task_region_idx = 0; + + GenericTensorAccessorR output_grad = + helperGetGenericTensorAccessorRO(m->output_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW input_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual_grad = + helperGetGenericTensorAccessorRW(m->input_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + + GenericTensorAccessorW attn_bias_grad = + helperGetGenericTensorAccessorRW(m->weight_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + + GenericTensorAccessorR gamma; + if (m->elementwise_affine) { + assert(m->use_bias == (regions.size() == 6)); + gamma = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( + m, output_grad, input_grad, residual_grad, attn_bias_grad, gamma); +} + bool AddBiasResidualLayerNorm::measure_operator_cost( Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const { return false; diff --git a/src/ops/add_bias_residual_layer_norm.cpp b/src/ops/add_bias_residual_layer_norm.cpp index 1add43ecd9..a0fdd1d1f7 100644 --- a/src/ops/add_bias_residual_layer_norm.cpp +++ b/src/ops/add_bias_residual_layer_norm.cpp @@ -23,6 +23,7 @@ namespace FlexFlow { #define C10_WARP_SIZE 32 constexpr int kCUDABlockReduceNumThreads = 512; constexpr int 
kCUDANumThreads = 256; +constexpr int kColwiseReduceTileSize = 32; AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta( FFHandler handle, @@ -74,6 +75,23 @@ __inline__ __device__ T WarpReduceSum(T val) { return val; } +template +__inline__ __device__ T BlockReduceSum(T val, T *shared) { + int const lid = threadIdx.x % C10_WARP_SIZE; + int const wid = threadIdx.x / C10_WARP_SIZE; + val = WarpReduceSum(val); + __syncthreads(); + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? shared[lid] : T(0); + if (wid == 0) { + val = WarpReduceSum(val); + } + return val; +} + template __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { int const lid = threadIdx.x % C10_WARP_SIZE; @@ -216,19 +234,77 @@ void AddBiasResidualLayerNorm::inference_kernel( /*static*/ void AddBiasResidualLayerNorm::inference_kernel_wrapper( - AddBiasResidualLayerNormMeta const *m, - int attn_bias_dim, - int residual_volume, + AddBiasResidualLayerNormMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input, + GenericTensorAccessorR const &attn_bias, + GenericTensorAccessorR const &residual, GenericTensorAccessorW &added_output, GenericTensorAccessorW &output, - GenericTensorAccessorR const &residual, - GenericTensorAccessorR const &attn_bias, GenericTensorAccessorR const &gamma, GenericTensorAccessorR const &beta) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + // FIXME: use the new approach to computing token offset + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int in_dim = + added_output.domain.hi()[0] - added_output.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = allocator->allocate_instance_untyped( + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim); + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + added_output.get_float_ptr() + tokens_previous_requests * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + added_output.get_half_ptr() + tokens_previous_requests * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + // inference kernel + int attn_bias_dim = attn_bias.domain.hi()[0] - attn_bias.domain.lo()[0] + 1; + int residual_volume = residual.domain.get_volume(); if (m->input_type[0] == DT_FLOAT) { 
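// How the wrapper above recovers sizes from Legion accessor domains:
// hi() and lo() are inclusive bounds, so the extent along dimension 0 is
// hi()[0] - lo()[0] + 1 (used for in_dim and attn_bias_dim), while
// residual_volume is simply domain.get_volume(). Helper name illustrative.
#include "legion.h"

inline int domain_extent0(Legion::Domain const &d) {
  return static_cast<int>(d.hi()[0] - d.lo()[0] + 1);
}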
AddBiasResidualLayerNorm::inference_kernel( m, @@ -260,4 +336,513 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( } } +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T *ds, T *db) { + using T_ACC = T; + __shared__ T_ACC ds_shared[C10_WARP_SIZE]; + __shared__ T_ACC db_shared[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + const T_ACC gamma_v = + gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); + sum1 += + static_cast(dY[index]) * static_cast(X[index]) * gamma_v; + sum2 += static_cast(dY[index]) * gamma_v; + } + sum1 = BlockReduceSum(sum1, ds_shared); + sum2 = BlockReduceSum(sum2, db_shared); + if (threadIdx.x == 0) { + ds[i] = sum1; + db[i] = sum2; + } +} + +template +__global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, + int64_t N, + T const *mean, + T const *rstd, + T const *ds, + T const *db, + T *c1, + T *c2) { + using T_ACC = T; + const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < M) { + const T_ACC s = T_ACC(1) / static_cast((int)N); + const T_ACC a = (db[index] * static_cast(mean[index]) - ds[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * s; + c1[index] = a; + c2[index] = -(a * static_cast(mean[index]) + + db[index] * static_cast(rstd[index]) * s); + } +} + +template +__global__ void GammaBetaBackwardSimpleCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index]) * + (static_cast(X[index]) - + static_cast(mean[i])) * + static_cast(rstd[i]); + sum2 += db == nullptr ? T_ACC(0) : static_cast(dY[index]); + } + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } +} + +template +__global__ void GammaBetaBackwardCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + __shared__ T_ACC g_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + __shared__ T_ACC b_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + T_ACC dg_sum1 = 0; + T_ACC dg_sum2 = 0; + T_ACC db_sum1 = 0; + T_ACC db_sum2 = 0; + if (j < N) { + for (int64_t i = threadIdx.y; i < M; i += blockDim.y * 2) { + const int64_t i1 = i; + const int64_t i2 = i + blockDim.y; + const int64_t index1 = i1 * N + j; + const int64_t index2 = i2 * N + j; + dg_sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index1]) * + (static_cast(X[index1]) - + static_cast(mean[i1])) * + static_cast(rstd[i1]); + db_sum1 += db == nullptr ? T_ACC(0) : static_cast(dY[index1]); + if (i2 < M) { + dg_sum2 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index2]) * + (static_cast(X[index2]) - + static_cast(mean[i2])) * + static_cast(rstd[i2]); + db_sum2 += db == nullptr ? 
T_ACC(0) : static_cast(dY[index2]); + } + } + } + g_shared[threadIdx.y][threadIdx.x] = dg_sum1; + g_shared[threadIdx.y + blockDim.y][threadIdx.x] = dg_sum2; + b_shared[threadIdx.y][threadIdx.x] = db_sum1; + b_shared[threadIdx.y + blockDim.y][threadIdx.x] = db_sum2; + __syncthreads(); + T_ACC sum1 = g_shared[threadIdx.x][threadIdx.y]; + T_ACC sum2 = b_shared[threadIdx.x][threadIdx.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } + sum1 = g_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum2 = b_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y + blockDim.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } +} + +template +__device__ __inline__ void compute_gI(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual1, + T *dX_residual2, + int const N, + T *buf) { + auto const i1 = blockIdx.x; + const T mean_val = mean[i1]; + const T rstd_val = rstd[i1]; + T stats_x1{0}, stats_x2{0}; + constexpr int unroll = 4; + auto l = unroll * threadIdx.x; + T const *X_i = X + i1 * N; + T const *dY_i = dY + i1 * N; + T *dX_i = dX + i1 * N; + T *dX_residual1_i = dX_residual1 + i1 * N; + T *dX_residual2_i = + (dX_residual2 != nullptr) ? dX_residual2 + i1 * N : nullptr; + // vectorized reads don't improve perf, so use regular unrolling + + for (; l + unroll - 1 < N; l += blockDim.x * unroll) { +#pragma unroll + for (int k = 0; k < unroll; k++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l + k]) : T(1); + const T c_h = static_cast(X_i[l + k]); + const T c_loss = static_cast(dY_i[l + k]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + } + for (; l < N; l++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + const T c_h = static_cast(X_i[l]); + const T c_loss = static_cast(dY_i[l]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + + stats_x1 = BlockReduceSum(stats_x1, buf); + stats_x2 = BlockReduceSum(stats_x2, buf); + if (threadIdx.x == 0) { + buf[0] = stats_x1; + buf[1] = stats_x2; + } + __syncthreads(); + stats_x1 = buf[0]; + stats_x2 = buf[1]; + T fH = N; + T term1 = (T(1) / fH) * rstd_val; + + for (int l = threadIdx.x; l < N; l += blockDim.x) { + const T x = X_i[l]; + const T dy = dY_i[l]; + T gamma_val = (gamma != nullptr) ? 
static_cast(gamma[l]) : T(1); + T f_grad_input = fH * gamma_val * dy; + f_grad_input -= (x - mean_val) * rstd_val * stats_x2; + f_grad_input -= stats_x1; + f_grad_input *= term1; + dX_i[l] += f_grad_input; + dX_residual1_i[l] += f_grad_input; + if (dX_residual2 != nullptr) { + dX_residual2_i[l] += f_grad_input; + } + } +} + +template +__global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual1, + T *dX_residual2, + int const N) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T *buf = reinterpret_cast(&s_data1); + + compute_gI(dY, X, mean, rstd, gamma, dX, dX_residual1, dX_residual2, N, buf); +} + +/*static*/ +template +void AddBiasResidualLayerNorm::backward_kernel( + AddBiasResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T const *added_output_ptr, + T *input_grad_ptr, + T *residual_grad_ptr, + T *attn_bias_grad_ptr, + T const *gamma_ptr, + T *gamma_grad_ptr, + T *beta_grad_ptr, + hipStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeInternalGradientsCUDAKernel), + M, + kCUDABlockReduceNumThreads, + 0, + stream, + N, + output_grad_ptr, + added_output_ptr, + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); + const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; + hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeGradientFusedParamsCUDAKernel), + B, + kCUDANumThreads, + 0, + stream, + M, + N, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + hipLaunchKernelGGL(HIP_KERNEL_NAME(layer_norm_grad_input_kernel), + blocks, + num_threads, + nshared, + stream, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual_grad_ptr, + attn_bias_grad_ptr, + N); + + if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { + if (M < 512) { + // For small batch size, do colwise reduce directly + const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; + hipLaunchKernelGGL(HIP_KERNEL_NAME(GammaBetaBackwardSimpleCUDAKernel), + B, + kCUDANumThreads, + 0, + stream, + M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } else { + const int64_t B = + (N + kColwiseReduceTileSize - 1) / kColwiseReduceTileSize; + constexpr int kThreadX = kColwiseReduceTileSize; + constexpr int kThreadY = kColwiseReduceTileSize / 2; + hipLaunchKernelGGL(HIP_KERNEL_NAME(GammaBetaBackwardCUDAKernel), + B, + dim3(kThreadX, kThreadY), + 0, + stream, + M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } + } +} + +/*static*/ +void AddBiasResidualLayerNorm::backward_kernel_wrapper( + AddBiasResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR &added_output, + GenericTensorAccessorW &input_grad, + GenericTensorAccessorW const &residual_grad, + GenericTensorAccessorW const &attn_bias_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + 
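// The HIP file above launches the same kernels through hipLaunchKernelGGL;
// the mapping to the CUDA triple-chevron launches in the .cu counterpart is
// mechanical (my_kernel is an illustrative name):
//   CUDA: my_kernel<T><<<grid, block, shmem_bytes, stream>>>(args...);
//   HIP : hipLaunchKernelGGL(HIP_KERNEL_NAME(my_kernel<T>),
//                            grid, block, shmem_bytes, stream, args...);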
GenericTensorAccessorW const &beta_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + if (m->output_type[0] == DT_FLOAT) { + AddBiasResidualLayerNorm::backward_kernel( + m, + output_grad.get_float_ptr(), + added_output.get_float_ptr(), + input_grad.get_float_ptr(), + residual_grad.get_float_ptr(), + attn_bias_grad.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_float_ptr() + : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + AddBiasResidualLayerNorm::backward_kernel( + m, + output_grad.get_half_ptr(), + added_output.get_half_ptr(), + input_grad.get_half_ptr(), + residual_grad.get_half_ptr(), + attn_bias_grad.get_half_ptr(), + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_half_ptr() + : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[AddBiasResidualLayerNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +/*static*/ +template +void AddBiasResidualLayerNorm::peft_bwd_kernel( + AddBiasResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T *residual_grad_ptr, + T *attn_bias_grad_ptr, + T const *gamma_ptr, + hipStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeInternalGradientsCUDAKernel), + M, + kCUDABlockReduceNumThreads, + 0, + stream, + N, + output_grad_ptr, + static_cast(m->input_activation), + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); + const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; + hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeGradientFusedParamsCUDAKernel), + B, + kCUDANumThreads, + 0, + stream, + M, + N, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + hipLaunchKernelGGL(HIP_KERNEL_NAME(layer_norm_grad_input_kernel), + blocks, + num_threads, + nshared, + stream, + output_grad_ptr, + static_cast(m->input_activation), + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual_grad_ptr, + attn_bias_grad_ptr, + N); +} + +/*static*/ +void AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( + AddBiasResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW &input_grad, + GenericTensorAccessorW const &residual_grad, + GenericTensorAccessorW const &attn_bias_grad, + GenericTensorAccessorR const &gamma) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + 
checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + if (m->output_type[0] == DT_FLOAT) { + peft_bwd_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + residual_grad.get_float_ptr(), + attn_bias_grad.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + peft_bwd_kernel(m, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + residual_grad.get_half_ptr(), + attn_bias_grad.get_half_ptr(), + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[ResidualLayerNorm] peft_bwd time (CF) = %.2fms\n", elapsed); + } +} + }; // namespace FlexFlow diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu index 07f1f2af6b..097ace3676 100644 --- a/src/ops/add_bias_residual_layer_norm.cu +++ b/src/ops/add_bias_residual_layer_norm.cu @@ -22,6 +22,7 @@ namespace FlexFlow { #define C10_WARP_SIZE 32 constexpr int kCUDABlockReduceNumThreads = 512; constexpr int kCUDANumThreads = 256; +constexpr int kColwiseReduceTileSize = 32; AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta( FFHandler handle, @@ -73,6 +74,23 @@ __inline__ __device__ T WarpReduceSum(T val) { return val; } +template +__inline__ __device__ T BlockReduceSum(T val, T *shared) { + int const lid = threadIdx.x % C10_WARP_SIZE; + int const wid = threadIdx.x / C10_WARP_SIZE; + val = WarpReduceSum(val); + __syncthreads(); + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? 
shared[lid] : T(0); + if (wid == 0) { + val = WarpReduceSum(val); + } + return val; +} + template __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { int const lid = threadIdx.x % C10_WARP_SIZE; @@ -189,14 +207,13 @@ void AddBiasResidualLayerNorm::inference_kernel( /*static*/ void AddBiasResidualLayerNorm::inference_kernel_wrapper( - AddBiasResidualLayerNormMeta const *m, - int attn_bias_dim, - int residual_volume, + AddBiasResidualLayerNormMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input, + GenericTensorAccessorR const &attn_bias, + GenericTensorAccessorR const &residual, GenericTensorAccessorW &added_output, GenericTensorAccessorW &output, - GenericTensorAccessorR const &residual, - GenericTensorAccessorR const &attn_bias, GenericTensorAccessorR const &gamma, GenericTensorAccessorR const &beta) { cudaStream_t stream; @@ -208,6 +225,65 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + // FIXME: use the new approach to computing token offset + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int in_dim = + added_output.domain.hi()[0] - added_output.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = allocator->allocate_instance_untyped( + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim); + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + added_output.get_float_ptr() + tokens_previous_requests * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + added_output.get_half_ptr() + tokens_previous_requests * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + // inference kernel + int attn_bias_dim = attn_bias.domain.hi()[0] - attn_bias.domain.lo()[0] + 1; + int residual_volume = residual.domain.get_volume(); if (m->input_type[0] == DT_FLOAT) { AddBiasResidualLayerNorm::inference_kernel( m, @@ -297,4 +373,482 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( } } +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T *ds, T *db) { + using T_ACC = T; + __shared__ T_ACC ds_shared[C10_WARP_SIZE]; + __shared__ T_ACC db_shared[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t j = 
threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + const T_ACC gamma_v = + gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); + sum1 += + static_cast(dY[index]) * static_cast(X[index]) * gamma_v; + sum2 += static_cast(dY[index]) * gamma_v; + } + sum1 = BlockReduceSum(sum1, ds_shared); + sum2 = BlockReduceSum(sum2, db_shared); + if (threadIdx.x == 0) { + ds[i] = sum1; + db[i] = sum2; + } +} + +template +__global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, + int64_t N, + T const *mean, + T const *rstd, + T const *ds, + T const *db, + T *c1, + T *c2) { + using T_ACC = T; + const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < M) { + const T_ACC s = T_ACC(1) / static_cast((int)N); + const T_ACC a = (db[index] * static_cast(mean[index]) - ds[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * s; + c1[index] = a; + c2[index] = -(a * static_cast(mean[index]) + + db[index] * static_cast(rstd[index]) * s); + } +} + +template +__global__ void GammaBetaBackwardSimpleCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index]) * + (static_cast(X[index]) - + static_cast(mean[i])) * + static_cast(rstd[i]); + sum2 += db == nullptr ? T_ACC(0) : static_cast(dY[index]); + } + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } +} + +template +__global__ void GammaBetaBackwardCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + __shared__ T_ACC g_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + __shared__ T_ACC b_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + T_ACC dg_sum1 = 0; + T_ACC dg_sum2 = 0; + T_ACC db_sum1 = 0; + T_ACC db_sum2 = 0; + if (j < N) { + for (int64_t i = threadIdx.y; i < M; i += blockDim.y * 2) { + const int64_t i1 = i; + const int64_t i2 = i + blockDim.y; + const int64_t index1 = i1 * N + j; + const int64_t index2 = i2 * N + j; + dg_sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index1]) * + (static_cast(X[index1]) - + static_cast(mean[i1])) * + static_cast(rstd[i1]); + db_sum1 += db == nullptr ? T_ACC(0) : static_cast(dY[index1]); + if (i2 < M) { + dg_sum2 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index2]) * + (static_cast(X[index2]) - + static_cast(mean[i2])) * + static_cast(rstd[i2]); + db_sum2 += db == nullptr ? 
T_ACC(0) : static_cast(dY[index2]); + } + } + } + g_shared[threadIdx.y][threadIdx.x] = dg_sum1; + g_shared[threadIdx.y + blockDim.y][threadIdx.x] = dg_sum2; + b_shared[threadIdx.y][threadIdx.x] = db_sum1; + b_shared[threadIdx.y + blockDim.y][threadIdx.x] = db_sum2; + __syncthreads(); + T_ACC sum1 = g_shared[threadIdx.x][threadIdx.y]; + T_ACC sum2 = b_shared[threadIdx.x][threadIdx.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } + sum1 = g_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum2 = b_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y + blockDim.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } +} + +template +__device__ __inline__ void compute_gI(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual1, + T *dX_residual2, + int const N, + T *buf) { + auto const i1 = blockIdx.x; + const T mean_val = mean[i1]; + const T rstd_val = rstd[i1]; + T stats_x1{0}, stats_x2{0}; + constexpr int unroll = 4; + auto l = unroll * threadIdx.x; + T const *X_i = X + i1 * N; + T const *dY_i = dY + i1 * N; + T *dX_i = dX + i1 * N; + T *dX_residual1_i = dX_residual1 + i1 * N; + T *dX_residual2_i = + (dX_residual2 != nullptr) ? dX_residual2 + i1 * N : nullptr; + // vectorized reads don't improve perf, so use regular unrolling + + for (; l + unroll - 1 < N; l += blockDim.x * unroll) { +#pragma unroll + for (int k = 0; k < unroll; k++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l + k]) : T(1); + const T c_h = static_cast(X_i[l + k]); + const T c_loss = static_cast(dY_i[l + k]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + } + for (; l < N; l++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + const T c_h = static_cast(X_i[l]); + const T c_loss = static_cast(dY_i[l]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + + stats_x1 = BlockReduceSum(stats_x1, buf); + stats_x2 = BlockReduceSum(stats_x2, buf); + if (threadIdx.x == 0) { + buf[0] = stats_x1; + buf[1] = stats_x2; + } + __syncthreads(); + stats_x1 = buf[0]; + stats_x2 = buf[1]; + T fH = N; + T term1 = (T(1) / fH) * rstd_val; + + for (int l = threadIdx.x; l < N; l += blockDim.x) { + const T x = X_i[l]; + const T dy = dY_i[l]; + T gamma_val = (gamma != nullptr) ? 
static_cast(gamma[l]) : T(1); + T f_grad_input = fH * gamma_val * dy; + f_grad_input -= (x - mean_val) * rstd_val * stats_x2; + f_grad_input -= stats_x1; + f_grad_input *= term1; + dX_i[l] += f_grad_input; + dX_residual1_i[l] += f_grad_input; + if (dX_residual2 != nullptr) { + dX_residual2_i[l] += f_grad_input; + } + } +} + +template +__global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual1, + T *dX_residual2, + int const N) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T *buf = reinterpret_cast(&s_data1); + + compute_gI(dY, X, mean, rstd, gamma, dX, dX_residual1, dX_residual2, N, buf); +} + +/*static*/ +template +void AddBiasResidualLayerNorm::backward_kernel( + AddBiasResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T const *added_output_ptr, + T *input_grad_ptr, + T *residual_grad_ptr, + T *attn_bias_grad_ptr, + T const *gamma_ptr, + T *gamma_grad_ptr, + T *beta_grad_ptr, + cudaStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + added_output_ptr, + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); + const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; + ComputeGradientFusedParamsCUDAKernel + <<>>(M, + N, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual_grad_ptr, + attn_bias_grad_ptr, + N); + + if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { + if (M < 512) { + // For small batch size, do colwise reduce directly + const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; + GammaBetaBackwardSimpleCUDAKernel + <<>>(M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } else { + const int64_t B = + (N + kColwiseReduceTileSize - 1) / kColwiseReduceTileSize; + constexpr int kThreadX = kColwiseReduceTileSize; + constexpr int kThreadY = kColwiseReduceTileSize / 2; + GammaBetaBackwardCUDAKernel + <<>>( + M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } + } +} + +/*static*/ +void AddBiasResidualLayerNorm::backward_kernel_wrapper( + AddBiasResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR &added_output, + GenericTensorAccessorW &input_grad, + GenericTensorAccessorW const &residual_grad, + GenericTensorAccessorW const &attn_bias_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + if (m->output_type[0] == DT_FLOAT) { + AddBiasResidualLayerNorm::backward_kernel( + m, + output_grad.get_float_ptr(), + 
added_output.get_float_ptr(), + input_grad.get_float_ptr(), + residual_grad.get_float_ptr(), + attn_bias_grad.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_float_ptr() + : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + AddBiasResidualLayerNorm::backward_kernel( + m, + output_grad.get_half_ptr(), + added_output.get_half_ptr(), + input_grad.get_half_ptr(), + residual_grad.get_half_ptr(), + attn_bias_grad.get_half_ptr(), + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_half_ptr() + : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[AddBiasResidualLayerNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +/*static*/ +template +void AddBiasResidualLayerNorm::peft_bwd_kernel( + AddBiasResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T *residual_grad_ptr, + T *attn_bias_grad_ptr, + T const *gamma_ptr, + cudaStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + static_cast(m->input_activation), + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); + const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; + ComputeGradientFusedParamsCUDAKernel + <<>>(M, + N, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + static_cast(m->input_activation), + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual_grad_ptr, + attn_bias_grad_ptr, + N); +} + +/*static*/ +void AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( + AddBiasResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW &input_grad, + GenericTensorAccessorW const &residual_grad, + GenericTensorAccessorW const &attn_bias_grad, + GenericTensorAccessorR const &gamma) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + if (m->output_type[0] == DT_FLOAT) { + peft_bwd_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + residual_grad.get_float_ptr(), + attn_bias_grad.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + peft_bwd_kernel(m, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + residual_grad.get_half_ptr(), + attn_bias_grad.get_half_ptr(), + m->elementwise_affine ? 
gamma.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[AddBiasResidualLayerNorm] peft_bwd time (CF) = %.2fms\n", elapsed); + } +} + }; // namespace FlexFlow diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index 5fa18013e9..77ca372d2c 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -976,7 +976,7 @@ __host__ void case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_outputs[op] == 2); - AddBiasResidualLayerNormMeta const *m = + AddBiasResidualLayerNormMeta *m = (AddBiasResidualLayerNormMeta *)metas->meta[op]; if (!m->elementwise_affine) { assert(fused->op_num_weights[op] == 1); // attn bias @@ -994,20 +994,14 @@ __host__ void beta = my_weight_accessor[2]; } } - Domain attn_bias_domain = my_weight_accessor[0].domain; - Domain residual_domain = my_input_accessor[1].domain; - int attn_bias_dim = - attn_bias_domain.hi()[0] - attn_bias_domain.lo()[0] + 1; - int residual_volume = residual_domain.get_volume(); AddBiasResidualLayerNorm::inference_kernel_wrapper( m, - attn_bias_dim, - residual_volume, + bc, my_input_accessor[0], + my_weight_accessor[0], + my_input_accessor[1], my_output_accessor[0], my_output_accessor[1], - my_input_accessor[1], - my_weight_accessor[0], gamma, beta); break; diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 255136099a..383e171662 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -578,7 +578,7 @@ __host__ void case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_outputs[op] == 2); - AddBiasResidualLayerNormMeta const *m = + AddBiasResidualLayerNormMeta *m = (AddBiasResidualLayerNormMeta *)metas->meta[op]; if (!m->elementwise_affine) { assert(fused->op_num_weights[op] == 1); // attn bias @@ -596,20 +596,14 @@ __host__ void beta = my_weight_accessor[2]; } } - Domain attn_bias_domain = my_weight_accessor[0].domain; - Domain residual_domain = my_input_accessor[1].domain; - int attn_bias_dim = - attn_bias_domain.hi()[0] - attn_bias_domain.lo()[0] + 1; - int residual_volume = residual_domain.get_volume(); AddBiasResidualLayerNorm::inference_kernel_wrapper( m, - attn_bias_dim, - residual_volume, + bc, my_input_accessor[0], + my_weight_accessor[0], + my_input_accessor[1], my_output_accessor[0], my_output_accessor[1], - my_input_accessor[1], - my_weight_accessor[0], gamma, beta); break; diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index ce82ec6702..6c1f4ef934 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -639,7 +639,7 @@ void ResidualLayerNorm::backward_task( runtime); GenericTensorAccessorW residual2_grad; if (m->use_two_residuals) { - GenericTensorAccessorW residual2_grad = + residual2_grad = helperGetGenericTensorAccessorRW(m->input_type[2], regions[region_idx++], task->regions[task_region_idx++], @@ -708,33 +708,33 @@ Legion::FutureMap ResidualLayerNorm::peft_bwd( machine_view_hash); int field_id = 0; // output_grad - launcher.add_region_requirement(RegionRequirement(outputs[1]->part_grad, + launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - outputs[1]->region_grad)); + batch_outputs[1]->region)); launcher.add_field(field_id++, 
FID_DATA); // input grad - launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_WRITE, EXCLUSIVE, - inputs[0]->region_grad)); + batch_inputs[0]->region)); launcher.add_field(field_id++, FID_DATA); // residual grad 1 - launcher.add_region_requirement(RegionRequirement(inputs[1]->part_grad, + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, 0 /*projection id*/, READ_WRITE, EXCLUSIVE, - inputs[1]->region_grad)); + batch_inputs[1]->region)); launcher.add_field(field_id++, FID_DATA); if (use_two_residuals) { // residual grad 2 - launcher.add_region_requirement(RegionRequirement(inputs[2]->part_grad, + launcher.add_region_requirement(RegionRequirement(batch_inputs[2]->part, 0 /*projection id*/, READ_WRITE, EXCLUSIVE, - inputs[2]->region_grad)); + batch_inputs[2]->region)); launcher.add_field(field_id++, FID_DATA); } if (elementwise_affine) { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 8939e9e74d..500146b42c 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -5327,6 +5327,40 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(ADD_BIAS_RESIDUAL_LAYERNORM_BWD_TASK_ID, + "AddBiasResidualLayerNorm Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant< + AddBiasResidualLayerNorm::backward_task>( + registrar, "AddBiasResidualLayerNorm Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + { + TaskVariantRegistrar registrar(ADD_BIAS_RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, + "AddBiasResidualLayerNorm PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant< + AddBiasResidualLayerNorm::peft_bwd_task>( + registrar, "AddBiasResidualLayerNorm PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // SigmoidSiluMulti task { TaskVariantRegistrar registrar(SIGMOID_SILU_MULTI_INIT_TASK_ID, From 60c0418301d9ccb935cc9f9807a4702170afd64a Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sun, 22 Oct 2023 14:50:24 +0000 Subject: [PATCH 036/198] implement IncMHA peft_bwd --- .../ops/inc_multihead_self_attention.h | 3 + src/ops/inc_multihead_self_attention.cu | 336 +++++++++++++++++- 2 files changed, 338 insertions(+), 1 deletion(-) diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index 76569de4cb..4fe79a1d87 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -218,6 +218,9 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { // typedef hipFloatComplex attFloatComplex; hipFloatComplex *complex_input; #endif + // PEFT specific fields + void *softmax_activation_buffer; + void *query_activation_buffer; }; }; // namespace FlexFlow diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 0e3d90e02c..58831292ae 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -455,7 +455,341 @@ void 
peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, DT const *output_grad_ptr, DT const *bias_ptr, cudaStream_t stream) { - assert(false); + assert(!m->offload); + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); + assert(data_type_size(m->output_type[0]) == sizeof(DT)); +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#else + cudaDataType_t compute_type = cublas_data_type; +#endif + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + int num_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int num_total_tokens = bc->requestsInfo[i].first_token_depth_in_request + + bc->requestsInfo[i].num_tokens_in_batch; + // Currently assume we are calculating gradients for all tokens + // of a request + assert(num_tokens == num_total_tokens); + int kt_block_size = m->kProjSize; + int kt_req_block_size = + kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int vt_block_size = m->vProjSize; + int vt_req_block_size = + vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + // Step 1: compute gradients before final projection + { + int m_ = m->vProjSize * m->num_q_heads; + int n_ = num_tokens; + int k_ = m->oProjSize; + int lda = k_; + int ldb = n_; + int ldc = m_; + float alpha = 1.0f, beta = 0.0f; + // matrix A: output projection weight + // matrix A's layout: [num_heads, vProjSize, oProjSize] + DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + + m->kProjSize * m->num_q_heads + + m->vProjSize * m->num_q_heads); + // matrix B: output gradients + // matrix B's layout: [num_new_tokens, oProjSize] + DT const *B = + output_grad_ptr + + bc->requestsInfo[i].first_token_offset_in_batch * m->oProjSize; + // matrix C: attn_heads gradients + // matrix C's layout: [num_new_tokens, num_heads, vProjSize] + DT *C = static_cast
(m->handle.workSpace); + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + B, + cublas_data_type, + ldb, + &beta, + C, + cublas_data_type, + ldc, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + // Step 2: compute gradients w.r.t. value + { + float alpha = 1.0f, beta = 0.0f; + // matrix A: attn_heads gradients + // matrix A's layout: [num_tokens, num_heads, vProjSize] + DT const *A = static_cast
(m->handle.workSpace); + // matrix B: qk_prods_softmax + // matrix B's layout: [num_heads, num_tokens, num_tokens] + DT const *B = static_cast
(m->qk_prods_softmax); + // matrix C: gradients for value (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, num_heads, qProjsize + kProjSize + + // vProjSize] + DT *C = + static_cast
(m->devQKVProjArray) + m->qProjSize + m->kProjSize; + int m_ = m->vProjSize; + int n_ = num_tokens; + int k_ = num_tokens; + int lda = m->vProjSize * m->num_q_heads; + int ldb = num_tokens; + int ldc = m->num_q_heads * (m->qProjSize + m->kProjSize + m->vProjSize); + int strideA = m->vProjSize; + int strideB = num_tokens * num_tokens; + int strideC = m->qProjSize + m->kProjSize + m->vProjSize; + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + // Step 3: compute gradients w.r.t. the qk_prods_softmax tensor + { + float alpha = 1.0f, beta = 0.0f; + int m_ = num_tokens; + int n_ = num_tokens; + int k_ = m->vProjSize; + int lda = m->vProjSize * m->num_q_heads; + int ldb = m->vProjSize * m->num_q_heads; + int ldc = num_tokens; + int strideA = m->vProjSize; + int strideB = m->vProjSize; + int strideC = num_tokens * num_tokens; + // matrix A: value cache + // matrix A's layout: [num_req, max_num_tokens, num_heads, vProjSize] + DT const *A = static_cast
(m->valueCache) + i * vt_req_block_size; + // matrix B: attn_heads gradients + // matrix B's layout: [num_new_tokens, num_heads, vProjSize] + DT const *B = static_cast
(m->handle.workSpace); + // matrix C: qk_prods_softmax gradients + // matrix C's layout: [num_heads, num_total_tokens, num_new_tokens] + DT *C = static_cast
(m->qk_prods_softmax); + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + // Step 4: softmax backpropagation + { + float alpha = 1.0f, beta = 0.0f; + int n_param = m->num_q_heads; + int c_param = num_tokens; + int h_param = 1; + int w_param = num_tokens; + checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, + CUDNN_TENSOR_NCHW, + cudnn_data_type, + n_param, + c_param, + h_param, + w_param)); + checkCUDNN(cudnnSoftmaxBackward(m->handle.dnn, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + m->qk_tensor, + m->softmax_activation_buffer, + m->qk_tensor, + m->qk_prods_softmax, + &beta, + m->qk_tensor, + m->qk_prods)); + // TODO: fill all elements above diagonal to force causal attention + } + // Step 5: compute gradients w.r.t. key + { + float alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = 1.0f / sqrt(m->kProjSize); + } + // matrix A: query activation (in query_activation_buffer) + // matrix A's layout: [num_tokens, num_heads, m->qProjSize] + DT const *A = static_cast
(m->query_activation_buffer); + // matrix B: gradients w.r.t. qk_prods + // matrix B's layout: [num_heads, num_tokens, num_tokens] + DT const *B = static_cast
(m->qk_prods); + // matrix C: gradients w.r.t. key (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, num_heads, qProjsize + kProjSize + + // vProjSize] + DT *C = static_cast
(m->devQKVProjArray) + m->qProjSize; + int m_ = m->kProjSize; + int n_ = num_tokens; + int k_ = num_tokens; + int lda = m->num_q_heads * m->qProjSize; + int ldb = num_tokens; + int ldc = m->num_q_heads * (m->qProjSize + m->kProjSize + m->vProjSize); + int strideA = m->qProjSize; + int strideB = num_tokens * num_tokens; + int strideC = m->qProjSize + m->kProjSize + m->vProjSize; + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + // Step 6: compute gradients w.r.t query + { + float alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = 1.0f / sqrt(m->kProjSize); + } + // matrix A: key cache + // matrix A's layout: [num_tokens, num_heads, m->kProjSize] + DT const *A = static_cast
(m->keyCache) + i * kt_req_block_size; + // matrix B: gradients w.r.t. qk_prods + // matrix B's layout: [num_heads, num_tokens, num_tokens] + DT const *B = static_cast
(m->qk_prods); + // matrix C: gradients w.r.t. query (saved as part of m->devQKVProjArray) + // matrix C's layout: + // [num_tokens, num_heads, qProjsize + kProjSize + vProjSize] + DT *C = static_cast
(m->devQKVProjArray); + int m_ = m->qProjSize; + int n_ = num_tokens; + int k_ = num_tokens; + int lda = m->kProjSize * m->num_q_heads; + int ldb = num_tokens; + int ldc = m->num_q_heads * (m->qProjSize + m->kProjSize + m->vProjSize); + int strideA = m->kProjSize; + int strideB = num_tokens * num_tokens; + int strideC = m->qProjSize + m->kProjSize + m->vProjSize; + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + // Step 7: compute gradients w.r.t. input + { + float alpha = 1.0f, beta = 0.0f; + if (!m->reset_input_grads[0]) { + beta = 1.0f; + } + // matrix A: QKV projection weights + // matrix A's layout: + // [(qProjSize + kProjSize + vProjSize) * num_q_heads, qSize] + DT const *A = weight_ptr; + // matrix B: gradients w.r.t. QKV (concatenated in devQKVArray) + // matrix B's layout: + // [num_tokens, num_heads, qProjsize + kProjSize + vProjSize] + DT const *B = static_cast
(m->devQKVProjArray); + // matrix C: gradients w.r.t. input + // matrix C's layout: [num_tokens, m->qSize] + DT *C = input_grad_ptr + + bc->requestsInfo[i].first_token_offset_in_batch * m->qSize; + int m_ = m->qSize; + int n_ = num_tokens; + int k_ = m->num_q_heads * (m->qProjSize + m->kProjSize + m->vProjSize); + int lda = m_; + int ldb = k_; + int ldc = m_; + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_N, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + B, + cublas_data_type, + ldb, + &beta, + C, + cublas_data_type, + ldc, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + } } } // namespace IncMultiHeadAttention From 509c54cec8bd7be1ffa99282f338afe3e9126b87 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sun, 22 Oct 2023 17:31:01 -0400 Subject: [PATCH 037/198] several bug fixes --- src/ops/kernels/lora_linear_kernels.cu | 4 ++-- src/ops/sigmoid_silu_multi.cpp | 10 ++++++++-- src/ops/sigmoid_silu_multi.cu | 10 ++++++++-- src/runtime/model.cc | 3 ++- 4 files changed, 20 insertions(+), 7 deletions(-) diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index a3fc071f11..8ea2455cd0 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -263,9 +263,9 @@ void peft_bwd_kernel(LoraLinearMeta *m, checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); DT alpha = 1.0f; cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); - cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); - assert(weight_type == ff_to_cuda_datatype(m->weight_type[1])); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); + assert(input_type == output_type); + cudaDataType_t weight_type = output_type; cudaDataType_t lr_actv_type = output_type; #if CUDA_VERSION >= 11000 // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance diff --git a/src/ops/sigmoid_silu_multi.cpp b/src/ops/sigmoid_silu_multi.cpp index ccd622ff17..0a9a814f5e 100644 --- a/src/ops/sigmoid_silu_multi.cpp +++ b/src/ops/sigmoid_silu_multi.cpp @@ -293,8 +293,14 @@ void SigmoidSiluMulti::peft_bwd_kernel_wrapper( num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; } } - assert(num_peft_requests == 1); - assert(num_peft_tokens >= 1); + if (num_peft_requests == 0) { + // No PEFT requests + return; + } else { + // Otherwise assume at most 1 peft request + assert(num_peft_requests == 1); + assert(num_peft_tokens >= 1); + } int in_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; if (m->input_type[0] == DT_FLOAT) { diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu index 597f7ecdab..bb78973f70 100644 --- a/src/ops/sigmoid_silu_multi.cu +++ b/src/ops/sigmoid_silu_multi.cu @@ -283,8 +283,14 @@ void SigmoidSiluMulti::peft_bwd_kernel_wrapper( num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; } } - assert(num_peft_requests == 1); - assert(num_peft_tokens >= 1); + if (num_peft_requests == 0) { + // No PEFT requests + return; + } else { + // Otherwise assume at most 1 peft request + assert(num_peft_requests == 1); + assert(num_peft_tokens >= 1); + } int in_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; if (m->input_type[0] == DT_FLOAT) { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 500146b42c..4ccfe25a97 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1486,7 +1486,8 @@ OpMeta::OpMeta(FFHandler _handle) #endif OpMeta::OpMeta(FFHandler _handle, Op const *op) - : 
profiling(op->profiling), inference_debugging(op->inference_debugging) { + : handle(_handle), profiling(op->profiling), + inference_debugging(op->inference_debugging) { for (int i = 0; i < op->numInputs; i++) { trainable_inputs[i] = op->trainable_inputs[i]; reset_input_grads[i] = op->reset_input_grads[i]; From bc9f538ac4ae7c274e96de95f14af374d20d1896 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Mon, 23 Oct 2023 01:04:06 +0000 Subject: [PATCH 038/198] [rms_norm] do not compute non-peft-bwd tokens in peft-bwd --- .../ops/kernels/residual_rms_norm_kernels.h | 1 + .../flexflow/ops/kernels/rms_norm_kernels.h | 1 + src/ops/fused.cu | 3 +- src/ops/kernels/lora_linear_kernels.cu | 24 ++--- src/ops/kernels/residual_rms_norm_kernels.cu | 90 +++++++++++-------- src/ops/kernels/rms_norm_kernels.cu | 54 +++++++---- src/ops/residual_layer_norm.cc | 1 + src/ops/residual_rms_norm.cc | 4 +- src/ops/rms_norm.cc | 4 +- 9 files changed, 107 insertions(+), 75 deletions(-) diff --git a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h index 4fbe34f83f..3091f83675 100644 --- a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h @@ -67,6 +67,7 @@ void backward_kernel_wrapper( GenericTensorAccessorR const &weight, GenericTensorAccessorW const &weight_grad); void peft_bwd_kernel_wrapper(ResidualRMSNormMeta const *m, + BatchConfig const *bc, GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &residual_input0_grad, GenericTensorAccessorW const &residual_input1_grad, diff --git a/include/flexflow/ops/kernels/rms_norm_kernels.h b/include/flexflow/ops/kernels/rms_norm_kernels.h index 72176f0383..92e5e04af3 100644 --- a/include/flexflow/ops/kernels/rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/rms_norm_kernels.h @@ -62,6 +62,7 @@ void backward_kernel_wrapper(RMSNormMeta const *m, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &weight_grad); void peft_bwd_kernel_wrapper(RMSNormMeta const *m, + BatchConfig const *bc, GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &weight); diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 383e171662..1f6614d341 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -994,19 +994,20 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, assert(fused->op_num_outputs[op] == 1); RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op]; Kernels::RMSNorm::peft_bwd_kernel_wrapper(m, + bc, my_output_grad_accessor[0], my_input_grad_accessor[0], my_weight_accessor[0]); break; } case OP_RESIDUAL_RMS_NORM: { - // TODO: implement me assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_weights[op] == 1); assert(fused->op_num_outputs[op] == 2); ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op]; Kernels::ResidualRMSNorm::peft_bwd_kernel_wrapper( m, + bc, my_output_grad_accessor[0], my_input_grad_accessor[0], my_input_grad_accessor[1], diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 8ea2455cd0..c26803bcee 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -166,18 +166,16 @@ void inference_kernel(LoraLinearMeta *m, } // Assert that we have at most one request that requires peft_bwd assert(num_peft_requests <= 1); - int tokens_previous_requests = 0; for (int i = 0; i < bc->max_requests_per_batch(); i++) { if 
(bc->request_completed[i]) { continue; } // Skip non-PEFT requests if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { - // FIXME: use the new approach to computing token offset - tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; assert(m->model_weights.find(bc->requestsInfo[i].peft_model_id) != m->model_weights.end()); LoraLinearWeight weight = @@ -192,7 +190,7 @@ void inference_kernel(LoraLinearMeta *m, data_type_size(m->input_type[1]) * num_peft_tokens * rank); // copy input activation checkCUDA(cudaMemcpyAsync(m->input_activation, - input_ptr + tokens_previous_requests * in_dim, + input_ptr + first_token_offset * in_dim, data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, cudaMemcpyDeviceToDevice, @@ -215,7 +213,7 @@ void inference_kernel(LoraLinearMeta *m, weight.w0_ptr, weight_type, in_dim, - input_ptr + tokens_previous_requests * in_dim, + input_ptr + first_token_offset * in_dim, input_type, in_dim, &beta, @@ -241,14 +239,12 @@ void inference_kernel(LoraLinearMeta *m, lr_actv_type, rank, &alpha, - output_ptr + tokens_previous_requests * out_dim, + output_ptr + first_token_offset * out_dim, output_type, out_dim, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - tokens_previous_requests += num_peft_tokens; } - assert(tokens_previous_requests == bc->num_active_tokens()); } template @@ -274,22 +270,20 @@ void peft_bwd_kernel(LoraLinearMeta *m, cudaDataType_t compute_type = CUDA_R_32F; #endif - int tokens_previous_requests = 0; for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } // Skip non-PEFT requests if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { - tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; continue; } // Skip PEFT forward-only requests if (!bc->requestsInfo[i].peft_bwd) { - tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; assert(m->model_weights.find(bc->requestsInfo[i].peft_model_id) != m->model_weights.end()); LoraLinearWeight weight = @@ -307,7 +301,7 @@ void peft_bwd_kernel(LoraLinearMeta *m, m->low_rank_activation, lr_actv_type, rank, - output_grad_ptr + tokens_previous_requests * out_dim, + output_grad_ptr + first_token_offset * out_dim, output_type, out_dim, &alpha, @@ -329,7 +323,7 @@ void peft_bwd_kernel(LoraLinearMeta *m, weight.w1_ptr, weight_type, rank, - output_grad_ptr + tokens_previous_requests * out_dim, + output_grad_ptr + first_token_offset * out_dim, output_type, out_dim, &alpha, @@ -376,15 +370,13 @@ void peft_bwd_kernel(LoraLinearMeta *m, lr_actv_type, rank, &alpha, - input_grad_ptr + tokens_previous_requests * in_dim, + input_grad_ptr + first_token_offset * in_dim, input_type, in_dim, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } - tokens_previous_requests += num_peft_tokens; } - assert(tokens_previous_requests == bc->num_active_tokens()); } } // namespace Internal diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index 53804c0b1b..de84e50e29 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -274,18 +274,16 @@ void inference_kernel_wrapper(ResidualRMSNormMeta *m, } assert(num_peft_requests <= 1); - 
int tokens_previous_requests = 0; for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } // Skip non-PEFT requests if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { - // FIXME: use the new approach to computing token offset - tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int first_token_offset = bc->requestsInfo[i].num_tokens_in_batch; int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { MemoryAllocator *allocator = m->handle.peft_activation_allocator; @@ -293,21 +291,19 @@ void inference_kernel_wrapper(ResidualRMSNormMeta *m, data_type_size(m->input_type[0]) * num_peft_tokens * in_dim); // copy input activation if (m->input_type[0] == DT_FLOAT) { - checkCUDA(cudaMemcpyAsync(m->input_activation, - residual_output.get_float_ptr() + - tokens_previous_requests * in_dim, - data_type_size(m->input_type[0]) * - num_peft_tokens * in_dim, - cudaMemcpyDeviceToDevice, - stream)); + checkCUDA(cudaMemcpyAsync( + m->input_activation, + residual_output.get_float_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); } else if (m->input_type[0] == DT_HALF) { - checkCUDA(cudaMemcpyAsync(m->input_activation, - residual_output.get_half_ptr() + - tokens_previous_requests * in_dim, - data_type_size(m->input_type[0]) * - num_peft_tokens * in_dim, - cudaMemcpyDeviceToDevice, - stream)); + checkCUDA(cudaMemcpyAsync( + m->input_activation, + residual_output.get_half_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); } else { assert(false && "unsupport datatype in layernorm"); } @@ -437,33 +433,48 @@ void backward_kernel(ResidualRMSNormMeta const *m, template void peft_bwd_kernel(ResidualRMSNormMeta const *m, + BatchConfig const *bc, T const *output_grad_ptr, T *residual_input0_grad_ptr, T *residual_input1_grad_ptr, T const *weight_ptr, cudaStream_t stream) { - const int64_t M = m->batch_size; - const int64_t N = m->num_elements; - T const *residual_output_rms_input_ptr = - static_cast(m->input_activation); - ComputeInternalGradientsCUDAKernel - <<>>( - N, - output_grad_ptr, - residual_output_rms_input_ptr, - weight_ptr, - static_cast(m->rms_ptr), - static_cast(m->norm_ptr)); + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + // Skip PEFT forward-only requests + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } - RMSNormBackwardCUDAKernel - <<>>(N, - output_grad_ptr, - residual_output_rms_input_ptr, - weight_ptr, - static_cast(m->rms_ptr), - static_cast(m->norm_ptr), - residual_input0_grad_ptr, - residual_input1_grad_ptr); + const int64_t M = bc->requestsInfo[i].num_tokens_in_batch; + const int64_t N = m->num_elements; + T const *residual_output_rms_input_ptr = + static_cast(m->input_activation); + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + + RMSNormBackwardCUDAKernel + <<>>(N, + output_grad_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + residual_input0_grad_ptr, + 
residual_input1_grad_ptr); + } } /* @@ -536,6 +547,7 @@ void backward_kernel_wrapper( regions[3](I): weight */ void peft_bwd_kernel_wrapper(ResidualRMSNormMeta const *m, + BatchConfig const *bc, GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &residual_input0_grad, GenericTensorAccessorW const &residual_input1_grad, @@ -554,6 +566,7 @@ void peft_bwd_kernel_wrapper(ResidualRMSNormMeta const *m, if (output_grad.data_type == DT_HALF) { peft_bwd_kernel(m, + bc, output_grad.get_half_ptr(), residual_input0_grad.get_half_ptr(), residual_input1_grad.get_half_ptr(), @@ -561,6 +574,7 @@ void peft_bwd_kernel_wrapper(ResidualRMSNormMeta const *m, stream); } else if (output_grad.data_type == DT_FLOAT) { peft_bwd_kernel(m, + bc, output_grad.get_float_ptr(), residual_input0_grad.get_float_ptr(), residual_input1_grad.get_float_ptr(), diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index ffb92613a5..8281506cbf 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -453,31 +453,47 @@ void backward_kernel_wrapper(RMSNormMeta const *m, template void peft_bwd_kernel(RMSNormMeta const *m, + BatchConfig const *bc, T const *output_grad_ptr, T *input_grad_ptr, T const *weight_ptr, cudaStream_t stream) { - const int64_t M = m->batch_size; - const int64_t N = m->num_elements; - ComputeInternalGradientsCUDAKernel - <<>>( - N, - output_grad_ptr, - static_cast(m->input_activation), - weight_ptr, - static_cast(m->rms_ptr), - static_cast(m->c2_ptr)); - RMSNormBackwardCUDAKernel - <<>>(N, - output_grad_ptr, - static_cast(m->input_activation), - weight_ptr, - static_cast(m->rms_ptr), - static_cast(m->c2_ptr), - input_grad_ptr); + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + // Skip PEFT forward-only requests + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + + const int64_t M = bc->requestsInfo[i].num_tokens_in_batch; + const int64_t N = m->num_elements; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + static_cast(m->input_activation), + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->c2_ptr)); + RMSNormBackwardCUDAKernel<<>>( + N, + output_grad_ptr, + static_cast(m->input_activation), + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->c2_ptr), + input_grad_ptr); + } } void peft_bwd_kernel_wrapper(RMSNormMeta const *m, + BatchConfig const *bc, GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &weight) { @@ -494,12 +510,14 @@ void peft_bwd_kernel_wrapper(RMSNormMeta const *m, if (output_grad.data_type == DT_HALF) { peft_bwd_kernel(m, + bc, output_grad.get_half_ptr(), input_grad.get_half_ptr(), weight.get_half_ptr(), stream); } else if (output_grad.data_type == DT_FLOAT) { peft_bwd_kernel(m, + bc, output_grad.get_float_ptr(), input_grad.get_float_ptr(), weight.get_float_ptr(), diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index 6c1f4ef934..754b6105fa 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -706,6 +706,7 @@ Legion::FutureMap ResidualLayerNorm::peft_bwd( false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.add_future(bc); int field_id = 0; // output_grad launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, diff --git a/src/ops/residual_rms_norm.cc 
b/src/ops/residual_rms_norm.cc index e2bc29635a..a6ed1dca9b 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -616,6 +616,7 @@ Legion::FutureMap false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.add_future(bc); // regions[0](I): RMS output_grad launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, @@ -660,6 +661,7 @@ void ResidualRMSNorm::peft_bwd_task(Task const *task, assert(task->regions.size() == 4); assert(regions.size() == 4); ResidualRMSNormMeta const *m = *((ResidualRMSNormMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW residual_input0_grad = @@ -679,7 +681,7 @@ void ResidualRMSNorm::peft_bwd_task(Task const *task, GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); peft_bwd_kernel_wrapper( - m, output_grad, residual_input0_grad, residual_input1_grad, weight); + m, bc, output_grad, residual_input0_grad, residual_input1_grad, weight); } Op *ResidualRMSNorm::materialize(FFModel &ff, diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index 332472e8e4..3c1b4d2570 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -529,6 +529,7 @@ Legion::FutureMap false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + launcher.add_future(bc); // regions[0](I): output_grad launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, 0 /*projection id*/, @@ -566,13 +567,14 @@ void RMSNorm::peft_bwd_task(Task const *task, assert(task->regions.size() == 3); assert(regions.size() == 3); RMSNormMeta const *m = *((RMSNormMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( m->input_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); - peft_bwd_kernel_wrapper(m, output_grad, input_grad, weight); + peft_bwd_kernel_wrapper(m, bc, output_grad, input_grad, weight); } void RMSNorm::serialize(Legion::Serializer &sez) const { From d8e92e9bfce26d897688260cfba1bb61cebb069a Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Mon, 23 Oct 2023 12:12:50 -0400 Subject: [PATCH 039/198] . 
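[Editor's sketch, not part of the patch] The register_new_peft_request() hunk below tokenizes each (prompt, completion) pair into the new per-request dataset, but as posted in this excerpt it appends the encoded prompt with push_back, prints an undefined `tokens` variable, and is missing a closing parenthesis in the make_pair call. The following self-contained sketch spells out the intended loop with the template arguments and the vector append written explicitly; the Tokenizer type and its Encode() signature are stand-ins, not the actual FlexFlow tokenizer API.

#include <cstdio>
#include <string>
#include <utility>
#include <vector>

using TokenId = int;

struct Tokenizer {
  // Stand-in: a real tokenizer returns subword ids for the input text.
  std::vector<TokenId> Encode(std::string const &text) const {
    return std::vector<TokenId>(text.size(), 0);
  }
};

// Tokenizes each (prompt, completion) pair, prepends BOS to the prompt, and
// skips samples whose combined length exceeds the sequence limit.
std::vector<std::pair<std::vector<TokenId>, std::vector<TokenId>>>
    tokenize_peft_dataset(
        std::vector<std::pair<std::string, std::string>> const &dataset,
        Tokenizer const &tokenizer,
        int bos_token_id,
        size_t max_sequence_length) {
  std::vector<std::pair<std::vector<TokenId>, std::vector<TokenId>>> samples;
  for (auto const &sample : dataset) {
    std::vector<TokenId> input_tokens;
    if (bos_token_id >= 0) {
      input_tokens.push_back(bos_token_id);
    }
    // Encode() returns a whole vector, so append it rather than push_back it.
    std::vector<TokenId> prompt_tokens = tokenizer.Encode(sample.first);
    input_tokens.insert(
        input_tokens.end(), prompt_tokens.begin(), prompt_tokens.end());
    std::vector<TokenId> output_tokens = tokenizer.Encode(sample.second);
    if (input_tokens.size() + output_tokens.size() > max_sequence_length) {
      std::printf("Warning: sample has %zu tokens, limit is %zu; skipping.\n",
                  input_tokens.size() + output_tokens.size(),
                  max_sequence_length);
      continue;
    }
    samples.emplace_back(std::move(input_tokens), std::move(output_tokens));
  }
  return samples;
}

A caller would feed the returned samples straight into the dataset field this commit adds to Request.
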
--- include/flexflow/request_manager.h | 8 ++- src/runtime/request_manager.cc | 80 +++++++++++++++++++++++++++--- 2 files changed, 79 insertions(+), 9 deletions(-) diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 637b9623f1..52da9a38ba 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -70,6 +70,11 @@ struct Request { std::vector beam_trees; }; +struct PEFTRequest : public Request { + std::vector < std::pair < std::vector, + std::vector dataset; +}; + // store the result of beam search struct BeamTree { struct treeLayer { @@ -227,7 +232,8 @@ class RequestManager { int bos_token_id; int eos_token_id; std::string output_filepath; - std::queue pending_request_queue; + std::queue pending_infr_request_queue; + std::queue pending_peft_request_queue; std::unordered_map all_requests; std::unordered_map request_generation_results; std::mutex request_queue_mutex; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index c0573a50a3..603be8b00d 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -212,7 +212,7 @@ RequestManager::RequestGuid } } - pending_request_queue.push(request); + pending_infr_request_queue.push(request); all_requests[request.guid] = request; if (verbose) { @@ -274,7 +274,71 @@ RequestManager::RequestGuid } } - pending_request_queue.push(request); + pending_infr_request_queue.push(request); + all_requests[request.guid] = request; + { + std::string output = "New request tokens:"; + output = "[" + std::to_string(request.guid) + "]" + output; + for (int i = 0; i < request.tokens.size(); i++) { + output = output + " " + std::to_string(request.tokens[i]); + } + log_req_mgr.print("%s", output.c_str()); + } + + GenerationResult gr; + gr.guid = request.guid; + gr.input_text = prompt; + gr.input_tokens = request.tokens; + gr.output_text = prompt; + gr.output_tokens = request.tokens; + request_generation_results[request.guid] = gr; + return request.guid; +} + +RequestManager::RequestGuid RequestManager::register_new_peft_request( + std::vector> const &dataset, + int max_sequence_length, + PEFTModelID peft_model_id) { + const std::lock_guard lock(request_queue_mutex); + // Add a new request + PEFTRequest request; + request.status = Request::PENDING; + request.guid = next_available_guid++; + request.max_sequence_length = max_sequence_length; + request.peft_model_id = peft_model_id; + for (auto const &sample : dataset) { + std::vector input_tokens; + if (bos_token_id >= 0 && model_type != ModelType::FALCON) { + input_tokens.push_back(bos_token_id); + } + input_tokens.push_back(this->tokenizer_->Encode(sample.first)); + std::vector output_tokens = + this->tokenizer_->Encode(sample.second); + if (input_tokens.size() + output_tokens.size() > + get_max_sequence_length()) { + std::cout << "Warning: too many tokens in sample, only load up to " + << get_max_sequence_length() << " tokens, but got " + << tokens.size() << ".\n"; + } else { + request.dataset.push_back(std::make_pair(input_tokens, output_tokens); + } + } + + // Currently don't support speculative inference for PEFT + assert(get_num_ssms() == 0); + if (get_num_ssms() == 0) { + std::cout << "No small speculative model registered, using incremental " + "decoding." 
+ << std::endl; + } else { + std::cout << "Num of models: " << get_num_ssms() << std::endl; + for (int i = 0; i < get_num_ssms(); i++) { + BeamTree beam_tree = BeamTree{}; + request.beam_trees.push_back(beam_tree); + } + } + + pending_infr_request_queue.push(request); all_requests[request.guid] = request; { std::string output = "New request tokens:"; @@ -368,10 +432,10 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, BatchConfig new_bc; for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { if (old_bc.request_completed[i]) { // add new requests to the next batch - if (!pending_request_queue.empty() && + if (!pending_infr_request_queue.empty() && new_bc.num_tokens < get_max_tokens_per_batch()) { - Request new_request = pending_request_queue.front(); - pending_request_queue.pop(); + Request new_request = pending_infr_request_queue.front(); + pending_infr_request_queue.pop(); // all_requests[new_request.guid] = new_request; new_bc.requestsInfo[i].first_token_depth_in_request = 0; new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; @@ -785,10 +849,10 @@ BeamSearchBatchConfig // Step 2: Initialize new request for (int i = 0; i < BeamSearchBatchConfig::max_requests_per_batch(); i++) { if (new_bc.request_completed[i]) { - if (!pending_request_queue.empty() && + if (!pending_infr_request_queue.empty() && new_bc.num_tokens < get_max_tokens_per_batch()) { - Request new_request = pending_request_queue.front(); - pending_request_queue.pop(); + Request new_request = pending_infr_request_queue.front(); + pending_infr_request_queue.pop(); // all_requests[new_request.guid] = new_request; new_bc.requestsInfo[i].first_token_depth_in_request = 0; new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; From 0a512d25f05e9b98e275cc83f60544b7c60cd9f0 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Mon, 23 Oct 2023 20:11:16 -0400 Subject: [PATCH 040/198] . 
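[Editor's sketch, not part of the patch] This commit adds a single-prompt FFModel::generate() overload and reworks the incr_decoding driver: the finetuning dataset is registered once through RequestManager::register_new_peft_request(), and inference prompts are then generated one at a time. Below is a compilable sketch of that driver flow with the FlexFlow classes replaced by minimal stand-ins; RequestManagerStub, FFModelStub, and the sample prompt are illustrative only, not the real API surface.

#include <string>
#include <utility>
#include <vector>

struct PEFTModelID { int id = 0; };
struct GenerationResult { std::string output_text; };

// Stand-in for RequestManager: only the call used below is modeled.
struct RequestManagerStub {
  void register_new_peft_request(
      std::vector<std::pair<std::string, std::string>> const &dataset,
      int max_sequence_length,
      PEFTModelID peft_model_id) {
    (void)dataset; (void)max_sequence_length; (void)peft_model_id;
  }
};

// Stand-in for FFModel: models the single-prompt generate() overload that
// this commit adds alongside the existing vector-of-prompts version.
struct FFModelStub {
  GenerationResult generate(std::string const &prompt, int max_seq_length) {
    (void)max_seq_length;
    return GenerationResult{prompt};
  }
};

int main() {
  RequestManagerStub rm;
  FFModelStub model;
  PEFTModelID peft_model_id;

  std::vector<std::string> prompts = {"The capital of France is"};

  // Self-supervised finetuning: each prompt doubles as its own target text,
  // so the dataset is a list of (input, output) string pairs.
  std::vector<std::pair<std::string, std::string>> dataset;
  for (auto const &text : prompts) {
    dataset.push_back(std::make_pair(text, text));
  }
  rm.register_new_peft_request(
      dataset, 256 /*max_sequence_length*/, peft_model_id);

  // Inference prompts now go through the per-prompt overload one at a time.
  for (auto const &prompt : prompts) {
    GenerationResult result =
        model.generate(prompt, 128 /*max_sequence_length*/);
    (void)result;
  }
  return 0;
}

Keeping finetuning registration separate from generate() matches the split introduced in the previous commit, which adds a dedicated pending_peft_request_queue alongside the renamed pending_infr_request_queue.
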
--- include/flexflow/model.h | 6 +- include/flexflow/request_manager.h | 14 ++-- inference/incr_decoding/incr_decoding.cc | 10 ++- src/ops/kernels/lora_linear_kernels.cu | 27 +++----- src/runtime/model.cu | 17 +---- src/runtime/request_manager.cc | 84 +++++++++++++++++++----- 6 files changed, 96 insertions(+), 62 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 5d986c1329..b4d2fe53af 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -830,7 +830,11 @@ class FFModel { // ======================================== // Inference APIs // ======================================== - GenerationResult generate(std::vector &prompts, + GenerationResult generate(std::string const &prompts, + int max_seq_length, + PEFTModelID peft_model_id = PEFTModelID::NO_ID); + + GenerationResult generate(std::vector const &prompts, int max_seq_length, PEFTModelID peft_model_id = PEFTModelID::NO_ID); diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 52da9a38ba..f93fc4c080 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -66,13 +66,11 @@ struct Request { Status status = PENDING; std::vector tokens; - std::vector beam_trees; -}; - -struct PEFTRequest : public Request { - std::vector < std::pair < std::vector, - std::vector dataset; + // PEFT field + std::vector, + std::vector>> + dataset; }; // store the result of beam search @@ -118,11 +116,11 @@ class RequestManager { FFModel *get_model(int model_id); GenerationResult generate_incr_decoding(FFModel *model, - std::vector &prompts, + std::vector const &prompts, int max_seq_length, PEFTModelID peft_model_id); GenerationResult generate_spec_infer(FFModel *model, - std::vector &prompts, + std::vector const &prompts, int max_seq_length, PEFTModelID peft_model_id); GenerationResult get_generation_result(RequestGuid const &guid); diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 461d71b23a..9f3a0a4a5f 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -272,14 +272,20 @@ void FlexFlow::top_level_task(Task const *task, /*allow_exceptions */ true, /*ignore_comments */ true); std::vector prompts; + std::vector> dataset; for (auto &prompt : prompt_json) { std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); total_num_requests++; prompts.push_back(text); + dataset.push_back(std::make_pair(text, text)); } - GenerationResult result = - model.generate(prompts, 128 /*max_sequence_length*/, peft_model_id); + rm->register_new_peft_request(dataset, 256 /*max_sequence_length*/, peft_model_id); + for (auto &prompt : prompts) { + GenerationResult result = model.generate(prompt, 128 /*max_sequence_length*/); + } + //GenerationResult result = + // model.generate(prompts, 128 /*max_sequence_length*/); } // Execution fence diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 8ea2455cd0..fd64c4710b 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -150,7 +150,7 @@ void inference_kernel(LoraLinearMeta *m, // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; #else - cudaDataType_t compute_type = input_type; + cudaDataType_t compute_type = output_type; #endif int num_peft_requests = 0; for (int i = 0; i < bc->max_requests_per_batch(); i++) { @@ 
-166,18 +166,16 @@ void inference_kernel(LoraLinearMeta *m, } // Assert that we have at most one request that requires peft_bwd assert(num_peft_requests <= 1); - int tokens_previous_requests = 0; for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } // Skip non-PEFT requests if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { - // FIXME: use the new approach to computing token offset - tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; assert(m->model_weights.find(bc->requestsInfo[i].peft_model_id) != m->model_weights.end()); LoraLinearWeight weight = @@ -192,7 +190,7 @@ void inference_kernel(LoraLinearMeta *m, data_type_size(m->input_type[1]) * num_peft_tokens * rank); // copy input activation checkCUDA(cudaMemcpyAsync(m->input_activation, - input_ptr + tokens_previous_requests * in_dim, + input_ptr + first_token_offset * in_dim, data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, cudaMemcpyDeviceToDevice, @@ -215,7 +213,7 @@ void inference_kernel(LoraLinearMeta *m, weight.w0_ptr, weight_type, in_dim, - input_ptr + tokens_previous_requests * in_dim, + input_ptr + first_token_offset * in_dim, input_type, in_dim, &beta, @@ -241,14 +239,12 @@ void inference_kernel(LoraLinearMeta *m, lr_actv_type, rank, &alpha, - output_ptr + tokens_previous_requests * out_dim, + output_ptr + first_token_offset * out_dim, output_type, out_dim, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - tokens_previous_requests += num_peft_tokens; } - assert(tokens_previous_requests == bc->num_active_tokens()); } template @@ -271,22 +267,19 @@ void peft_bwd_kernel(LoraLinearMeta *m, // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; #else - cudaDataType_t compute_type = CUDA_R_32F; + cudaDataType_t compute_type = output_type; #endif - int tokens_previous_requests = 0; for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } // Skip non-PEFT requests if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { - tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; continue; } // Skip PEFT forward-only requests if (!bc->requestsInfo[i].peft_bwd) { - tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; @@ -307,7 +300,7 @@ void peft_bwd_kernel(LoraLinearMeta *m, m->low_rank_activation, lr_actv_type, rank, - output_grad_ptr + tokens_previous_requests * out_dim, + output_grad_ptr, output_type, out_dim, &alpha, @@ -329,7 +322,7 @@ void peft_bwd_kernel(LoraLinearMeta *m, weight.w1_ptr, weight_type, rank, - output_grad_ptr + tokens_previous_requests * out_dim, + output_grad_ptr, output_type, out_dim, &alpha, @@ -376,15 +369,13 @@ void peft_bwd_kernel(LoraLinearMeta *m, lr_actv_type, rank, &alpha, - input_grad_ptr + tokens_previous_requests * in_dim, + input_grad_ptr, input_type, in_dim, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } - tokens_previous_requests += num_peft_tokens; } - assert(tokens_previous_requests == bc->num_active_tokens()); } } // namespace Internal diff --git a/src/runtime/model.cu b/src/runtime/model.cu index 0c69c9a600..754a6b18d7 100644 --- a/src/runtime/model.cu +++ b/src/runtime/model.cu @@ -160,23 +160,10 @@ FFHandler 
.only_kind(Memory::GPU_FB_MEM) .best_affinity_to(task->target_proc) .first(); - Realm::Rect<1, coord_t> bounds( - Realm::Point<1, coord_t>(0), - Realm::Point<1, coord_t>(info->peft_activation_reserve_space_size - 1)); - std::vector field_sizes; - field_sizes.push_back(sizeof(char)); Realm::RegionInstance workspaceInst; - Realm::RegionInstance::create_instance(workspaceInst, - gpu_mem, - bounds, - field_sizes, - 0, - Realm::ProfilingRequestSet()) - .wait(); - void *ptr = workspaceInst.pointer_untyped(0, sizeof(char)); handle.peft_activation_allocator = new MemoryAllocator(gpu_mem); - handle.peft_activation_allocator->register_reserved_work_space( - ptr, info->peft_activation_reserve_space_size); + handle.peft_activation_allocator->create_legion_instance( + workspaceInst, info->peft_activation_reserve_space_size); } if (info->peft_weight_reserve_space_size > 0) { diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 603be8b00d..5631ea6523 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -301,26 +301,26 @@ RequestManager::RequestGuid RequestManager::register_new_peft_request( PEFTModelID peft_model_id) { const std::lock_guard lock(request_queue_mutex); // Add a new request - PEFTRequest request; + Request request; request.status = Request::PENDING; request.guid = next_available_guid++; request.max_sequence_length = max_sequence_length; request.peft_model_id = peft_model_id; for (auto const &sample : dataset) { std::vector input_tokens; + input_tokens = this->tokenizer_->Encode(sample.first); if (bos_token_id >= 0 && model_type != ModelType::FALCON) { - input_tokens.push_back(bos_token_id); + input_tokens.insert(input_tokens.begin(), bos_token_id); } - input_tokens.push_back(this->tokenizer_->Encode(sample.first)); std::vector output_tokens = this->tokenizer_->Encode(sample.second); if (input_tokens.size() + output_tokens.size() > get_max_sequence_length()) { std::cout << "Warning: too many tokens in sample, only load up to " << get_max_sequence_length() << " tokens, but got " - << tokens.size() << ".\n"; + << input_tokens.size() + output_tokens.size() << ".\n"; } else { - request.dataset.push_back(std::make_pair(input_tokens, output_tokens); + request.dataset.push_back(std::make_pair(input_tokens, output_tokens)); } } @@ -338,23 +338,29 @@ RequestManager::RequestGuid RequestManager::register_new_peft_request( } } - pending_infr_request_queue.push(request); + pending_peft_request_queue.push(request); all_requests[request.guid] = request; { - std::string output = "New request tokens:"; - output = "[" + std::to_string(request.guid) + "]" + output; - for (int i = 0; i < request.tokens.size(); i++) { - output = output + " " + std::to_string(request.tokens[i]); + for (size_t r = 0; r < request.dataset.size(); r++) { + std::string input = "[" + std::to_string(r) + "] input:"; + std::string output = "[" + std::to_string(r) + "] output:"; + for (size_t i = 0; i < request.dataset[r].first.size(); i++) { + input = input + " " + std::to_string(request.dataset[r].first[i]); + } + for (size_t i = 0; i < request.dataset[r].second.size(); i++) { + output = output + " " + std::to_string(request.dataset[r].second[i]); + } + log_req_mgr.print("%s", input.c_str()); + log_req_mgr.print("%s", output.c_str()); } - log_req_mgr.print("%s", output.c_str()); } GenerationResult gr; gr.guid = request.guid; - gr.input_text = prompt; - gr.input_tokens = request.tokens; - gr.output_text = prompt; - gr.output_tokens = request.tokens; + //gr.input_text = prompt; 
+ //gr.input_tokens = request.tokens; + //gr.output_text = prompt; + //gr.output_tokens = request.tokens; request_generation_results[request.guid] = gr; return request.guid; } @@ -569,6 +575,40 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } } + // Step 3: add PEFT bwd requests + if (pending_peft_request_queue.size() > 0) { + Request &request = pending_peft_request_queue.front(); + assert(request.dataset.size() > 0); + int num_peft_tokens = request.dataset[0].first.size() + + request.dataset[0].second.size(); + if (num_peft_tokens + new_bc.num_active_tokens() <= get_max_tokens_per_batch()) { + // The last request slot is reserved for PEFT request + int peft_req_idx = get_max_requests_per_batch() - 1; + assert(new_bc.request_completed[peft_req_idx]); + new_bc.request_completed[peft_req_idx] = false; + new_bc.requestsInfo[peft_req_idx].first_token_depth_in_request = 0; + new_bc.requestsInfo[peft_req_idx].first_token_offset_in_batch = new_bc.num_tokens; + new_bc.requestsInfo[peft_req_idx].num_tokens_in_batch = num_peft_tokens; + new_bc.requestsInfo[peft_req_idx].max_sequence_length = request.max_sequence_length; + new_bc.requestsInfo[peft_req_idx].request_guid = request.guid; + new_bc.requestsInfo[peft_req_idx].peft_model_id = request.peft_model_id; + new_bc.requestsInfo[peft_req_idx].peft_bwd = true; + for (size_t i = 0; i < request.dataset[0].first.size(); i++) { + new_bc.tokensInfo[new_bc.num_tokens].token_id = request.dataset[0].first[i]; + new_bc.tokensInfo[new_bc.num_tokens].request_index = num_peft_tokens; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = i; + new_bc.num_tokens ++; + } + for (size_t i = 0; i < request.dataset[0].second.size(); i++) { + new_bc.tokensInfo[new_bc.num_tokens].token_id = request.dataset[0].second[i]; + new_bc.tokensInfo[new_bc.num_tokens].request_index = num_peft_tokens; + int depth = request.dataset[0].first.size() + i; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; + new_bc.num_tokens ++; + } + } + } + return new_bc; } @@ -1875,7 +1915,15 @@ std::vector> return merged_tree; } -GenerationResult FFModel::generate(std::vector &prompts, +GenerationResult FFModel::generate(std::string const &prompt, + int max_seq_length, + PEFTModelID peft_model_id) { + std::vector prompts; + prompts.push_back(prompt); + return generate(prompts, max_seq_length, peft_model_id); +} + +GenerationResult FFModel::generate(std::vector const &prompts, int max_seq_length, PEFTModelID peft_model_id) { RequestManager *rm = RequestManager::get_request_manager(); @@ -1995,7 +2043,7 @@ PEFTModelID FFModel::register_peft_model(LoraLinearConfig const mlp_first, /*static*/ GenerationResult RequestManager::generate_incr_decoding(FFModel *llm, - std::vector &prompts, + std::vector const &prompts, int max_seq_length, PEFTModelID peft_model_id) { InferenceManager *im = InferenceManager::get_inference_manager(); @@ -2056,7 +2104,7 @@ GenerationResult /*static*/ GenerationResult RequestManager::generate_spec_infer(FFModel *llm, - std::vector &prompts, + std::vector const &prompts, int max_seq_length, PEFTModelID peft_model_id) { InferenceManager *im = InferenceManager::get_inference_manager(); From 4ee710a76ee4f47b4574c57519e2b0fb96efaa6a Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Mon, 23 Oct 2023 20:24:58 -0400 Subject: [PATCH 041/198] Update the default cublas behavior when CUDA_VERSION is not specified --- src/ops/inc_multihead_self_attention.cpp | 14 ++++++++------ src/ops/inc_multihead_self_attention.cu | 12 ++++++------ 
src/ops/kernels/linear_kernels.cpp | 18 ++++++++++-------- src/ops/kernels/linear_kernels.cu | 12 ++++++------ src/ops/spec_inc_multihead_self_attention.cpp | 7 ++++--- src/ops/spec_inc_multihead_self_attention.cu | 6 +++--- src/ops/tree_inc_multihead_self_attention.cpp | 7 ++++--- src/ops/tree_inc_multihead_self_attention.cu | 6 +++--- 8 files changed, 44 insertions(+), 38 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index 37cc986f5e..d60386f927 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -257,10 +257,11 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, DT alpha = 1.0f, beta = 0.0f; assert(m->qSize == m->vSize && m->qSize == m->kSize); hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to HIPBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = HIPBLAS_COMPUTE_16F; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + hipblasDatatype_t compute_type = hipblas_data_type; #else + // TODO: currently use the hipblas_data_type + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; hipblasDatatype_t compute_type = hipblas_data_type; #endif // Compute (W^T)x matmul: einsum(ijkl,im->jmkl) @@ -509,10 +510,11 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + hipblasDatatype_t compute_type = hipblas_data_type; #else + // TODO: currently use the hipblas_data_type + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; hipblasDatatype_t compute_type = hipblas_data_type; #endif // int num_requests = bc->num_active_requests(); diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 3b24a5a324..7080cbf05b 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -238,11 +238,11 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, DT alpha = 1.0f, beta = 0.0f; assert(m->qSize == m->vSize && m->qSize == m->kSize); cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); -#if CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + cudaDataType_t compute_type = cublas_data_type; +#else // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else - cudaDataType_t compute_type = cublas_data_type; #endif // Compute (W^T)x matmul: einsum(ijkl,im->jmkl) // Weights: qSize x qProjSize x 3 x num_q_heads @@ -508,11 +508,11 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + cudaDataType_t compute_type = cublas_data_type; +#else // TODO: currently set the default to CUBLAS_COMPUTE_16F for best 
performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else - cudaDataType_t compute_type = cublas_data_type; #endif // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_tokens(); diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp index 231ca0f3d7..4354409f54 100644 --- a/src/ops/kernels/linear_kernels.cpp +++ b/src/ops/kernels/linear_kernels.cpp @@ -241,11 +241,12 @@ void forward_kernel(LinearMeta const *m, hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + hipblasDatatype_t compute_type = hipblas_data_type; #else - hipblasDatatype_t compute_type = input_type; + // TODO: currently use the hipblas_data_type + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + hipblasDatatype_t compute_type = hipblas_data_type; #endif checkCUDA(hipblasGemmEx(m->handle.blas, HIPBLAS_OP_T, @@ -337,11 +338,12 @@ void backward_kernel(LinearMeta const *m, hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + hipblasDatatype_t compute_type = hipblas_data_type; #else - hipblasDatatype_t compute_type = HIPBLAS_R_32F; + // TODO: currently use the hipblas_data_type + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + hipblasDatatype_t compute_type = hipblas_data_type; #endif int output_size = out_dim * batch_size; if (m->activation == AC_MODE_RELU) { diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 8a93357dcf..d8a9b5aa16 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -311,11 +311,11 @@ void forward_kernel(LinearMeta const *m, : ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); assert(input_type == weight_type && weight_type == output_type); -#if CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + cudaDataType_t compute_type = cublas_data_type; +#else // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else - cudaDataType_t compute_type = input_type; #endif checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_T, @@ -401,11 +401,11 @@ void backward_kernel(LinearMeta const *m, cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + cudaDataType_t compute_type = cublas_data_type; +#else // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else - cudaDataType_t compute_type = CUDA_R_32F; #endif int output_size = out_dim * batch_size; 
if (m->activation == AC_MODE_RELU) { diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index 1d81ae0c11..b1687d12a2 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -200,10 +200,11 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + hipblasDatatype_t compute_type = hipblas_data_type; #else + // TODO: currently use the hipblas_data_type + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; hipblasDatatype_t compute_type = hipblas_data_type; #endif // int num_requests = bc->num_active_requests(); diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index ac74eb1c8f..681c7a0f72 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -215,11 +215,11 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + cudaDataType_t compute_type = cublas_data_type; +#else // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else - cudaDataType_t compute_type = cublas_data_type; #endif // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_tokens(); diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 1d9ebf67e0..26291fb3b4 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -157,10 +157,11 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + hipblasDatatype_t compute_type = hipblas_data_type; #else + // TODO: currently use the hipblas_data_type + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; hipblasDatatype_t compute_type = hipblas_data_type; #endif // int num_requests = bc->num_active_requests(); diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index edf7a2d075..758a93bbf7 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -158,11 +158,11 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = 
ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + cudaDataType_t compute_type = cublas_data_type; +#else // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else - cudaDataType_t compute_type = cublas_data_type; #endif // int num_requests = bc->num_active_requests(); int processed_tokens_in_batch = 0; From 464424ee2c5cf3f4f27dd5e368cbf7b6351a57d1 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Mon, 23 Oct 2023 21:49:37 -0400 Subject: [PATCH 042/198] fix bugs in IncMHA peft_bwd kernel --- .../ops/inc_multihead_self_attention.h | 5 +- include/flexflow/request_manager.h | 9 +- inference/incr_decoding/incr_decoding.cc | 10 ++- src/ops/fused.cu | 4 +- src/ops/inc_multihead_self_attention.cu | 89 ++++++++++++++----- src/ops/kernels/linear_kernels.cpp | 9 +- src/ops/kernels/linear_kernels.cu | 6 +- src/ops/kernels/lora_linear_kernels.cu | 14 ++- src/runtime/request_manager.cc | 41 +++++---- 9 files changed, 121 insertions(+), 66 deletions(-) diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index 4fe79a1d87..8da8412c69 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -121,15 +121,14 @@ class IncMultiHeadSelfAttention : public Op { bool measure_operator_cost(Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const override; - - static void inference_kernel_wrapper(IncMultiHeadSelfAttentionMeta const *m, + static void inference_kernel_wrapper(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &output, GenericTensorAccessorR const &bias); - static void peft_bwd_kernel_wrapper(IncMultiHeadSelfAttentionMeta const *m, + static void peft_bwd_kernel_wrapper(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, GenericTensorAccessorW const &input_grad, diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index f93fc4c080..a955eb0b9f 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -115,10 +115,11 @@ class RequestManager { FFModel *get_model(int model_id); - GenerationResult generate_incr_decoding(FFModel *model, - std::vector const &prompts, - int max_seq_length, - PEFTModelID peft_model_id); + GenerationResult + generate_incr_decoding(FFModel *model, + std::vector const &prompts, + int max_seq_length, + PEFTModelID peft_model_id); GenerationResult generate_spec_infer(FFModel *model, std::vector const &prompts, int max_seq_length, diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 9f3a0a4a5f..b74292ad9d 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -280,12 +280,14 @@ void FlexFlow::top_level_task(Task const *task, prompts.push_back(text); dataset.push_back(std::make_pair(text, text)); } - rm->register_new_peft_request(dataset, 256 /*max_sequence_length*/, peft_model_id); + rm->register_new_peft_request( + dataset, 256 /*max_sequence_length*/, peft_model_id); for (auto &prompt : prompts) { - GenerationResult result = model.generate(prompt, 128 /*max_sequence_length*/); + GenerationResult result = + 
model.generate(prompt, 128 /*max_sequence_length*/); } - //GenerationResult result = - // model.generate(prompts, 128 /*max_sequence_length*/); + // GenerationResult result = + // model.generate(prompts, 128 /*max_sequence_length*/); } // Execution fence diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 383e171662..51bfb6a390 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -447,7 +447,7 @@ __host__ void case OP_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); - IncMultiHeadSelfAttentionMeta const *m = + IncMultiHeadSelfAttentionMeta *m = (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; assert(fused->op_num_weights[op] == (1 + (int)(*m->qkv_bias || *m->final_bias))); @@ -1016,7 +1016,7 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, case OP_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); - IncMultiHeadSelfAttentionMeta const *m = + IncMultiHeadSelfAttentionMeta *m = (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; assert(fused->op_num_weights[op] == (1 + (int)(*m->qkv_bias || *m->final_bias))); diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 3fa41cfe6d..1a30799e1d 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -406,7 +406,7 @@ void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, } template -void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, +void inference_kernel(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, DT const *input_ptr, @@ -461,11 +461,11 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + cudaDataType_t compute_type = cublas_data_type; +#else // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else - cudaDataType_t compute_type = cublas_data_type; #endif for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { @@ -492,7 +492,7 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, int n_ = num_tokens; int k_ = m->oProjSize; int lda = k_; - int ldb = n_; + int ldb = k_; int ldc = m_; float alpha = 1.0f, beta = 0.0f; // matrix A: output projection weight @@ -634,18 +634,18 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, c_param, h_param, w_param)); - checkCUDNN(cudnnSoftmaxBackward(m->handle.dnn, - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &alpha, - m->qk_tensor, - m->softmax_activation_buffer, - m->qk_tensor, - m->qk_prods_softmax, - &beta, - m->qk_tensor, - m->qk_prods)); - // TODO: fill all elements above diagonal to force causal attention + // checkCUDNN(cudnnSoftmaxBackward(m->handle.dnn, + // CUDNN_SOFTMAX_ACCURATE, + // CUDNN_SOFTMAX_MODE_CHANNEL, + // &alpha, + // m->qk_tensor, + // m->softmax_activation_buffer, + // m->qk_tensor, + // m->qk_prods_softmax, + // &beta, + // m->qk_tensor, + // m->qk_prods)); + // TODO: fill all elements above diagonal to force causal attention } // Step 5: compute gradients w.r.t. 
key { @@ -825,6 +825,24 @@ __global__ void store_kv_cache(DT const *devQKVProjArray, } } +template +__global__ void store_query_cache(DT const *devQKVProjArray, + DT *qCache_ptr, + int num_tokens, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / hidden_size; + int offset = i % hidden_size; + + size_t val_idx = token_idx * QKV_WEIGHT_NUM * hidden_size + offset; + + DT qVal = devQKVProjArray[val_idx]; + + // query cache + qCache_ptr[i] = qVal; + } +} + template __global__ void fill_entries_above_diagonal(DT *matrix, size_t num_rows, @@ -843,7 +861,7 @@ __global__ void fill_entries_above_diagonal(DT *matrix, } template -void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, +void compute_attention_kernel(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, DT *output_ptr, @@ -882,6 +900,23 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; + // Copy query to m->query_activation_buffer if we need to compute + // PEFT backward + if (bc->requestsInfo[i].peft_bwd) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->query_activation_buffer = allocator->allocate_instance_untyped( + sizeof(DT) * total_tokens * m->num_q_heads * m->qProjSize); + int parallelism = m->hidden_size * num_tokens; + store_query_cache<<>>( + static_cast
<DT *>(m->devQKVProjArray), + static_cast<DT *>
(m->query_activation_buffer), + num_tokens, + m->hidden_size); + } + // bc->token_last_available_idx[i] + 1; // Compute (QK^T/sqrt(d_k)) // a flag of using this scaling alpha @@ -995,6 +1030,20 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, &softmax_beta, m->qk_tensor, C_softmax)); + // Copy C_softmax to m->softmax_activation_buffer if we need to compute + // PEFT backward + if (bc->requestsInfo[i].peft_bwd) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->softmax_activation_buffer = allocator->allocate_instance_untyped( + sizeof(DT) * total_tokens * num_new_tokens * m->num_q_heads); + checkCUDA(cudaMemcpyAsync(m->softmax_activation_buffer, + C_softmax, + sizeof(DT) * total_tokens * num_new_tokens * + m->num_q_heads, + cudaMemcpyDeviceToDevice, + stream)); + } + // Matmul softmax(QK^T/sqrt(d_k)) by V alpha = 1.0f, beta = 0.0f; m_ = m->vProjSize; @@ -1090,7 +1139,7 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, /*static*/ void IncMultiHeadSelfAttention::inference_kernel_wrapper( - IncMultiHeadSelfAttentionMeta const *m, + IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, @@ -1193,7 +1242,7 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( /*static*/ void IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( - IncMultiHeadSelfAttentionMeta const *m, + IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, GenericTensorAccessorW const &input_grad, diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp index 504380736f..2e8761472f 100644 --- a/src/ops/kernels/linear_kernels.cpp +++ b/src/ops/kernels/linear_kernels.cpp @@ -370,11 +370,12 @@ void peft_bwd_kernel(LinearMeta const *m, hipDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); // update input_grad_ptr offset input_grad_ptr = static_cast
(input_grad_ptr) + num_infr_tokens; -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + hipblasDatatype_t compute_type = hipblas_data_type; #else - hipblasDatatype_t compute_type = HIPBLAS_R_32F; + // TODO: currently use the hipblas_data_type + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + hipblasDatatype_t compute_type = output_type; #endif int output_size = out_dim * num_peft_tokens; if (m->activation == AC_MODE_RELU) { diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 525fdf4d11..4627179fc4 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -455,11 +455,11 @@ void peft_bwd_kernel(LinearMeta const *m, input_grad_ptr = static_cast
<DT *>(input_grad_ptr) + num_infr_tokens * in_dim; output_grad_ptr = static_cast<DT *>
(output_grad_ptr) + num_infr_tokens * out_dim; -#if CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + cudaDataType_t compute_type = output_type; +#else // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else - cudaDataType_t compute_type = CUDA_R_32F; #endif int output_size = out_dim * num_peft_tokens; if (m->activation == AC_MODE_RELU) { diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index fd64c4710b..282d0efc7e 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -145,12 +145,11 @@ void inference_kernel(LoraLinearMeta *m, cudaDataType_t lr_actv_type = output_type; assert(input_type == output_type); cudaDataType_t weight_type = output_type; - -#if CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + cudaDataType_t compute_type = output_type; +#else // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else - cudaDataType_t compute_type = output_type; #endif int num_peft_requests = 0; for (int i = 0; i < bc->max_requests_per_batch(); i++) { @@ -263,13 +262,12 @@ void peft_bwd_kernel(LoraLinearMeta *m, assert(input_type == output_type); cudaDataType_t weight_type = output_type; cudaDataType_t lr_actv_type = output_type; -#if CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + cudaDataType_t compute_type = cublas_data_type; +#else // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else - cudaDataType_t compute_type = output_type; #endif - for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 5631ea6523..4128fee220 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -357,10 +357,10 @@ RequestManager::RequestGuid RequestManager::register_new_peft_request( GenerationResult gr; gr.guid = request.guid; - //gr.input_text = prompt; - //gr.input_tokens = request.tokens; - //gr.output_text = prompt; - //gr.output_tokens = request.tokens; + // gr.input_text = prompt; + // gr.input_tokens = request.tokens; + // gr.output_text = prompt; + // gr.output_tokens = request.tokens; request_generation_results[request.guid] = gr; return request.guid; } @@ -579,32 +579,37 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, if (pending_peft_request_queue.size() > 0) { Request &request = pending_peft_request_queue.front(); assert(request.dataset.size() > 0); - int num_peft_tokens = request.dataset[0].first.size() - + request.dataset[0].second.size(); - if (num_peft_tokens + new_bc.num_active_tokens() <= get_max_tokens_per_batch()) { + int num_peft_tokens = + request.dataset[0].first.size() + request.dataset[0].second.size(); + if (num_peft_tokens + new_bc.num_active_tokens() <= + get_max_tokens_per_batch()) { // The last request slot is reserved for PEFT request int peft_req_idx = get_max_requests_per_batch() - 1; assert(new_bc.request_completed[peft_req_idx]); new_bc.request_completed[peft_req_idx] = false; new_bc.requestsInfo[peft_req_idx].first_token_depth_in_request = 0; - new_bc.requestsInfo[peft_req_idx].first_token_offset_in_batch = new_bc.num_tokens; + new_bc.requestsInfo[peft_req_idx].first_token_offset_in_batch = 
+ new_bc.num_tokens; new_bc.requestsInfo[peft_req_idx].num_tokens_in_batch = num_peft_tokens; - new_bc.requestsInfo[peft_req_idx].max_sequence_length = request.max_sequence_length; + new_bc.requestsInfo[peft_req_idx].max_sequence_length = + request.max_sequence_length; new_bc.requestsInfo[peft_req_idx].request_guid = request.guid; new_bc.requestsInfo[peft_req_idx].peft_model_id = request.peft_model_id; new_bc.requestsInfo[peft_req_idx].peft_bwd = true; for (size_t i = 0; i < request.dataset[0].first.size(); i++) { - new_bc.tokensInfo[new_bc.num_tokens].token_id = request.dataset[0].first[i]; + new_bc.tokensInfo[new_bc.num_tokens].token_id = + request.dataset[0].first[i]; new_bc.tokensInfo[new_bc.num_tokens].request_index = num_peft_tokens; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = i; - new_bc.num_tokens ++; + new_bc.num_tokens++; } for (size_t i = 0; i < request.dataset[0].second.size(); i++) { - new_bc.tokensInfo[new_bc.num_tokens].token_id = request.dataset[0].second[i]; + new_bc.tokensInfo[new_bc.num_tokens].token_id = + request.dataset[0].second[i]; new_bc.tokensInfo[new_bc.num_tokens].request_index = num_peft_tokens; int depth = request.dataset[0].first.size() + i; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; - new_bc.num_tokens ++; + new_bc.num_tokens++; } } } @@ -2041,11 +2046,11 @@ PEFTModelID FFModel::register_peft_model(LoraLinearConfig const mlp_first, } /*static*/ -GenerationResult - RequestManager::generate_incr_decoding(FFModel *llm, - std::vector const &prompts, - int max_seq_length, - PEFTModelID peft_model_id) { +GenerationResult RequestManager::generate_incr_decoding( + FFModel *llm, + std::vector const &prompts, + int max_seq_length, + PEFTModelID peft_model_id) { InferenceManager *im = InferenceManager::get_inference_manager(); RequestGuid guid; for (int i = 0; i < prompts.size(); i++) { From 45c1e0105a77299a54ed9cb812040869ca424a55 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Mon, 23 Oct 2023 21:58:03 -0400 Subject: [PATCH 043/198] uncomment softmaxbackward --- src/ops/inc_multihead_self_attention.cu | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 1a30799e1d..b83d23804c 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -634,17 +634,17 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, c_param, h_param, w_param)); - // checkCUDNN(cudnnSoftmaxBackward(m->handle.dnn, - // CUDNN_SOFTMAX_ACCURATE, - // CUDNN_SOFTMAX_MODE_CHANNEL, - // &alpha, - // m->qk_tensor, - // m->softmax_activation_buffer, - // m->qk_tensor, - // m->qk_prods_softmax, - // &beta, - // m->qk_tensor, - // m->qk_prods)); + checkCUDNN(cudnnSoftmaxBackward(m->handle.dnn, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + m->qk_tensor, + m->softmax_activation_buffer, + m->qk_tensor, + m->qk_prods_softmax, + &beta, + m->qk_tensor, + m->qk_prods)); // TODO: fill all elements above diagonal to force causal attention } // Step 5: compute gradients w.r.t. 
key From 07636e8f89ab470c2f9216be17d4ccfc444da9dc Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 24 Oct 2023 15:53:29 -0400 Subject: [PATCH 044/198] add layernorm to align test --- tests/align/test_all_operators.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/align/test_all_operators.sh b/tests/align/test_all_operators.sh index 3fb361f25c..73b0cb30dc 100755 --- a/tests/align/test_all_operators.sh +++ b/tests/align/test_all_operators.sh @@ -11,7 +11,7 @@ function generate_torch_tensor(){ python tests/align/align_create_tensor_torch.py -o "$1" } -ops=(add concat conv2d cos embedding exp flat getitem identity multiply pool2d reducesum relu reshape scalar_add scalar_multiply scalar_sub scalar_truediv sigmoid sin subtract tanh transpose view_embedding max min linear gather) +ops=(add concat conv2d cos embedding exp flat getitem identity multiply pool2d reducesum relu reshape scalar_add scalar_multiply scalar_sub scalar_truediv sigmoid sin subtract tanh transpose view_embedding max min linear layernorm gather) #create flexflow tensors conda activate flexflow From 28a5e84a68b2355478530e40484741ef6dbaab3e Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 24 Oct 2023 17:25:41 -0400 Subject: [PATCH 045/198] add peft test scripts --- tests/peft/hf_finetune.py | 120 ++++++++++++++++++++++++++++++++++++++ tests/peft/hf_serve.py | 51 ++++++++++++++++ 2 files changed, 171 insertions(+) create mode 100644 tests/peft/hf_finetune.py create mode 100644 tests/peft/hf_serve.py diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py new file mode 100644 index 0000000000..981e4b0a1f --- /dev/null +++ b/tests/peft/hf_finetune.py @@ -0,0 +1,120 @@ +import os, sys +#os.environ["CUDA_VISIBLE_DEVICES"]="0" +import torch +import torch.nn as nn +#import bitsandbytes as bnb +from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, LlamaTokenizer +import argparse +from peft import LoraConfig, get_peft_model +import transformers +from datasets import load_dataset + +class CastOutputToFloat(nn.Sequential): + def forward(self, x): + return super().forward(x).to(torch.float32) + +def print_trainable_parameters(model): + """ + Prints the number of trainable parameters in the model. 
+ """ + trainable_params = 0 + all_param = 0 + for _, param in model.named_parameters(): + all_param += param.numel() + if param.requires_grad: + trainable_params += param.numel() + print( + f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" + ) + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--model-name", type=str, default="decapoda-research/llama-7b-hf") + parser.add_argument("--lora-rank", type=int, default=16) + parser.add_argument("--lora-alpha", type=int, default=32) + parser.add_argument("--lora-dropout", type=float, default=0.05) + parser.add_argument("--use-full-precision", action="store_true", help="Use full precision") + parser.add_argument("--output-dir", type=str, default="./finetuned-llama") + args = parser.parse_args() + model_name = args.model_name + use_full_precision=args.use_full_precision + lora_rank = args.lora_rank + lora_alpha = args.lora_alpha + lora_dropout = args.lora_dropout + output_dir = args.output_dir + + # Change working dir to folder storing this script + abspath = os.path.abspath(__file__) + dname = os.path.dirname(abspath) + os.chdir(dname) + + model = AutoModelForCausalLM.from_pretrained( + model_name, + #load_in_8bit=True, + torch_dtype = torch.float32 if use_full_precision else torch.float16, + device_map='auto', + ) + + # Get Tokenizer + hf_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) + hf_arch = getattr(hf_config, "architectures")[0] + if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM": + tokenizer = LlamaTokenizer.from_pretrained(model_name, use_fast=True, torch_dtype = torch.float32 if use_full_precision else torch.float16,) + else: + tokenizer = AutoTokenizer.from_pretrained(model_name, torch_dtype = torch.float32 if use_full_precision else torch.float16,) + for param in model.parameters(): + param.requires_grad = False # freeze the model - train adapters later + if param.ndim == 1: + # cast the small parameters (e.g. layernorm) to fp32 for stability + param.data = param.data.to(torch.float32) + + model.gradient_checkpointing_enable() # reduce number of stored activations + model.enable_input_require_grads() + + model.lm_head = CastOutputToFloat(model.lm_head) + + config = LoraConfig( + r=lora_rank, + lora_alpha=lora_alpha, + #target_modules=["q_proj", "v_proj"], + target_modules=["down_proj"], + lora_dropout=lora_dropout, + bias="none", + task_type="CAUSAL_LM" + ) + print(model) + print(model.named_parameters()) + model = get_peft_model(model, config) + print_trainable_parameters(model) + + data = load_dataset("Abirate/english_quotes") + data = data.map(lambda samples: tokenizer(samples['quote']), batched=True) + + trainer = transformers.Trainer( + model=model, + train_dataset=data['train'], + args=transformers.TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=100, + max_steps=200, + learning_rate=2e-4, + fp16=True if not use_full_precision else False, + logging_steps=1, + output_dir=os.path.join(output_dir, "logs"), + ), + data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False) + ) + model.config.use_cache = False # silence the warnings. Please re-enable for inference! + trainer.train() + + print(f"Done fine-tuning! 
Saving the model to {output_dir}...") + model.save_pretrained(output_dir) + + # Upload to HF hub + #from huggingface_hub import notebook_login + #notebook_login() + #model.push_to_hub("goliaro/llama-7b-lora-half", use_auth_token=True) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tests/peft/hf_serve.py b/tests/peft/hf_serve.py new file mode 100644 index 0000000000..677ccc6eeb --- /dev/null +++ b/tests/peft/hf_serve.py @@ -0,0 +1,51 @@ +import argparse +import torch +from peft import PeftModel, PeftConfig +from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, LlamaTokenizer + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--peft-model-id", type=str, default="./finetuned-llama") + parser.add_argument("--use-full-precision", action="store_true", help="Use full precision") + parser.add_argument("--max-new-tokens", type=int, default=50) + args = parser.parse_args() + peft_model_id = args.peft_model_id + #peft_model_id = "goliaro/llama-7b-lora-half" + use_full_precision=args.use_full_precision + max_new_tokens = args.max_new_tokens + + # Change working dir to folder storing this script + abspath = os.path.abspath(__file__) + dname = os.path.dirname(abspath) + os.chdir(dname) + + config = PeftConfig.from_pretrained(peft_model_id) + model = AutoModelForCausalLM.from_pretrained( + config.base_model_name_or_path, + return_dict=True, + #load_in_8bit=True, + torch_dtype = torch.float32 if use_full_precision else torch.float16, + device_map='auto', + ) + hf_config = AutoConfig.from_pretrained(config.base_model_name_or_path, trust_remote_code=True) + hf_arch = getattr(hf_config, "architectures")[0] + if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM": + tokenizer = LlamaTokenizer.from_pretrained( + config.base_model_name_or_path, use_fast=True, + torch_dtype = torch.float32 if use_full_precision else torch.float16, + ) + else: + tokenizer = AutoTokenizer.from_pretrained( + config.base_model_name_or_path, + torch_dtype = torch.float32 if use_full_precision else torch.float16, + ) + + # Load the Lora model + model = PeftModel.from_pretrained(model, peft_model_id) + batch = tokenizer("Two things are infinite: ", return_tensors='pt') + with torch.cuda.amp.autocast(): + output_tokens = model.generate(**batch, max_new_tokens=max_new_tokens) + print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True)) + +if __name__ == "__main__": + main() From dd9437063fcbcd65103429f8665cb03ccc00e83a Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 24 Oct 2023 17:25:57 -0400 Subject: [PATCH 046/198] fix import --- tests/peft/hf_serve.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/peft/hf_serve.py b/tests/peft/hf_serve.py index 677ccc6eeb..6f3753906f 100644 --- a/tests/peft/hf_serve.py +++ b/tests/peft/hf_serve.py @@ -1,5 +1,6 @@ import argparse import torch +import os, sys from peft import PeftModel, PeftConfig from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, LlamaTokenizer From 3c013281adf4f98881a7d2d351b3e261c1599538 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 24 Oct 2023 22:06:18 +0000 Subject: [PATCH 047/198] fix --- tests/peft/hf_finetune.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py index 981e4b0a1f..14aad1b9cc 100644 --- a/tests/peft/hf_finetune.py +++ b/tests/peft/hf_finetune.py @@ -62,6 +62,10 @@ def main(): tokenizer = LlamaTokenizer.from_pretrained(model_name, use_fast=True, 
torch_dtype = torch.float32 if use_full_precision else torch.float16,) else: tokenizer = AutoTokenizer.from_pretrained(model_name, torch_dtype = torch.float32 if use_full_precision else torch.float16,) + if tokenizer.pad_token is None: + tokenizer.pad_token = "[PAD]" + tokenizer.padding_side = "left" + for param in model.parameters(): param.requires_grad = False # freeze the model - train adapters later if param.ndim == 1: From fa56364a04c27bd86f83a1676e7f097bb27b4a79 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 26 Oct 2023 15:47:14 +0000 Subject: [PATCH 048/198] add code to convert peft models --- conda/flexflow.yml | 6 + python/flexflow/serve/models/base.py | 9 +- python/flexflow/serve/models/falcon.py | 30 +-- python/flexflow/serve/models/llama.py | 51 +++--- python/flexflow/serve/models/mpt.py | 27 +-- python/flexflow/serve/models/opt.py | 45 ++--- python/flexflow/serve/models/starcoder.py | 14 +- python/flexflow/serve/serve.py | 213 ++++++++++++++++++---- requirements.txt | 7 + 9 files changed, 292 insertions(+), 110 deletions(-) diff --git a/conda/flexflow.yml b/conda/flexflow.yml index c9226269f2..3e39407bfa 100644 --- a/conda/flexflow.yml +++ b/conda/flexflow.yml @@ -25,3 +25,9 @@ dependencies: - sentencepiece - einops - requests + - scipy + - bitsandbytes + - datasets + - accelerate + - loralib + - peft diff --git a/python/flexflow/serve/models/base.py b/python/flexflow/serve/models/base.py index 025008ec78..17bb894250 100644 --- a/python/flexflow/serve/models/base.py +++ b/python/flexflow/serve/models/base.py @@ -21,9 +21,9 @@ def __init__( ffconfig, hf_config, data_type, - #max_batch_size=1, - #max_seq_length=256, - #max_tokens_per_batch=64, + # max_batch_size=1, + # max_seq_length=256, + # max_tokens_per_batch=64, weights_filepath="", tokenizer_filepath="", ): @@ -32,5 +32,8 @@ def __init__( def build_model(self): assert False, "Not implemented yet" + def convert_hf_weight_name(name): + assert False, "Not implemented yet" + def convert_hf_model(model, dst_folder): assert False, "Not implemented yet" diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index 2b114f09b3..eafce814e1 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -19,8 +19,8 @@ class FalconConfig: def __init__(self, hf_config): - #self.max_seq_len = 256 - #self.max_num_tokens = 64 + # self.max_seq_len = 256 + # self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 self.bias = hf_config.bias @@ -53,8 +53,8 @@ def __init__( ffconfig, hf_config, data_type, - #max_batch_size=1, - #max_seq_length=256, + # max_batch_size=1, + # max_seq_length=256, max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", @@ -62,11 +62,11 @@ def __init__( self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - #self.max_batch_size = max_batch_size + # self.max_batch_size = max_batch_size self.data_type = data_type self.falcon_config = FalconConfig(hf_config) - #self.falcon_config.max_seq_length = max_seq_length - #self.falcon_config.max_num_tokens = max_tokens_per_batch + # self.falcon_config.max_seq_length = max_seq_length + # self.falcon_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 @@ -235,6 +235,15 @@ def build_model(self, max_tokens_per_batch): self.ffmodel = ffmodel + # TODO: finish this + def convert_hf_weight_name(name): + return ( + name.replace(".", "_") + 
.replace("transformer_h_", "layers_") + .replace("transformer_", "") + .replace("self_attention_dense", "attention_wo") + ) + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) n_head = ( @@ -243,12 +252,7 @@ def convert_hf_model(model, dst_folder): else model.config.num_attention_heads ) for name, params in model.named_parameters(): - name = ( - name.replace(".", "_") - .replace("transformer_h_", "layers_") - .replace("transformer_", "") - .replace("self_attention_dense", "attention_wo") - ) + name = FlexFlowFalcon.convert_hf_weight_name(name) # Split Q,K,V attention weights if "self_attention_query_key_value" in name: name_q = name.replace("self_attention_query_key_value", "attention_wq") diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index 7ba0e78a37..ba5f1df7a2 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -19,8 +19,8 @@ class LLAMAConfig: def __init__(self, hf_config): - #self.max_seq_len = 256 - #self.max_num_tokens = 64 + # self.max_seq_len = 256 + # self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 self.num_hidden_layers = hf_config.num_hidden_layers @@ -45,8 +45,8 @@ def __init__( ffconfig, hf_config, data_type, - #max_batch_size=1, - #max_seq_length=256, + # max_batch_size=1, + # max_seq_length=256, max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", @@ -54,11 +54,11 @@ def __init__( self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - #self.max_batch_size = max_batch_size + # self.max_batch_size = max_batch_size self.data_type = data_type self.llama_config = LLAMAConfig(hf_config) - #self.llama_config.max_seq_length = max_seq_length - #self.llama_config.max_num_tokens = max_tokens_per_batch + # self.llama_config.max_seq_length = max_seq_length + # self.llama_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 @@ -242,24 +242,27 @@ def build_model(self, max_tokens_per_batch): self.ffmodel = ffmodel + def convert_hf_weight_name(name): + return ( + name.replace(".", "_") + .replace("self_attn", "attention") + .replace("q_proj", "wq") + .replace("k_proj", "wk") + .replace("v_proj", "wv") + .replace("o_proj", "wo") + .replace("mlp", "feed_forward") + .replace("gate_proj", "w1") + .replace("down_proj", "w2") + .replace("up_proj", "w3") + .replace("input_layernorm", "attention_norm") + .replace("post_attention_layernorm", "ffn_norm") + .replace("embed_tokens", "tok_embeddings") + .replace("lm_head", "output") + .replace("model_", "") + ) + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): - name = ( - name.replace(".", "_") - .replace("self_attn", "attention") - .replace("q_proj", "wq") - .replace("k_proj", "wk") - .replace("v_proj", "wv") - .replace("o_proj", "wo") - .replace("mlp", "feed_forward") - .replace("gate_proj", "w1") - .replace("down_proj", "w2") - .replace("up_proj", "w3") - .replace("input_layernorm", "attention_norm") - .replace("post_attention_layernorm", "ffn_norm") - .replace("embed_tokens", "tok_embeddings") - .replace("lm_head", "output") - .replace("model_", "") - ) + name = FlexFlowLLAMA.convert_hf_weight_name(name) params.detach().cpu().numpy().tofile(f"{dst_folder}/{name}") diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py index 79a5bb940f..91d87669ca 100644 
--- a/python/flexflow/serve/models/mpt.py +++ b/python/flexflow/serve/models/mpt.py @@ -19,8 +19,8 @@ class MPTConfig: def __init__(self, hf_config): - #self.max_seq_len = 256 - #self.max_num_tokens = 64 + # self.max_seq_len = 256 + # self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 self.hidden_size = hf_config.d_model @@ -40,8 +40,8 @@ def __init__( ffconfig, hf_config, data_type, - #max_batch_size=1, - #max_seq_length=256, + # max_batch_size=1, + # max_seq_length=256, max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", @@ -49,11 +49,11 @@ def __init__( self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - #self.max_batch_size = max_batch_size + # self.max_batch_size = max_batch_size self.data_type = data_type self.mpt_config = MPTConfig(hf_config) - #self.mpt_config.max_seq_length = max_seq_length - #self.mpt_config.max_num_tokens = max_tokens_per_batch + # self.mpt_config.max_seq_length = max_seq_length + # self.mpt_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 @@ -245,10 +245,18 @@ def build_model(self, max_tokens_per_batch): self.ffmodel = ffmodel + # TODO: finish this + def convert_hf_weight_name(name): + return ( + name.replace("transformer.blocks.", "layers.") + .replace(".", "_") + .replace("attn_out_proj", "attention_wo") + ) + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): - name = name.replace("transformer.blocks.", "layers.").replace(".", "_") + name = FlexFlowMPT.convert_hf_weight_name(name) if "Wqkv" in name: name_q = name.replace("attn_Wqkv", "attention_wq") name_k = name.replace("attn_Wqkv", "attention_wk") @@ -265,9 +273,6 @@ def convert_hf_model(model, dst_folder): q.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_q)) k.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_k)) v.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_v)) - elif "out_proj" in name: - name = name.replace("attn_out_proj", "attention_wo") - params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) else: params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index dfd1cde7d4..8250c63a9a 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -19,8 +19,8 @@ class OPTConfig: def __init__(self, hf_config): - #self.max_seq_len = 256 - #self.max_num_tokens = 64 + # self.max_seq_len = 256 + # self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 self.do_layer_norm_before = hf_config.do_layer_norm_before @@ -46,8 +46,8 @@ def __init__( ffconfig, hf_config, data_type, - #max_batch_size=1, - #max_seq_length=256, + # max_batch_size=1, + # max_seq_length=256, max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", @@ -55,11 +55,11 @@ def __init__( self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - #self.max_batch_size = max_batch_size + # self.max_batch_size = max_batch_size self.data_type = data_type self.opt_config = OPTConfig(hf_config) - #self.opt_config.max_seq_length = max_seq_length - #self.opt_config.max_num_tokens = max_tokens_per_batch + # self.opt_config.max_seq_length = max_seq_length + # self.opt_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath 
= tokenizer_filepath self.maxint = 2**31 - 1 @@ -276,23 +276,26 @@ def build_model(self, max_tokens_per_batch): self.ffmodel = ffmodel + def convert_hf_weight_name(name): + return ( + name.replace(".", "_") + .replace("decoder_", "") + .replace("model_", "") + .replace("self_attn", "attention") + .replace("q_proj", "wq") + .replace("k_proj", "wk") + .replace("v_proj", "wv") + .replace("out_proj", "wo") + .replace("attention_wo_bias", "add_bias_residual_layer_norm_attn_bias") + .replace( + "_final_layer_norm", "_add_bias_residual_layer_norm" + ) # important to use the leading "_" to avoid matching the last LayerNorm + ) + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): - name = ( - name.replace(".", "_") - .replace("decoder_", "") - .replace("model_", "") - .replace("self_attn", "attention") - .replace("q_proj", "wq") - .replace("k_proj", "wk") - .replace("v_proj", "wv") - .replace("out_proj", "wo") - .replace("attention_wo_bias", "add_bias_residual_layer_norm_attn_bias") - .replace( - "_final_layer_norm", "_add_bias_residual_layer_norm" - ) # important to use the leading "_" to avoid matching the last LayerNorm - ) + name = FlexFlowOPT.convert_hf_weight_name(name) params.detach().cpu().numpy().tofile(f"{dst_folder}/{name}") # copy embedding weights shutil.copy( diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py index f4f28a70e1..0f577299ed 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -19,8 +19,8 @@ class STARCODERConfig: def __init__(self, hf_config): - #self.max_seq_len = 256 - #self.max_num_tokens = 64 + # self.max_seq_len = 256 + # self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 self.dropout_p = hf_config.attn_pdrop @@ -44,8 +44,8 @@ def __init__( ffconfig, hf_config, data_type, - #max_batch_size=1, - #max_seq_length=256, + # max_batch_size=1, + # max_seq_length=256, max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", @@ -53,11 +53,11 @@ def __init__( self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - #self.max_batch_size = max_batch_size + # self.max_batch_size = max_batch_size self.data_type = data_type self.starcoder_config = STARCODERConfig(hf_config) - #self.starcoder_config.max_seq_length = max_seq_length - #self.starcoder_config.max_num_tokens = max_tokens_per_batch + # self.starcoder_config.max_seq_length = max_seq_length + # self.starcoder_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 549677d77a..1c9ece27ef 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -28,8 +28,9 @@ ) from flexflow.core import * from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer +from peft import PeftModel, PeftConfig from huggingface_hub import HfApi -import sys, torch, shutil, hashlib +import sys, torch, shutil, hashlib, json from typing import Union, List @@ -68,6 +69,36 @@ def __init__(self, text: str = None, tokens: list = None): self.output_tokens = tokens +class _SupportedModels: + def __init__( + self, + ): + self.supported_models = { + "LlamaForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig), + "LLaMAForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig), + 
"OPTForCausalLM": (ModelType.OPT, FlexFlowOPT, OPTConfig), + "RWForCausalLM": (ModelType.FALCON, FlexFlowFalcon, FalconConfig), + "FalconForCausalLM": (ModelType.FALCON, FlexFlowFalcon, FalconConfig), + "GPTBigCodeForCausalLM": ( + ModelType.STARCODER, + FlexFlowSTARCODER, + STARCODERConfig, + ), + "MPTForCausalLM": (ModelType.MPT, FlexFlowMPT, MPTConfig), + } + + def get_ff_model_type(self, hf_config): + architectures = getattr(hf_config, "architectures", []) + ff_arch = None + if next(iter(architectures), None) is not None: + ff_arch = self.supported_models.get(architectures[0]) + if ff_arch is None: + raise ValueError( + f"Huggingface model of type {architectures} is not yet supported by FlexFlow" + ) + return ff_arch + + class LLM: """This class creates a LLM (Large-Language Model) object based on a model from HuggingFace""" @@ -92,44 +123,20 @@ def __init__( :param output_file: Path to the output file. If left blank, the output will not be written to file, defaults to "" :type output_file: str, optional """ - self.supported_models = { - "LlamaForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig), - "LLaMAForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig), - "OPTForCausalLM": (ModelType.OPT, FlexFlowOPT, OPTConfig), - "RWForCausalLM": (ModelType.FALCON, FlexFlowFalcon, FalconConfig), - "FalconForCausalLM": (ModelType.FALCON, FlexFlowFalcon, FalconConfig), - "GPTBigCodeForCausalLM": ( - ModelType.STARCODER, - FlexFlowSTARCODER, - STARCODERConfig, - ), - "MPTForCausalLM": (ModelType.MPT, FlexFlowMPT, MPTConfig), - } + self.supported_models = _SupportedModels() self.hf_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) self.model_name = self.hf_config._name_or_path ( self.model_type, self.model_class, self.config_class, - ) = self.__get_ff_model_type() + ) = self.supported_models.get_ff_model_type(self.hf_config) self.data_type = data_type assert self.data_type == DataType.DT_HALF or self.data_type == DataType.DT_FLOAT self.cache_path = cache_path if len(cache_path) > 0 else "~/.cache/flexflow" self.refresh_cache = refresh_cache self.output_file = output_file - def __get_ff_model_type(self): - architectures = getattr(self.hf_config, "architectures", []) - ff_arch = None - if next(iter(architectures), None) is not None: - ff_arch = self.supported_models.get(architectures[0]) - if ff_arch is None: - print( - f"Huggingface model of type {architectures} is not yet supported by FlexFlow" - ) - sys.exit(1) - return ff_arch - def download_hf_config(self): """Save the HuggingFace model configs to a json file. 
Useful mainly to run the C++ inference code.""" self.config_dir = os.path.join( @@ -334,9 +341,9 @@ def compile( :param ssms: The SSMs to use when operating in speculative inference mode, defaults to [] :type ssms: list, optional """ - #self.max_requests_per_batch = max_requests_per_batch - #self.max_seq_length = max_seq_length - #self.max_tokens_per_batch = max_tokens_per_batch + # self.max_requests_per_batch = max_requests_per_batch + # self.max_seq_length = max_seq_length + # self.max_tokens_per_batch = max_tokens_per_batch self.ssms = ssms self.generation_config = GenerationConfig() self.ffconfig = FFConfig() @@ -376,7 +383,7 @@ def compile( self.ffconfig, self.hf_config, self.data_type, - max_tokens_per_batch + max_tokens_per_batch, ) # Create inference manager @@ -500,3 +507,147 @@ def compile( model_specific_pipeline_parallelism_degree, ssms, ) + + +class PEFT: + """This class creates a PEFT (parameter-efficient transformer) object to be used in concert with a LLM or SSM""" + + def __init__( + self, + peft_model_id: str, + data_type: DataType = DataType.DT_HALF, + cache_path: str = "", + refresh_cache: bool = False, + ): + self.hf_config = PeftConfig.from_pretrained(peft_model_id) + self.peft_model_id = peft_model_id + self.peft_type: self.hf_config.peft_type + if self.peft_type != "LORA": + raise RuntimeError( + f"PEFT type {self.peft_type} not yet supported in FlexFlow" + ) + self.data_type = data_type + assert self.data_type == DataType.DT_HALF or self.data_type == DataType.DT_FLOAT + self.cache_path = cache_path if len(cache_path) > 0 else "~/.cache/flexflow" + self.refresh_cache = refresh_cache + # Base model related + self.supported_base_models = _SupportedModels() + if "base_model_name_or_path" not in self.hf_config.to_dict(): + raise ValueError( + f"PEFT model {peft_model_id} does not have an associated based model" + ) + self.base_model_hf_config = AutoConfig.from_pretrained( + self.hf_config.base_model_name_or_path, trust_remote_code=True + ) + ( + self.base_model_type, + self.base_model_class, + self.base_config_class, + ) = self.supported_base_models.get_ff_model_type(self.base_model_hf_config) + + def download_hf_config(self): + """Save the HuggingFace model configs to a json file. 
Useful mainly to run the C++ inference code.""" + self.config_dir = os.path.join( + os.path.expanduser(self.cache_path), "configs", self.peft_model_id.lower() + ) + self.config_path = os.path.join(self.config_dir, "config.json") + os.makedirs(self.config_dir, exist_ok=True) + print(f"Creating directory {self.config_dir} (if it doesn't exist)...") + print(f"Saving {self.peft_model_id} configs to file {self.config_path}...") + with open(self.config_path, "w") as json_file: + json.dump(self.hf_config.to_dict(), json_file, indentation=2) + + def __get_revision_hashes(self, peft_model_id: str): + ff_revision = None + ff_revision_file = os.path.join(self.weights_path, "rev_sha.txt") + if os.path.exists(ff_revision_file): + ff_revision = "".join(open(ff_revision_file).read().split()) + + if os.path.exists(peft_model_id) and os.path.isdir(peft_model_id): + # Local model + files = os.listdir(peft_model_id) + state = files + [ + os.path.getmtime(os.path.join(peft_model_id, f)) for f in files + ] + latest_revision = hashlib.md5(str(state).encode("utf-8")).hexdigest() + else: + # Remote HuggingFace model + hf_api = HfApi() + latest_revision = hf_api.model_info(self.peft_model_id).sha + return ff_revision, ff_revision_file, latest_revision + + def convert_peft_model(self, hf_peft_model, weights_path): + for name, params in hf_peft_model.named_parameters(): + name = name.replace("base_model.model.model.", "").replace(".default", "") + name = self.base_model_class.convert_hf_weight_name(name) + params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") + + def download_hf_weights_if_needed(self): + """Check in the folder specified by the cache_path whether the PEFT's model weights are available and up to date. + If not, or if the refresh_cache parameter is set to True, download new weights. + """ + if self.data_type == DataType.DT_HALF: + torch.set_default_tensor_type(torch.HalfTensor) + elif self.data_type == DataType.DT_FLOAT: + torch.set_default_tensor_type(torch.FloatTensor) + else: + assert False, "Data type not yet supported -- cannot download weights!" + + # Use local cache, or download new version + self.weights_path = os.path.join( + os.path.expanduser(self.cache_path), + "weights", + self.peft_model_id.lower(), + "full-precision" + if self.data_type == DataType.DT_FLOAT + else "half-precision", + ) + if self.refresh_cache: + print( + f"Refreshing weights in cache for model {self.peft_model_id} at path {self.weights_path} ..." + ) + if os.path.exists(self.weights_path): + shutil.rmtree(self.weights_path) + os.makedirs(self.weights_path, exist_ok=True) + print(f"Creating directory {self.weights_path} (if it doesn't exist)...") + + ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( + self.peft_model_id + ) + + # Download if needed + if ff_revision != latest_revision: + if not os.path.exists(self.peft_model_id) or os.path.isdir( + self.peft_model_id + ): + # Local model + print( + f"'{self.peft_model_id}' model weights not found in cache or outdated. Downloading from huggingface.co ..." + ) + else: + # Remote model + print( + f"'{self.peft_model_id}' local model weights were updated! Converting new weights now..." 
+ ) + # Download model from HuggingFace, or load it from the local folder + hf_model = AutoModelForCausalLM.from_pretrained( + self.hf_config.base_model_name_or_path, + return_dict=True, + trust_remote_code=True, + torch_dtype=torch.float32 if use_full_precision else torch.float16, + device_map="auto", + ) + hf_peft_model = PeftModel.from_pretrained(hf_model, self.peft_model_id) + # Print log message to notify user download of model has finished + if not os.path.exists(self.peft_model_id) or os.path.isdir( + self.peft_model_id + ): + print("Done downloading HF weights. Converting them now...") + # Convert the model to FlexFlow format + self.__convert_peft_model(hf_peft_model, self.weights_path) + # Save new revision hash to file + with open(ff_revision_file, "w+") as f: + f.write(latest_revision) + print("Done converting the weights...") + else: + print(f"Loading '{self.peft_model_id}' model weights from the cache...") diff --git a/requirements.txt b/requirements.txt index 1037661337..43df6a2975 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,3 +15,10 @@ onnx transformers>=4.31.0 sentencepiece einops +# peft-related +scipy +bitsandbytes +datasets +accelerate +loralib +peft From a4841008d0532c0d8f719339cef560134b1087a2 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 26 Oct 2023 16:15:47 +0000 Subject: [PATCH 049/198] add script to download peft for c++, fix bug --- inference/utils/download_peft_model.py | 59 ++++++++++++++++++++++++++ python/flexflow/serve/serve.py | 2 +- 2 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 inference/utils/download_peft_model.py diff --git a/inference/utils/download_peft_model.py b/inference/utils/download_peft_model.py new file mode 100644 index 0000000000..1204634388 --- /dev/null +++ b/inference/utils/download_peft_model.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python +import flexflow.serve as ff +import argparse + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "peft_model_ids", type=str, nargs="+", help="Name of the model(s) to download" + ) + parser.add_argument( + "--cache-folder", + type=str, + help="Folder to use to store the model(s) assets in FlexFlow format", + default="", + ) + parser.add_argument( + "--refresh-cache", + action="store_true", + help="Use this flag to force the refresh of the model(s) weights/tokenizer cache", + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--full-precision-only", + action="store_true", + help="Only download the full precision version of the weights", + ) + group.add_argument( + "--half-precision-only", + action="store_true", + help="Only download the half precision version of the weights", + ) + args = parser.parse_args() + return args + + +def main(args): + if args.full_precision_only: + data_types = ff.DataType.DT_FLOAT + elif args.half_precision_only: + data_types = ff.DataType.DT_HALF + else: + data_types = (ff.DataType.DT_FLOAT, ff.DataType.DT_HALF) + + for peft_model_id in args.peft_model_ids: + for data_type in data_types: + peft = ff.PEFT( + peft_model_id, + data_type=data_type, + cache_path=args.cache_folder, + refresh_cache=args.refresh_cache, + ) + peft.download_hf_weights_if_needed() + peft.download_hf_config() + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 1c9ece27ef..19f7f089b7 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -634,7 +634,7 @@ def 
download_hf_weights_if_needed(self): self.hf_config.base_model_name_or_path, return_dict=True, trust_remote_code=True, - torch_dtype=torch.float32 if use_full_precision else torch.float16, + torch_dtype=torch.float32 if self.data_type == DataType.DT_FLOAT else torch.float16, device_map="auto", ) hf_peft_model = PeftModel.from_pretrained(hf_model, self.peft_model_id) From c83c376ccfc6fb6a322836ba56bb6da006b4323b Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 26 Oct 2023 17:05:37 +0000 Subject: [PATCH 050/198] fix --- python/flexflow/serve/__init__.py | 2 +- python/flexflow/serve/serve.py | 71 ++++++++++++++++--------------- 2 files changed, 37 insertions(+), 36 deletions(-) diff --git a/python/flexflow/serve/__init__.py b/python/flexflow/serve/__init__.py index cf467280bd..274b431ad8 100644 --- a/python/flexflow/serve/__init__.py +++ b/python/flexflow/serve/__init__.py @@ -15,7 +15,7 @@ from typing import Optional from ..type import * from flexflow.core import * -from .serve import LLM, SSM, GenerationConfig, GenerationResult +from .serve import LLM, SSM, PEFT, GenerationConfig, GenerationResult def __check_positive_int(configs_dict: dict, key: str): diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 19f7f089b7..e0e1b2e155 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -30,7 +30,7 @@ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer from peft import PeftModel, PeftConfig from huggingface_hub import HfApi -import sys, torch, shutil, hashlib, json +import torch, shutil, hashlib, json, gc from typing import Union, List @@ -175,13 +175,6 @@ def download_hf_weights_if_needed(self): """Check in the folder specified by the cache_path whether the LLM's model weights are available and up to date. If not, or if the refresh_cache parameter is set to True, download new weights. """ - if self.data_type == DataType.DT_HALF: - torch.set_default_tensor_type(torch.HalfTensor) - elif self.data_type == DataType.DT_FLOAT: - torch.set_default_tensor_type(torch.FloatTensor) - else: - assert False, "Data type not yet supported -- cannot download weights!" 
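The change above drops the process-global torch.set_default_tensor_type call in favor of passing an explicit torch_dtype when the HuggingFace model is instantiated. The sketch below shows that selection in isolation, with the FlexFlow DataType check collapsed to a boolean flag and an illustrative checkpoint name; neither is taken from the patch.

# Minimal sketch of the per-call dtype selection (illustrative only;
# "JackFram/llama-160m" is just an example checkpoint name).
import torch
from transformers import AutoModelForCausalLM

def load_hf_model(model_name: str, use_full_precision: bool):
    # Choose the dtype for this model only, instead of setting a
    # process-wide default tensor type.
    torch_dtype = torch.float32 if use_full_precision else torch.float16
    return AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
        torch_dtype=torch_dtype,
    )

# model = load_hf_model("JackFram/llama-160m", use_full_precision=False)

Selecting the dtype per from_pretrained call keeps half-precision and full-precision downloads independent when both run in the same process, which a global default tensor type cannot guarantee.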
- # Use local cache, or download new version self.weights_path = os.path.join( os.path.expanduser(self.cache_path), @@ -218,7 +211,11 @@ def download_hf_weights_if_needed(self): ) # Download model from HuggingFace, or load it from the local folder hf_model = AutoModelForCausalLM.from_pretrained( - self.model_name, trust_remote_code=True + self.model_name, + trust_remote_code=True, + torch_dtype=torch.float32 + if self.data_type == DataType.DT_FLOAT + else torch.float16, ) # Print log message to notify user download of model has finished if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): @@ -229,6 +226,10 @@ def download_hf_weights_if_needed(self): with open(ff_revision_file, "w+") as f: f.write(latest_revision) print("Done converting the weights...") + # Deallocate hf model + del hf_model + gc.collect() + torch.cuda.empty_cache() else: print(f"Loading '{self.model_name}' model weights from the cache...") @@ -521,7 +522,7 @@ def __init__( ): self.hf_config = PeftConfig.from_pretrained(peft_model_id) self.peft_model_id = peft_model_id - self.peft_type: self.hf_config.peft_type + self.peft_type = self.hf_config.peft_type if self.peft_type != "LORA": raise RuntimeError( f"PEFT type {self.peft_type} not yet supported in FlexFlow" @@ -531,19 +532,13 @@ def __init__( self.cache_path = cache_path if len(cache_path) > 0 else "~/.cache/flexflow" self.refresh_cache = refresh_cache # Base model related - self.supported_base_models = _SupportedModels() if "base_model_name_or_path" not in self.hf_config.to_dict(): raise ValueError( f"PEFT model {peft_model_id} does not have an associated based model" ) - self.base_model_hf_config = AutoConfig.from_pretrained( - self.hf_config.base_model_name_or_path, trust_remote_code=True + self.base_model = LLM( + self.hf_config.base_model_name_or_path, data_type, cache_path, refresh_cache ) - ( - self.base_model_type, - self.base_model_class, - self.base_config_class, - ) = self.supported_base_models.get_ff_model_type(self.base_model_hf_config) def download_hf_config(self): """Save the HuggingFace model configs to a json file. Useful mainly to run the C++ inference code.""" @@ -555,7 +550,7 @@ def download_hf_config(self): print(f"Creating directory {self.config_dir} (if it doesn't exist)...") print(f"Saving {self.peft_model_id} configs to file {self.config_path}...") with open(self.config_path, "w") as json_file: - json.dump(self.hf_config.to_dict(), json_file, indentation=2) + json.dump(self.hf_config.to_dict(), json_file, indent=2) def __get_revision_hashes(self, peft_model_id: str): ff_revision = None @@ -578,21 +573,17 @@ def __get_revision_hashes(self, peft_model_id: str): def convert_peft_model(self, hf_peft_model, weights_path): for name, params in hf_peft_model.named_parameters(): - name = name.replace("base_model.model.model.", "").replace(".default", "") - name = self.base_model_class.convert_hf_weight_name(name) - params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") + if self.peft_type.lower() in name: + name = name.replace("base_model.model.model.", "").replace( + ".default", "" + ) + name = self.base_model.model_class.convert_hf_weight_name(name) + params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") def download_hf_weights_if_needed(self): """Check in the folder specified by the cache_path whether the PEFT's model weights are available and up to date. If not, or if the refresh_cache parameter is set to True, download new weights. 
""" - if self.data_type == DataType.DT_HALF: - torch.set_default_tensor_type(torch.HalfTensor) - elif self.data_type == DataType.DT_FLOAT: - torch.set_default_tensor_type(torch.FloatTensor) - else: - assert False, "Data type not yet supported -- cannot download weights!" - # Use local cache, or download new version self.weights_path = os.path.join( os.path.expanduser(self.cache_path), @@ -629,25 +620,35 @@ def download_hf_weights_if_needed(self): print( f"'{self.peft_model_id}' local model weights were updated! Converting new weights now..." ) - # Download model from HuggingFace, or load it from the local folder - hf_model = AutoModelForCausalLM.from_pretrained( + # Download base model from HuggingFace, or load it from the local folder + self.base_model.download_hf_weights_if_needed() + self.base_model.download_hf_tokenizer_if_needed() + self.base_model.download_hf_config() + hf_base_model = AutoModelForCausalLM.from_pretrained( self.hf_config.base_model_name_or_path, return_dict=True, trust_remote_code=True, - torch_dtype=torch.float32 if self.data_type == DataType.DT_FLOAT else torch.float16, - device_map="auto", + torch_dtype=torch.float32 + if self.data_type == DataType.DT_FLOAT + else torch.float16, + # device_map="auto", ) - hf_peft_model = PeftModel.from_pretrained(hf_model, self.peft_model_id) + hf_peft_model = PeftModel.from_pretrained(hf_base_model, self.peft_model_id) # Print log message to notify user download of model has finished if not os.path.exists(self.peft_model_id) or os.path.isdir( self.peft_model_id ): print("Done downloading HF weights. Converting them now...") # Convert the model to FlexFlow format - self.__convert_peft_model(hf_peft_model, self.weights_path) + self.convert_peft_model(hf_peft_model, self.weights_path) # Save new revision hash to file with open(ff_revision_file, "w+") as f: f.write(latest_revision) print("Done converting the weights...") + # Deallocate hf model + del hf_peft_model + del hf_base_model + gc.collect() + torch.cuda.empty_cache() else: print(f"Loading '{self.peft_model_id}' model weights from the cache...") From aa9f0046f74af4368c5d9a715549e906861f9b03 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 27 Oct 2023 05:08:19 +0000 Subject: [PATCH 051/198] add script to fine-tune models --- .gitignore | 4 +++- tests/peft/fine_tune.sh | 19 +++++++++++++++++++ tests/peft/hf_finetune.py | 27 +++++++++++++++++---------- 3 files changed, 39 insertions(+), 11 deletions(-) create mode 100755 tests/peft/fine_tune.sh diff --git a/.gitignore b/.gitignore index 8fcc105f01..a032f80f77 100644 --- a/.gitignore +++ b/.gitignore @@ -186,4 +186,6 @@ gpt_tokenizer # pip version python/flexflow/version.txt -inference_tensors \ No newline at end of file +inference_tensors + +Untitled-1.ipynb \ No newline at end of file diff --git a/tests/peft/fine_tune.sh b/tests/peft/fine_tune.sh new file mode 100755 index 0000000000..dbcdb849fa --- /dev/null +++ b/tests/peft/fine_tune.sh @@ -0,0 +1,19 @@ +#! 
/usr/bin/env bash +set -e +set -x + +# Cd into directory holding this script +cd "${BASH_SOURCE[0]%/*}" + +python hf_finetune.py --model-name decapoda-research/llama-7b-hf --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-7b-lora-full +python hf_finetune.py --model-name decapoda-research/llama-7b-hf --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-7b-lora-half +python hf_finetune.py --model-name JackFram/llama-160m-base --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-160m-lora-full +python hf_finetune.py --model-name JackFram/llama-160m-base --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-160m-lora-half + +python hf_finetune.py --model-name meta-llama/Llama-2-7b-hf --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-2-7b-lora-full +python hf_finetune.py --model-name meta-llama/Llama-2-7b-hf --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-2-7b-lora-half + +python hf_finetune.py --model-name facebook/opt-6.7b --lora-target-modules fc2 --use-full-precision --publish-peft-with-id goliaro/opt-6.7b-lora-full +python hf_finetune.py --model-name facebook/opt-6.7b --lora-target-modules fc2 --publish-peft-with-id goliaro/opt-6.7b-lora-half +python hf_finetune.py --model-name facebook/opt-125m --lora-target-modules fc2 --use-full-precision --publish-peft-with-id goliaro/opt-125m-lora-full +python hf_finetune.py --model-name facebook/opt-125m --lora-target-modules fc2 --publish-peft-with-id goliaro/opt-125m-lora-half diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py index 14aad1b9cc..d702d23038 100644 --- a/tests/peft/hf_finetune.py +++ b/tests/peft/hf_finetune.py @@ -32,16 +32,22 @@ def main(): parser.add_argument("--model-name", type=str, default="decapoda-research/llama-7b-hf") parser.add_argument("--lora-rank", type=int, default=16) parser.add_argument("--lora-alpha", type=int, default=32) + parser.add_argument("--lora-target-modules", type=str, default="down_proj", help="Comma-separated list of layers from the base model to target") parser.add_argument("--lora-dropout", type=float, default=0.05) parser.add_argument("--use-full-precision", action="store_true", help="Use full precision") - parser.add_argument("--output-dir", type=str, default="./finetuned-llama") + parser.add_argument("--output-dir", type=str, default="") + parser.add_argument("--publish-peft-with-id", type=str, default="") args = parser.parse_args() model_name = args.model_name use_full_precision=args.use_full_precision lora_rank = args.lora_rank lora_alpha = args.lora_alpha + lora_target_modules = args.lora_target_modules.split(",") lora_dropout = args.lora_dropout output_dir = args.output_dir + publish_peft_with_id = args.publish_peft_with_id + if len(output_dir) == 0 and len(publish_peft_with_id) == 0: + raise ValueError("Please pass either a --output-dir or a --publish-peft-with-id to specify where to store the fine-tuned model") # Change working dir to folder storing this script abspath = os.path.abspath(__file__) @@ -81,7 +87,8 @@ def main(): r=lora_rank, lora_alpha=lora_alpha, #target_modules=["q_proj", "v_proj"], - target_modules=["down_proj"], + #target_modules=["down_proj"], + target_modules=lora_target_modules, lora_dropout=lora_dropout, bias="none", task_type="CAUSAL_LM" @@ -105,20 +112,20 @@ def main(): learning_rate=2e-4, fp16=True if not use_full_precision else False, logging_steps=1, - 
output_dir=os.path.join(output_dir, "logs"), + output_dir=os.path.join(output_dir if len(output_dir) > 0 else "./", "lora_training_logs"), ), data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False) ) model.config.use_cache = False # silence the warnings. Please re-enable for inference! trainer.train() - print(f"Done fine-tuning! Saving the model to {output_dir}...") - model.save_pretrained(output_dir) - - # Upload to HF hub - #from huggingface_hub import notebook_login - #notebook_login() - #model.push_to_hub("goliaro/llama-7b-lora-half", use_auth_token=True) + if len(output_dir) > 0: + print(f"Done fine-tuning! Saving the model to {output_dir}...") + model.save_pretrained(output_dir) + + if len(publish_peft_with_id) > 0: + print(f"Done fine-tuning! Uploading the model to HF hub with id: {publish_peft_with_id}...") + model.push_to_hub(publish_peft_with_id, use_auth_token=True) if __name__ == "__main__": main() \ No newline at end of file From 4609e9e33aec98c34ba9bae71d8e9141e46e2a89 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 31 Oct 2023 02:46:36 +0000 Subject: [PATCH 052/198] implement loading lora configs/weights from file --- include/flexflow/ops/lora_linear_params.h | 13 +++ inference/incr_decoding/incr_decoding.cc | 16 +++- inference/models/llama.cc | 7 +- src/ops/lora_linear.cc | 105 ++++++++++++++++++++-- src/ops/lora_linear_params.cc | 54 ++++++++++- src/ops/rms_norm.cc | 4 +- src/parallel_ops/combine.cc | 3 + 7 files changed, 188 insertions(+), 14 deletions(-) diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h index 46ee4ac6b7..e82243fd67 100644 --- a/include/flexflow/ops/lora_linear_params.h +++ b/include/flexflow/ops/lora_linear_params.h @@ -3,6 +3,7 @@ #include "flexflow/ffconst.h" #include "flexflow/fftype.h" +#include "flexflow/inference.h" #include "flexflow/op_meta.h" #include "flexflow/operator.h" #include "flexflow/parallel_tensor.h" @@ -16,19 +17,31 @@ class LoraLinearConfig { LoraLinearConfig(int rank, OptimizerType type = OPTIMIZER_TYPE_SGD, float learning_rate = 1e-4); + LoraLinearConfig(std::string const &cache_folder_, + std::string const &peft_model_id_); friend bool operator==(LoraLinearConfig const &lhs, LoraLinearConfig const &rhs); + friend std::ostream &operator<<(std::ostream &os, + LoraLinearConfig const &llc); public: int rank; OptimizerType optimizer_type; float learning_rate; + std::string cache_folder; + // Huggingface + std::string peft_model_id; + int lora_alpha; + float lora_dropout; + // whether to load weights from file, instead of initializing them randomly + bool load_weights_from_file; }; class LoraLinearParams { public: LayerID layer_guid; OperatorType type; + char name[MAX_OPNAME]; bool is_valid(std::pair const &input_shape) const; diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index b74292ad9d..0017fe3fcb 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -39,6 +39,7 @@ void parse_input_args(char **argv, int argc, FilePaths &paths, std::string &llm_model_name, + std::string &peft_model_name, bool &use_full_precision, bool &verbose, bool &do_sample, @@ -56,6 +57,13 @@ void parse_input_args(char **argv, } continue; } + if (!strcmp(argv[i], "-peft-model")) { + peft_model_name = std::string(argv[++i]); + for (char &c : peft_model_name) { + c = std::tolower(c); + } + continue; + } // cache folder if (!strcmp(argv[i], "-cache-folder")) { paths.cache_folder_path = 
std::string(argv[++i]); @@ -124,7 +132,7 @@ void FlexFlow::top_level_task(Task const *task, assert(false && "Doesn't support quantization in non-offload mode"); } FilePaths file_paths; - std::string llm_model_name; + std::string llm_model_name, peft_model_name; bool use_full_precision = false; bool verbose = false; bool do_sample = false; @@ -141,6 +149,7 @@ void FlexFlow::top_level_task(Task const *task, argc, file_paths, llm_model_name, + peft_model_name, use_full_precision, verbose, do_sample, @@ -258,7 +267,10 @@ void FlexFlow::top_level_task(Task const *task, } // Register PEFT layer - LoraLinearConfig mlp_second(4 /*rank*/); + LoraLinearConfig mlp_second = + peft_model_name.empty() + ? LoraLinearConfig::DefaultConfig + : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); PEFTModelID peft_model_id = model.register_peft_model( LoraLinearConfig::DefaultConfig /*mlp_first*/, mlp_second /*mlp_second*/); diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 72641161d1..9950d5b080 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -220,7 +220,12 @@ void LLAMA::create_llama_model(FFModel &ff, std::string("layers_" + std::to_string(i) + "_feed_forward_w2") .c_str()); // Low-Rank Adapter (LoRA) for the second linear layer - ff.lora_linear(multi, w2, OP_LORA_MLP_SECOND); + ff.lora_linear( + multi, + w2, + OP_LORA_MLP_SECOND, + std::string("layers_" + std::to_string(i) + "_feed_forward_w2_lora") + .c_str()); } // final normalization and linear Tensor final_rms_norm_output[2] = {nullptr, nullptr}; diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 2e356f7531..3d2d8d6106 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -6,6 +6,11 @@ #include "flexflow/utils/hash_utils.h" #include "flexflow/utils/peft_weight_allocator.h" #include "legion/legion_utilities.h" +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include "flexflow/utils/cuda_helper.h" +#else +#include "flexflow/utils/hip_helper.h" +#endif namespace FlexFlow { @@ -86,7 +91,7 @@ LoraLinear::LoraLinear(FFModel &model, params.type, inputs.first, inputs.second, - name) {} + params.name) {} LoraLinear::LoraLinear(FFModel &model, LayerID const &_layer_guid, @@ -259,6 +264,34 @@ void LoraLinear::register_peft_model( fm.wait_all_results(); } +template +void load_peft_from_file(DT *ptr, + size_t size, + int shard_id, + std::string filepath) { + std::ifstream in(filepath, std::ios::in | std::ios::binary); + if (!in.good()) { + printf("Could not open file: %s\n", filepath.c_str()); + } + assert(in.good() && "incorrect weight file path"); + std::vector
host_array(size); + size_t target_data_size = sizeof(DT) * size; + in.seekg(shard_id * target_data_size, in.beg); + in.read((char *)host_array.data(), target_data_size); + + size_t in_get_size = in.gcount(); + if (in_get_size != target_data_size) { + printf("load weight data error: %lu, %lu, %lu\n", + in_get_size, + target_data_size, + sizeof(DT)); + assert(false); + } + assert(size == host_array.size()); + copy_kernel(ptr, host_array.data(), target_data_size); + in.close(); +} + void LoraLinear::register_model_task(Task const *task, std::vector const ®ions, Context ctx, @@ -267,10 +300,16 @@ void LoraLinear::register_model_task(Task const *task, static_cast(task->args); LoraLinearMeta *m = *((LoraLinearMeta **)task->local_args); LoraLinear const *lora = info->lora; + + int shard_id = task->index_point.point_data[0]; + int rank = info->lora_config.rank; int num_dims = lora->inputs[0]->num_dims; int in_dim = lora->inputs[0]->dims[0].size / lora->inputs[0]->dims[0].degree; int out_dim = lora->inputs[1]->dims[0].size / lora->inputs[1]->dims[0].degree; + int w0_num_elements = rank * in_dim; + int w1_num_elements = rank * out_dim; + DataType dt = m->input_type[0]; assert(dt == m->input_type[1]); assert(dt == m->output_type[0]); @@ -278,29 +317,71 @@ void LoraLinear::register_model_task(Task const *task, assert(dt == lora->inputs[1]->data_type); assert(dt == lora->outputs[0]->data_type); assert(m->model_weights.find(info->model_id) == m->model_weights.end()); + LoraLinearWeight weight; PEFTWeightAllocator *allocator = m->handle.peft_weight_allocator; weight.w0_ptr = allocator->allocate_local_weights_untyped( - info->model_id, rank * in_dim * data_type_size(dt)); + info->model_id, w0_num_elements * data_type_size(dt)); weight.w1_ptr = allocator->allocate_local_weights_untyped( - info->model_id, rank * out_dim * data_type_size(dt)); + info->model_id, w1_num_elements * data_type_size(dt)); + + // get layer name + assert(lora->name != nullptr && + "Layer name is not set, cannot determine weights location"); + std::string lora_layername = std::string(lora->name); + std::string searchString = "lora"; + size_t found = lora_layername.find(searchString); + if (found == std::string::npos) { + std::cout << "LoraLinear layer name not in the right format (does not " + "contain word 'lora')" + << std::endl; + assert(false); + } + std::string lora_layername_substr = + lora_layername.substr(0, found + searchString.length()); + + // load weights from file + std::string weights_folder_filepath = join_path({ + info->lora_config.cache_folder, + "weights", + info->lora_config.peft_model_id, + dt == DT_FLOAT ? 
"full-precision" : "half-precision", + }); + std::string w0_filepath = + join_path({weights_folder_filepath, lora_layername_substr + "_A_weight"}); + std::string w1_filepath = + join_path({weights_folder_filepath, lora_layername_substr + "_B_weight"}); + if (dt == DT_FLOAT) { + load_peft_from_file( + (float *)weight.w0_ptr, w0_num_elements, shard_id, w0_filepath); + load_peft_from_file( + (float *)weight.w1_ptr, w1_num_elements, shard_id, w1_filepath); + } else if (dt == DT_HALF) { + load_peft_from_file( + (half *)weight.w0_ptr, w0_num_elements, shard_id, w0_filepath); + load_peft_from_file( + (half *)weight.w1_ptr, w1_num_elements, shard_id, w1_filepath); + } else { + assert(false && "Data type not supported"); + } + weight.rank = rank; if (lora->inputs[0]->dims[num_dims - 1].degree == 1) { // Input is partitioned (no replication) // w0_grad is local weight gradients weight.w0_grad_ptr = allocator->allocate_local_weights_untyped( - info->model_id, rank * in_dim * data_type_size(dt)); + info->model_id, w0_num_elements * data_type_size(dt)); // w1_grad is sync weight gradients weight.w1_grad_ptr = allocator->allocate_sync_weights_untyped( - info->model_id, rank * out_dim * data_type_size(dt)); + info->model_id, w1_num_elements * data_type_size(dt)); } else { // Input is replicated // w0_grad is sync weight gradients weight.w0_grad_ptr = allocator->allocate_sync_weights_untyped( - info->model_id, rank * in_dim * data_type_size(dt)); + info->model_id, w0_num_elements * data_type_size(dt)); // w1_grad is local weight gradients weight.w1_grad_ptr = allocator->allocate_local_weights_untyped( - info->model_id, rank * out_dim * data_type_size(dt)); + info->model_id, w1_num_elements * data_type_size(dt)); } m->model_weights[info->model_id] = weight; } @@ -483,6 +564,8 @@ void LoraLinear::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.transformer_layer_id); sez.serialize(this->layer_guid.model_id); sez.serialize(this->op_type); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } /* static */ @@ -494,15 +577,20 @@ Node LoraLinear::deserialize(FFModel &ff, assert(num_inputs == 2); size_t id, transformer_layer_id, deserialized_model_id; OperatorType op_type; + size_t name_len; + char name[MAX_OPNAME]; dez.deserialize(id); dez.deserialize(transformer_layer_id); dez.deserialize(deserialized_model_id); dez.deserialize(op_type); + dez.deserialize(name_len); + dez.deserialize(name, name_len); LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); LoraLinearParams params; params.layer_guid = layer_guid; params.type = op_type; + strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); } @@ -517,6 +605,9 @@ LoraLinearParams LoraLinear::get_params() const { LoraLinearParams params; params.layer_guid = this->layer_guid; params.type = this->op_type; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } diff --git a/src/ops/lora_linear_params.cc b/src/ops/lora_linear_params.cc index 80e7c6d64e..9d797aaed2 100644 --- a/src/ops/lora_linear_params.cc +++ b/src/ops/lora_linear_params.cc @@ -1,13 +1,50 @@ #include "flexflow/ops/lora_linear_params.h" +#include +#include +#include +using json = nlohmann::json; namespace FlexFlow { const LoraLinearConfig LoraLinearConfig::DefaultConfig = LoraLinearConfig(); LoraLinearConfig::LoraLinearConfig() - : rank(0), optimizer_type(OPTIMIZER_TYPE_NONE), learning_rate(0.0f) {} + : rank(0), optimizer_type(OPTIMIZER_TYPE_NONE), 
learning_rate(0.0f), + cache_folder(""), peft_model_id(""), lora_alpha(0), lora_dropout(0.0f), + load_weights_from_file(false) {} LoraLinearConfig::LoraLinearConfig(int _rank, OptimizerType _type, float _lr) - : rank(_rank), optimizer_type(_type), learning_rate(_lr) {} + : rank(_rank), optimizer_type(_type), learning_rate(_lr), cache_folder(""), + peft_model_id(""), lora_alpha(0), lora_dropout(0.0f), + load_weights_from_file(false) {} + +LoraLinearConfig::LoraLinearConfig(std::string const &cache_folder_, + std::string const &peft_model_id_) { + cache_folder = cache_folder_; + peft_model_id = peft_model_id_; + std::string peft_inference_config_file_path = + join_path({cache_folder, "configs", peft_model_id, "config.json"}); + std::ifstream config_file(peft_inference_config_file_path); + if (config_file.is_open()) { + try { + json model_config; + config_file >> model_config; + rank = model_config["r"]; + lora_alpha = model_config["lora_alpha"]; + lora_dropout = model_config["lora_dropout"]; + } catch (json::exception const &e) { + std::cerr << "Error parsing PEFT config from JSON file: " << e.what() + << std::endl; + assert(false); + } + } else { + std::cerr << "Error opening JSON file " << peft_inference_config_file_path + << std::endl; + assert(false); + } + optimizer_type = OPTIMIZER_TYPE_NONE; + learning_rate = 0.0f; + load_weights_from_file = true; +} bool operator==(LoraLinearConfig const &lhs, LoraLinearConfig const &rhs) { if (lhs.rank == rhs.rank && lhs.optimizer_type == rhs.optimizer_type && @@ -17,4 +54,17 @@ bool operator==(LoraLinearConfig const &lhs, LoraLinearConfig const &rhs) { return false; } +std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc) { + os << "LoraLinearConfig: "; + os << "rank: " << llc.rank << ", "; + os << "optimizer_type: " << llc.optimizer_type << ", "; + os << "learning_rate: " << llc.learning_rate << ", "; + os << "cache_folder: " << llc.cache_folder << ", "; + os << "peft_model_id: " << llc.peft_model_id << ", "; + os << "lora_alpha: " << llc.lora_alpha << ", "; + os << "lora_dropout: " << llc.lora_dropout << ", "; + os << "load_weights_from_file: " << llc.load_weights_from_file << std::endl; + return os; +} + }; // namespace FlexFlow diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index 3c1b4d2570..1a9bd7704e 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -543,14 +543,14 @@ Legion::FutureMap READ_WRITE, EXCLUSIVE, batch_inputs[0]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(1, FID_DATA); // regions[2](I): weight launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(2, FID_DATA); return runtime->execute_index_space(ctx, launcher); } diff --git a/src/parallel_ops/combine.cc b/src/parallel_ops/combine.cc index 8411b42602..3433e2f21b 100644 --- a/src/parallel_ops/combine.cc +++ b/src/parallel_ops/combine.cc @@ -84,6 +84,9 @@ Combine::Combine(FFModel &model, dims[i] = _input->dims[i]; } assert(combine_degree > 0 && "Must use combine_degree > 0"); + std::cout << "combine_dim : " << combine_dim + << ", dims[combine_dim].degree: " << dims[combine_dim].degree + << ", combine_degree: " << combine_degree << std::endl; assert(dims[combine_dim].degree % combine_degree == 0); dims[combine_dim].degree /= combine_degree; ParallelTensorBase::update_parallel_ids(numdim, dims); From 17fa6f3f514a0a6cdbf2179f6140bbe8b670ea3c Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: 
Tue, 31 Oct 2023 03:39:13 +0000 Subject: [PATCH 053/198] remove peft_bwd assertion failure in embedding --- include/flexflow/ops/embedding.h | 5 +++++ src/ops/embedding.cc | 10 ++++++++++ 2 files changed, 15 insertions(+) diff --git a/include/flexflow/ops/embedding.h b/include/flexflow/ops/embedding.h index ae93ef4d1d..cd9ab4a775 100644 --- a/include/flexflow/ops/embedding.h +++ b/include/flexflow/ops/embedding.h @@ -60,6 +60,11 @@ class Embedding : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; // void update(const FFModel&); void print_layer(FFModel const &model) override { assert(0); diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc index 007e799fe0..ea82a62071 100644 --- a/src/ops/embedding.cc +++ b/src/ops/embedding.cc @@ -609,6 +609,16 @@ void Embedding::backward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } +Legion::FutureMap + Embedding::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + // nothing to do (backward function only updates weights) + return FutureMap(); +} + void Embedding::backward_task(Task const *task, std::vector const ®ions, Context ctx, From cdc12e63014ccb644d1c6ebe7c6ffaf5582c7ceb Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 31 Oct 2023 15:52:21 -0400 Subject: [PATCH 054/198] fix download script --- inference/utils/download_hf_model.py | 4 ++-- inference/utils/download_peft_model.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/inference/utils/download_hf_model.py b/inference/utils/download_hf_model.py index 03fc8e1633..94a8c23e68 100644 --- a/inference/utils/download_hf_model.py +++ b/inference/utils/download_hf_model.py @@ -36,9 +36,9 @@ def parse_args(): def main(args): if args.full_precision_only: - data_types = ff.DataType.DT_FLOAT + data_types = (ff.DataType.DT_FLOAT,) elif args.half_precision_only: - data_types = ff.DataType.DT_HALF + data_types = (ff.DataType.DT_HALF,) else: data_types = (ff.DataType.DT_FLOAT, ff.DataType.DT_HALF) diff --git a/inference/utils/download_peft_model.py b/inference/utils/download_peft_model.py index 1204634388..5c7704b6f0 100644 --- a/inference/utils/download_peft_model.py +++ b/inference/utils/download_peft_model.py @@ -36,9 +36,9 @@ def parse_args(): def main(args): if args.full_precision_only: - data_types = ff.DataType.DT_FLOAT + data_types = (ff.DataType.DT_FLOAT,) elif args.half_precision_only: - data_types = ff.DataType.DT_HALF + data_types = (ff.DataType.DT_HALF,) else: data_types = (ff.DataType.DT_FLOAT, ff.DataType.DT_HALF) From eb9e2b84c0fc9a629c97e17432ff26d4e08a5203 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 31 Oct 2023 16:54:11 -0400 Subject: [PATCH 055/198] add peft dependencies in dockerfile --- docker/flexflow-environment/Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index 0e9a3cda82..cae51f1446 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -95,6 +95,8 @@ RUN conda install -c conda-forge cmake make pillow cmake-build-extension pybind1 RUN conda install pytorch torchvision torchaudio -c pytorch RUN conda install -c conda-forge onnx transformers>=4.31.0 
sentencepiece einops RUN pip3 install tensorflow notebook +# PEFT-related +RUN pip3 install scipy bitsandbytes datasets accelerate loralib peft # Install Rust RUN curl https://sh.rustup.rs -sSf | sh -s -- -y From 3dfa14d5a9334a21c224c94785136cd04ffcc2b8 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 31 Oct 2023 21:51:20 +0000 Subject: [PATCH 056/198] fix softmax backward --- src/ops/kernels/softmax.cu | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index 9ccce40c58..96d50e1ca4 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -317,11 +317,12 @@ void peft_bwd_kernel(SoftmaxMeta const *m, GET_BLOCKS(num_bwd_tokens * num_classes), CUDA_NUM_THREADS, 0, - stream>>>(input_grad_ptr + tokens_previous_requests * num_classes, - output_grad_ptr + tokens_previous_requests * num_classes, - token_ids, - num_bwd_tokens, - num_classes); + stream>>>( + input_grad_ptr + tokens_previous_requests * num_classes, + output_grad_ptr + tokens_previous_requests * num_classes, + static_cast(m->handle.workSpace), + num_bwd_tokens, + num_classes); tokens_previous_requests += num_bwd_tokens; } From 78523e892cd928687c42861ac1ce20d424b9de03 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 1 Nov 2023 04:04:14 +0000 Subject: [PATCH 057/198] fix bc print indentation --- src/runtime/batch_config.cc | 4 ++-- src/runtime/beam_search_batch_config.cc | 4 ++-- src/runtime/tree_verify_batch_config.cc | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index 1a6e32e582..f5d69d1992 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -124,8 +124,8 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) { << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; // PEFT values - os << "PEFT Model ID: " << bc.requestsInfo[i].peft_model_id << std::endl; - os << "PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; + os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id << std::endl; + os << " PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; os << " Max sequence length: " << bc.requestsInfo[i].max_sequence_length << std::endl; os << " Request completed: " << bc.request_completed[i] << std::endl; diff --git a/src/runtime/beam_search_batch_config.cc b/src/runtime/beam_search_batch_config.cc index 82674cce69..bfcf30454c 100644 --- a/src/runtime/beam_search_batch_config.cc +++ b/src/runtime/beam_search_batch_config.cc @@ -134,8 +134,8 @@ std::ostream &operator<<(std::ostream &os, BeamSearchBatchConfig const &bc) { << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; // PEFT values - os << "PEFT Model ID: " << bc.requestsInfo[i].peft_model_id << std::endl; - os << "PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; + os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id << std::endl; + os << " PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; os << " Max sequence length: " << bc.requestsInfo[i].max_sequence_length << std::endl; os << " Request completed: " << bc.request_completed[i] << std::endl; diff --git a/src/runtime/tree_verify_batch_config.cc b/src/runtime/tree_verify_batch_config.cc index ea6e383453..f87500db74 100644 --- a/src/runtime/tree_verify_batch_config.cc +++ b/src/runtime/tree_verify_batch_config.cc @@ -55,8 +55,8 @@ 
std::ostream &operator<<(std::ostream &os, TreeVerifyBatchConfig const &bc) { << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; // PEFT values - os << "PEFT Model ID: " << bc.requestsInfo[i].peft_model_id << std::endl; - os << "PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; + os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id << std::endl; + os << " PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; os << " Max sequence length: " << bc.requestsInfo[i].max_sequence_length << std::endl; os << " Request completed: " << bc.request_completed[i] << std::endl; From bf78ea47c1477c68e359b55938e3b3c74015027d Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 2 Nov 2023 19:42:04 +0000 Subject: [PATCH 058/198] Temporarily Revert "Update the default cublas behavior when CUDA_VERSION is not specified" This reverts commit 4ee710a76ee4f47b4574c57519e2b0fb96efaa6a. --- src/ops/inc_multihead_self_attention.cpp | 14 ++++++-------- src/ops/inc_multihead_self_attention.cu | 12 ++++++------ src/ops/kernels/linear_kernels.cpp | 18 ++++++++---------- src/ops/kernels/linear_kernels.cu | 12 ++++++------ src/ops/spec_inc_multihead_self_attention.cpp | 7 +++---- src/ops/spec_inc_multihead_self_attention.cu | 6 +++--- src/ops/tree_inc_multihead_self_attention.cpp | 7 +++---- src/ops/tree_inc_multihead_self_attention.cu | 6 +++--- 8 files changed, 38 insertions(+), 44 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index 4495f66844..8acdba7c25 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -257,11 +257,10 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, DT alpha = 1.0f, beta = 0.0f; assert(m->qSize == m->vSize && m->qSize == m->kSize); hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - hipblasDatatype_t compute_type = hipblas_data_type; +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to HIPBLAS_COMPUTE_16F for best performance + cublasComputeType_t compute_type = HIPBLAS_COMPUTE_16F; #else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; hipblasDatatype_t compute_type = hipblas_data_type; #endif // Compute (W^T)x matmul: einsum(ijkl,im->jmkl) @@ -510,11 +509,10 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - hipblasDatatype_t compute_type = hipblas_data_type; +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; #else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; hipblasDatatype_t compute_type = hipblas_data_type; #endif // int num_requests = bc->num_active_requests(); diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index b83d23804c..7c881bf961 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -238,11 +238,11 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, DT 
alpha = 1.0f, beta = 0.0f; assert(m->qSize == m->vSize && m->qSize == m->kSize); cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cudaDataType_t compute_type = cublas_data_type; -#else +#if CUDA_VERSION >= 11000 // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#else + cudaDataType_t compute_type = cublas_data_type; #endif // Compute (W^T)x matmul: einsum(ijkl,im->jmkl) // Weights: qSize x qProjSize x 3 x num_q_heads @@ -873,11 +873,11 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cudaDataType_t compute_type = cublas_data_type; -#else +#if CUDA_VERSION >= 11000 // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#else + cudaDataType_t compute_type = cublas_data_type; #endif // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_tokens(); diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp index 2e8761472f..e24f5fe58f 100644 --- a/src/ops/kernels/linear_kernels.cpp +++ b/src/ops/kernels/linear_kernels.cpp @@ -274,12 +274,11 @@ void forward_kernel(LinearMeta const *m, hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - hipblasDatatype_t compute_type = hipblas_data_type; +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; #else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - hipblasDatatype_t compute_type = hipblas_data_type; + hipblasDatatype_t compute_type = input_type; #endif checkCUDA(hipblasGemmEx(m->handle.blas, HIPBLAS_OP_T, @@ -440,12 +439,11 @@ void backward_kernel(LinearMeta const *m, hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - hipblasDatatype_t compute_type = hipblas_data_type; +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; #else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - hipblasDatatype_t compute_type = hipblas_data_type; + hipblasDatatype_t compute_type = HIPBLAS_R_32F; #endif int output_size = out_dim * batch_size; if (m->activation == AC_MODE_RELU) { diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 4627179fc4..1897f11148 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -365,11 +365,11 @@ void forward_kernel(LinearMeta const *m, : ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); 
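[Editor's note: illustrative sketch, not part of the patch series.] The hunks above and below keep toggling the same compute-type guard in several kernels, which is easy to lose in the diff noise. The snippet below restates the CUDA-path selection logic that the later "Fix cublas default (#1220)" patch in this series settles on. The helper name pick_cublas_compute_type is hypothetical; CUDA_VERSION, CUDA_R_32F, CUBLAS_COMPUTE_16F, and CUBLAS_COMPUTE_32F_FAST_16F are the macros and enums actually used in the diffs.

    #include <cuda.h>
    #include <cublas_v2.h>

    // Sketch: on CUDA < 11.0, cublasGemmEx takes a cudaDataType_t compute type,
    // so the kernels fall back to the tensor's own data type. On CUDA >= 11.0
    // they request CUBLAS_COMPUTE_16F for half-precision GEMMs and the
    // tensor-core-backed CUBLAS_COMPUTE_32F_FAST_16F for full precision.
    #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
    static cudaDataType_t pick_cublas_compute_type(cudaDataType_t data_type) {
      return data_type;
    }
    #else
    static cublasComputeType_t pick_cublas_compute_type(cudaDataType_t data_type) {
      return (data_type == CUDA_R_32F) ? CUBLAS_COMPUTE_32F_FAST_16F
                                       : CUBLAS_COMPUTE_16F;
    }
    #endif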
assert(input_type == weight_type && weight_type == output_type); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cudaDataType_t compute_type = cublas_data_type; -#else +#if CUDA_VERSION >= 11000 // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#else + cudaDataType_t compute_type = input_type; #endif checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_T, @@ -525,11 +525,11 @@ void backward_kernel(LinearMeta const *m, cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cudaDataType_t compute_type = cublas_data_type; -#else +#if CUDA_VERSION >= 11000 // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#else + cudaDataType_t compute_type = CUDA_R_32F; #endif int output_size = out_dim * batch_size; if (m->activation == AC_MODE_RELU) { diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index 6252693d1a..569dd7f1e5 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -200,11 +200,10 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - hipblasDatatype_t compute_type = hipblas_data_type; +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; #else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; hipblasDatatype_t compute_type = hipblas_data_type; #endif // int num_requests = bc->num_active_requests(); diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index e986c4f34d..4338374dca 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -215,11 +215,11 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cudaDataType_t compute_type = cublas_data_type; -#else +#if CUDA_VERSION >= 11000 // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#else + cudaDataType_t compute_type = cublas_data_type; #endif // int num_requests = bc->num_active_requests(); // int tokens_previous_requests = 0; diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 61117ce6df..e5bec2bc07 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -157,11 +157,10 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); 
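[Editor's note: illustrative sketch, not part of the patch series.] The .cpp hunks here are the HIP builds of the same attention and linear kernels. On that path CUDA_VERSION is normally not defined and, judging from the hipblasGemmEx calls in these files, the compute type is passed as a hipblasDatatype_t, so after this revert (and the follow-up fix) the HIP kernels simply compute in the output tensor's own datatype. A minimal restatement under those assumptions (pick_hipblas_compute_type is a hypothetical name):

    #include <hipblas.h>  // or <hipblas/hipblas.h>, depending on the ROCm version

    // Sketch: no CUBLAS_COMPUTE_32F_FAST_16F analogue is selected on the HIP
    // path; the compute type is just the hipBLAS datatype of the output.
    static hipblasDatatype_t pick_hipblas_compute_type(hipblasDatatype_t data_type) {
      return data_type;
    }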
miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - hipblasDatatype_t compute_type = hipblas_data_type; +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; #else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; hipblasDatatype_t compute_type = hipblas_data_type; #endif // int num_requests = bc->num_active_requests(); diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 04dc39cfa0..14253e8f61 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -158,11 +158,11 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cudaDataType_t compute_type = cublas_data_type; -#else +#if CUDA_VERSION >= 11000 // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#else + cudaDataType_t compute_type = cublas_data_type; #endif // int num_requests = bc->num_active_requests(); int processed_tokens_in_batch = 0; From b9e7f60b9ca1658fcf608c97b341cc21485a400c Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 2 Nov 2023 16:32:33 -0400 Subject: [PATCH 059/198] Fix cublas default (#1220) * Fix Legion prebuild workflow (2) (#1208) * fix * fix * fix * fix * Fix Legion prebuild workflow (3) (#1210) * fix hip error * use CUBLAS_COMPUTE_FAST_16F for full-precision gemm --------- Co-authored-by: Zhihao Jia --- .github/workflows/helpers/prebuild_legion.sh | 2 +- .github/workflows/prebuild-legion.yml | 6 +- CMakeLists.txt | 260 +++++++++---------- config/config.linux | 4 +- src/ops/inc_multihead_self_attention.cu | 26 +- src/ops/kernels/linear_kernels.cpp | 18 +- src/ops/kernels/linear_kernels.cu | 26 +- src/ops/spec_inc_multihead_self_attention.cu | 13 +- src/ops/tree_inc_multihead_self_attention.cu | 13 +- 9 files changed, 200 insertions(+), 168 deletions(-) diff --git a/.github/workflows/helpers/prebuild_legion.sh b/.github/workflows/helpers/prebuild_legion.sh index ccaa58383e..9f5cbe147a 100755 --- a/.github/workflows/helpers/prebuild_legion.sh +++ b/.github/workflows/helpers/prebuild_legion.sh @@ -13,7 +13,7 @@ else echo "Pre-building Legion with GPU backend: ${gpu_backend}" fi -if [[ "${gpu_backend}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then +if [[ "${gpu_backend}" == "cuda" || "${gpu_backend}" == "hip_cuda" ]]; then # Check that CUDA version is supported. Versions above 12.0 not supported because we don't publish docker images for it yet. 
if [[ "$gpu_backend_version" != @(11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0) ]]; then echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0}" diff --git a/.github/workflows/prebuild-legion.yml b/.github/workflows/prebuild-legion.yml index 1cf0ea2dd8..267daaee6b 100644 --- a/.github/workflows/prebuild-legion.yml +++ b/.github/workflows/prebuild-legion.yml @@ -42,12 +42,12 @@ jobs: - name: Build Legion env: - FF_GPU_BACKEND: ${{ matrix.gpu_backend }} + gpu_backend: ${{ matrix.gpu_backend }} + gpu_backend_version: ${{ matrix.gpu_backend_version }} + python_version: ${{ matrix.python_version }} run: .github/workflows/helpers/prebuild_legion.sh - name: Archive compiled Legion library (CUDA) - env: - FF_GPU_BACKEND: ${{ matrix.gpu_backend }} uses: actions/upload-artifact@v3 with: name: legion_ubuntu-20.04_${{ matrix.gpu_backend }}-${{ matrix.gpu_backend_version }}_py${{ matrix.python_version }} diff --git a/CMakeLists.txt b/CMakeLists.txt index 648b46b49e..f9ce66a0f1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -504,156 +504,156 @@ if(NOT BUILD_LEGION_ONLY) install(PROGRAMS ${CMAKE_BINARY_DIR}/flexflow_python DESTINATION "bin") endif() endif() -endif() - -if (INFERENCE_TESTS) - target_link_libraries(flexflow "${TORCH_LIBRARIES}") - set_property(TARGET flexflow PROPERTY CXX_STANDARD 14) -endif() - -# build binary -option(FF_BUILD_TOKENIZER "build tokenizer=cpp for LLM serving" ON) -option(FF_BUILD_RESNET "build resnet example" OFF) -option(FF_BUILD_RESNEXT "build resnext example" OFF) -option(FF_BUILD_ALEXNET "build alexnet example" OFF) -option(FF_BUILD_DLRM "build DLRM example" OFF) -option(FF_BUILD_XDL "build XDL example" OFF) -option(FF_BUILD_INCEPTION "build inception example" OFF) -option(FF_BUILD_CANDLE_UNO "build candle uno example" OFF) -option(FF_BUILD_TRANSFORMER "build transformer example" OFF) -option(FF_BUILD_MOE "build mixture of experts example" OFF) -option(FF_BUILD_MLP_UNIFY "build mlp unify example" OFF) -option(FF_BUILD_SPLIT_TEST "build split test example" OFF) -option(FF_BUILD_SPLIT_TEST_2 "build split test 2 example" OFF) -option(FF_BUILD_MLP_UNIFY_INFERENCE "build mlp unify inference example" OFF) -option(FF_BUILD_ALL_INFERENCE_EXAMPLES "build all inference examples. Overrides others" OFF) -option(FF_BUILD_ALL_EXAMPLES "build all examples. 
Overrides others" OFF) -option(FF_BUILD_UNIT_TESTS "build non-operator unit tests" OFF) -option(FF_BUILD_SUBSTITUTION_TOOL "build substitution conversion tool" OFF) -option(FF_BUILD_VISUALIZATION_TOOL "build substitution visualization tool" OFF) - -if(FF_BUILD_UNIT_TESTS) - set(BUILD_GMOCK OFF) - add_subdirectory(deps/googletest) - enable_testing() - add_subdirectory(tests/unit) -endif() - - if(FF_BUILD_SUBSTITUTION_TOOL) - add_subdirectory(tools/protobuf_to_json) + + if (INFERENCE_TESTS) + target_link_libraries(flexflow "${TORCH_LIBRARIES}") + set_property(TARGET flexflow PROPERTY CXX_STANDARD 14) endif() - if(FF_BUILD_VISUALIZATION_TOOL) - add_subdirectory(tools/substitutions_to_dot) + # build binary + option(FF_BUILD_TOKENIZER "build tokenizer=cpp for LLM serving" ON) + option(FF_BUILD_RESNET "build resnet example" OFF) + option(FF_BUILD_RESNEXT "build resnext example" OFF) + option(FF_BUILD_ALEXNET "build alexnet example" OFF) + option(FF_BUILD_DLRM "build DLRM example" OFF) + option(FF_BUILD_XDL "build XDL example" OFF) + option(FF_BUILD_INCEPTION "build inception example" OFF) + option(FF_BUILD_CANDLE_UNO "build candle uno example" OFF) + option(FF_BUILD_TRANSFORMER "build transformer example" OFF) + option(FF_BUILD_MOE "build mixture of experts example" OFF) + option(FF_BUILD_MLP_UNIFY "build mlp unify example" OFF) + option(FF_BUILD_SPLIT_TEST "build split test example" OFF) + option(FF_BUILD_SPLIT_TEST_2 "build split test 2 example" OFF) + option(FF_BUILD_MLP_UNIFY_INFERENCE "build mlp unify inference example" OFF) + option(FF_BUILD_ALL_INFERENCE_EXAMPLES "build all inference examples. Overrides others" OFF) + option(FF_BUILD_ALL_EXAMPLES "build all examples. Overrides others" OFF) + option(FF_BUILD_UNIT_TESTS "build non-operator unit tests" OFF) + option(FF_BUILD_SUBSTITUTION_TOOL "build substitution conversion tool" OFF) + option(FF_BUILD_VISUALIZATION_TOOL "build substitution visualization tool" OFF) + + if(FF_BUILD_UNIT_TESTS) + set(BUILD_GMOCK OFF) + add_subdirectory(deps/googletest) + enable_testing() + add_subdirectory(tests/unit) endif() -if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_TOKENIZER) - if (FF_GPU_BACKEND STREQUAL "hip_rocm") - SET(SPM_USE_BUILTIN_PROTOBUF OFF CACHE BOOL "Use builtin version of protobuf to compile SentencePiece") - endif() - # Ensure Rust is installed - execute_process(COMMAND rustc --version - RESULT_VARIABLE RUST_COMMAND_RESULT - OUTPUT_VARIABLE RUSTC_OUTPUT - ERROR_QUIET) - if(NOT RUST_COMMAND_RESULT EQUAL 0) - message(FATAL_ERROR "Rust is not installed on the system. Please install it by running: 'curl https://sh.rustup.rs -sSf | sh -s -- -y' and following the instructions on the screen.") + if(FF_BUILD_SUBSTITUTION_TOOL) + add_subdirectory(tools/protobuf_to_json) + endif() + + if(FF_BUILD_VISUALIZATION_TOOL) + add_subdirectory(tools/substitutions_to_dot) + endif() + + if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_TOKENIZER) + if (FF_GPU_BACKEND STREQUAL "hip_rocm") + SET(SPM_USE_BUILTIN_PROTOBUF OFF CACHE BOOL "Use builtin version of protobuf to compile SentencePiece") + endif() + # Ensure Rust is installed + execute_process(COMMAND rustc --version + RESULT_VARIABLE RUST_COMMAND_RESULT + OUTPUT_VARIABLE RUSTC_OUTPUT + ERROR_QUIET) + if(NOT RUST_COMMAND_RESULT EQUAL 0) + message(FATAL_ERROR "Rust is not installed on the system. 
Please install it by running: 'curl https://sh.rustup.rs -sSf | sh -s -- -y' and following the instructions on the screen.") + endif() + # Ensure Cargo is installed + execute_process(COMMAND cargo --version + RESULT_VARIABLE CARGO_RESULT + OUTPUT_QUIET ERROR_QUIET) + if(NOT CARGO_RESULT EQUAL 0) + message(FATAL_ERROR "Rust is installed, but cargo is not. Please install it by running: 'curl https://sh.rustup.rs -sSf | sh -s -- -y' and following the instructions on the screen.") + endif() + add_subdirectory(deps/tokenizers-cpp tokenizers EXCLUDE_FROM_ALL) + target_include_directories(flexflow PUBLIC deps/tokenizers-cpp/include) + target_link_libraries(flexflow tokenizers_cpp) endif() - # Ensure Cargo is installed - execute_process(COMMAND cargo --version - RESULT_VARIABLE CARGO_RESULT - OUTPUT_QUIET ERROR_QUIET) - if(NOT CARGO_RESULT EQUAL 0) - message(FATAL_ERROR "Rust is installed, but cargo is not. Please install it by running: 'curl https://sh.rustup.rs -sSf | sh -s -- -y' and following the instructions on the screen.") + if(FF_BUILD_RESNET OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/ResNet) endif() - add_subdirectory(deps/tokenizers-cpp tokenizers EXCLUDE_FROM_ALL) - target_include_directories(flexflow PUBLIC deps/tokenizers-cpp/include) - target_link_libraries(flexflow tokenizers_cpp) -endif() -if(FF_BUILD_RESNET OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/ResNet) -endif() -if(FF_BUILD_RESNEXT OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/resnext50) -endif() + if(FF_BUILD_RESNEXT OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/resnext50) + endif() -if(FF_BUILD_ALEXNET OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/AlexNet) -endif() + if(FF_BUILD_ALEXNET OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/AlexNet) + endif() -if(FF_BUILD_MLP_UNIFY OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/MLP_Unify) -endif() + if(FF_BUILD_MLP_UNIFY OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/MLP_Unify) + endif() -if(FF_BUILD_SPLIT_TEST OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/split_test) -endif() + if(FF_BUILD_SPLIT_TEST OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/split_test) + endif() -if(FF_BUILD_SPLIT_TEST_2 OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/split_test_2) -endif() + if(FF_BUILD_SPLIT_TEST_2 OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/split_test_2) + endif() -if(FF_BUILD_INCEPTION OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/InceptionV3) -endif() + if(FF_BUILD_INCEPTION OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/InceptionV3) + endif() -#TODO: Once functional add to BUILD_ALL_EXAMPLES -if(FF_BUILD_CANDLE_UNO OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/candle_uno) -endif() + #TODO: Once functional add to BUILD_ALL_EXAMPLES + if(FF_BUILD_CANDLE_UNO OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/candle_uno) + endif() -if(FF_BUILD_DLRM OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/DLRM) + if(FF_BUILD_DLRM OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/DLRM) - #add_executable(generate_dlrm_hetero_strategy src/runtime/dlrm_strategy_hetero.cc) - #target_include_directories(generate_dlrm_hetero_strategy PUBLIC ${FLEXFLOW_INCLUDE_DIRS}) + #add_executable(generate_dlrm_hetero_strategy src/runtime/dlrm_strategy_hetero.cc) + #target_include_directories(generate_dlrm_hetero_strategy PUBLIC ${FLEXFLOW_INCLUDE_DIRS}) - #add_executable(generate_dlrm_strategy 
src/runtime/dlrm_strategy.cc) - #target_include_directories(generate_dlrm_strategy PUBLIC ${FLEXFLOW_INCLUDE_DIRS}) -endif() + #add_executable(generate_dlrm_strategy src/runtime/dlrm_strategy.cc) + #target_include_directories(generate_dlrm_strategy PUBLIC ${FLEXFLOW_INCLUDE_DIRS}) + endif() -if(FF_BUILD_XDL OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/XDL) -endif() + if(FF_BUILD_XDL OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/XDL) + endif() -if(FF_BUILD_TRANSFORMER OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/Transformer) -endif() + if(FF_BUILD_TRANSFORMER OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/Transformer) + endif() -if(FF_BUILD_MOE OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(examples/cpp/mixture_of_experts) -endif() + if(FF_BUILD_MOE OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(examples/cpp/mixture_of_experts) + endif() -if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) - add_subdirectory(inference/spec_infer) - add_subdirectory(inference/incr_decoding) -endif() + if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) + add_subdirectory(inference/spec_infer) + add_subdirectory(inference/incr_decoding) + endif() -# installation -set(INCLUDE_DEST "include") -set(LIB_DEST "lib") -install(FILES ${FLEXFLOW_HDR} DESTINATION ${INCLUDE_DEST}) -install(TARGETS flexflow DESTINATION ${LIB_DEST}) -# install python -if (FF_USE_PYTHON) - execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) - if (NOT FF_BUILD_FROM_PYPI) - install( - DIRECTORY ${FLEXFLOW_ROOT}/python/flexflow/ - DESTINATION ${PY_DEST}/flexflow - FILES_MATCHING - PATTERN "*.py") - else() - # pip automatically installs all *.py files in the python/flexflow folder, but because flexflow_cffi_header.py is generated at build time, we have to install it manually. - install( - PROGRAMS ${FLEXFLOW_ROOT}/python/flexflow/core/flexflow_cffi_header.py - DESTINATION ${PY_DEST}/flexflow/core - ) - # Use setup.py script to re-install the Python bindings library with the right library paths. - # Need to put the instructions in a subfolder because of issue below: - # https://stackoverflow.com/questions/43875499/do-post-processing-after-make-install-in-cmake - add_subdirectory(cmake/pip_install) + # installation + set(INCLUDE_DEST "include") + set(LIB_DEST "lib") + install(FILES ${FLEXFLOW_HDR} DESTINATION ${INCLUDE_DEST}) + install(TARGETS flexflow DESTINATION ${LIB_DEST}) + # install python + if (FF_USE_PYTHON) + execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) + if (NOT FF_BUILD_FROM_PYPI) + install( + DIRECTORY ${FLEXFLOW_ROOT}/python/flexflow/ + DESTINATION ${PY_DEST}/flexflow + FILES_MATCHING + PATTERN "*.py") + else() + # pip automatically installs all *.py files in the python/flexflow folder, but because flexflow_cffi_header.py is generated at build time, we have to install it manually. + install( + PROGRAMS ${FLEXFLOW_ROOT}/python/flexflow/core/flexflow_cffi_header.py + DESTINATION ${PY_DEST}/flexflow/core + ) + # Use setup.py script to re-install the Python bindings library with the right library paths. 
+ # Need to put the instructions in a subfolder because of issue below: + # https://stackoverflow.com/questions/43875499/do-post-processing-after-make-install-in-cmake + add_subdirectory(cmake/pip_install) + endif() endif() -endif() +endif() # if(NOT BUILD_LEGION_ONLY) diff --git a/config/config.linux b/config/config.linux index 5f15090a02..37b9bd16fd 100755 --- a/config/config.linux +++ b/config/config.linux @@ -10,7 +10,7 @@ #LD_FLAGS=${LD_FLAGS+=""} #set install dir -#INSTALL_DIR= +INSTALL_DIR=${INSTALL_DIR:-} # set build type BUILD_TYPE=${BUILD_TYPE:-Release} @@ -100,7 +100,7 @@ fi function get_build_configs() { # Create a string with the values of the variables set in this script - BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDNN_DIR=${CUDNN_DIR} CUDA_DIR=${CUDA_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} FF_UCX_URL=${FF_UCX_URL} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND}" + BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDNN_DIR=${CUDNN_DIR} CUDA_DIR=${CUDA_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} FF_UCX_URL=${FF_UCX_URL} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND} INSTALL_DIR=${INSTALL_DIR}" } if [[ -n "$1" && ( "$1" == "CMAKE_FLAGS" || "$1" == "CUDA_PATH" ) ]]; then diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 7c881bf961..c406435327 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -238,11 +238,16 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, DT alpha = 1.0f, beta = 0.0f; assert(m->qSize == m->vSize && m->qSize == m->kSize); cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; +#else + // For best performance, set the default cublas compute type to + // CUBLAS_COMPUTE_16F for half precision and to + // CUBLAS_COMPUTE_32F_FAST_16F for full precision + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + if (m->output_type[0] == DT_FLOAT) { + compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + } #endif // Compute (W^T)x matmul: einsum(ijkl,im->jmkl) // Weights: qSize x qProjSize x 3 x num_q_heads @@ -873,11 +878,16 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta *m, cudaDataType_t cublas_data_type = 
ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; +#else + // For best performance, set the default cublas compute type to + // CUBLAS_COMPUTE_16F for half precision and to + // CUBLAS_COMPUTE_32F_FAST_16F for full precision + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + if (m->output_type[0] == DT_FLOAT) { + compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + } #endif // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_tokens(); diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp index e24f5fe58f..2e7ae68314 100644 --- a/src/ops/kernels/linear_kernels.cpp +++ b/src/ops/kernels/linear_kernels.cpp @@ -274,11 +274,12 @@ void forward_kernel(LinearMeta const *m, hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + hipblasDatatype_t compute_type = output_type; #else - hipblasDatatype_t compute_type = input_type; + // TODO: currently use the output_type + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + hipblasDatatype_t compute_type = output_type; #endif checkCUDA(hipblasGemmEx(m->handle.blas, HIPBLAS_OP_T, @@ -439,11 +440,12 @@ void backward_kernel(LinearMeta const *m, hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + hipblasDatatype_t compute_type = output_type; #else - hipblasDatatype_t compute_type = HIPBLAS_R_32F; + // TODO: currently use output_type + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + hipblasDatatype_t compute_type = output_type; #endif int output_size = out_dim * batch_size; if (m->activation == AC_MODE_RELU) { diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 1897f11148..dad6dc4e00 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -365,11 +365,16 @@ void forward_kernel(LinearMeta const *m, : ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); assert(input_type == weight_type && weight_type == output_type); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + cudaDataType_t compute_type = cublas_data_type; #else - cudaDataType_t compute_type = input_type; + // For best performance, set the default cublas compute type to + // CUBLAS_COMPUTE_16F for half 
precision and to + // CUBLAS_COMPUTE_32F_FAST_16F for full precision + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + if (m->output_type[0] == DT_FLOAT) { + compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + } #endif checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_T, @@ -525,11 +530,16 @@ void backward_kernel(LinearMeta const *m, cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + cudaDataType_t compute_type = cublas_data_type; #else - cudaDataType_t compute_type = CUDA_R_32F; + // For best performance, set the default cublas compute type to + // CUBLAS_COMPUTE_16F for half precision and to + // CUBLAS_COMPUTE_32F_FAST_16F for full precision + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + if (m->output_type[0] == DT_FLOAT) { + compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + } #endif int output_size = out_dim * batch_size; if (m->activation == AC_MODE_RELU) { diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 4338374dca..fb96862b81 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -215,11 +215,16 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; +#else + // For best performance, set the default cublas compute type to + // CUBLAS_COMPUTE_16F for half precision and to + // CUBLAS_COMPUTE_32F_FAST_16F for full precision + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + if (m->output_type[0] == DT_FLOAT) { + compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + } #endif // int num_requests = bc->num_active_requests(); // int tokens_previous_requests = 0; diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 14253e8f61..8c2ee24132 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -158,11 +158,16 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; +#else + // For best performance, set the default cublas compute type to + // CUBLAS_COMPUTE_16F for half precision and to + // CUBLAS_COMPUTE_32F_FAST_16F for full precision + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + if 
(m->output_type[0] == DT_FLOAT) { + compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + } #endif // int num_requests = bc->num_active_requests(); int processed_tokens_in_batch = 0; From 463c75770a64c84c27660d60998b626cb88a4f9a Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 3 Nov 2023 04:40:18 +0000 Subject: [PATCH 060/198] fix bugs, work on align opt-lora --- inference/incr_decoding/incr_decoding.cc | 20 ++++++++++---------- inference/models/opt.cc | 6 ++++++ src/ops/lora_linear.cc | 6 ++++-- src/runtime/batch_config.cc | 3 ++- src/runtime/beam_search_batch_config.cc | 3 ++- src/runtime/request_manager.cc | 6 ++++-- src/runtime/tree_verify_batch_config.cc | 3 ++- 7 files changed, 30 insertions(+), 17 deletions(-) diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 0017fe3fcb..90d1902716 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -284,22 +284,22 @@ void FlexFlow::top_level_task(Task const *task, /*allow_exceptions */ true, /*ignore_comments */ true); std::vector prompts; - std::vector> dataset; + // std::vector> dataset; for (auto &prompt : prompt_json) { std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); total_num_requests++; prompts.push_back(text); - dataset.push_back(std::make_pair(text, text)); + // dataset.push_back(std::make_pair(text, text)); } - rm->register_new_peft_request( - dataset, 256 /*max_sequence_length*/, peft_model_id); - for (auto &prompt : prompts) { - GenerationResult result = - model.generate(prompt, 128 /*max_sequence_length*/); - } - // GenerationResult result = - // model.generate(prompts, 128 /*max_sequence_length*/); + // rm->register_new_peft_request(dataset, 256 /*max_sequence_length*/, + // peft_model_id); + // for (auto &prompt : prompts) { + // GenerationResult result = model.generate(prompt, 128 + // /*max_sequence_length*/); + // } + GenerationResult result = + model.generate(prompts, 128 /*max_sequence_length*/, peft_model_id); } // Execution fence diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 9b29ae5410..9069aef9e1 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -214,6 +214,12 @@ void OPT::create_opt_model(FFModel &ff, REG_MODE_NONE, 0.0f, std::string("layers_" + std::to_string(i) + "_fc2").c_str()); + // Low-Rank Adapter (LoRA) for the second linear layer + ff.lora_linear( + activation, + fc2, + OP_LORA_MLP_SECOND, + std::string("layers_" + std::to_string(i) + "_fc2_lora").c_str()); } // final diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 3d2d8d6106..3515a879c9 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -269,6 +269,8 @@ void load_peft_from_file(DT *ptr, size_t size, int shard_id, std::string filepath) { + std::cout << "Loading LORA weight " << filepath << ", size: " << size + << ", shard: " << shard_id << std::endl; std::ifstream in(filepath, std::ios::in | std::ios::binary); if (!in.good()) { printf("Could not open file: %s\n", filepath.c_str()); @@ -443,10 +445,10 @@ void LoraLinear::inference_task(Task const *task, Runtime *runtime) { LoraLinearMeta *m = *((LoraLinearMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - if (bc->num_active_peft_tokens() == 0) { + if (bc->num_active_tokens() == 0) { return; } - assert(regions.size() == 4); + assert(regions.size() == 2); assert(task->regions.size() == regions.size()); assert(m->input_type[0] == 
m->output_type[0]); diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index f5d69d1992..22ab420674 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -124,7 +124,8 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) { << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; // PEFT values - os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id << std::endl; + os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id + << std::endl; os << " PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; os << " Max sequence length: " << bc.requestsInfo[i].max_sequence_length << std::endl; diff --git a/src/runtime/beam_search_batch_config.cc b/src/runtime/beam_search_batch_config.cc index bfcf30454c..cab8528994 100644 --- a/src/runtime/beam_search_batch_config.cc +++ b/src/runtime/beam_search_batch_config.cc @@ -134,7 +134,8 @@ std::ostream &operator<<(std::ostream &os, BeamSearchBatchConfig const &bc) { << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; // PEFT values - os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id << std::endl; + os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id + << std::endl; os << " PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; os << " Max sequence length: " << bc.requestsInfo[i].max_sequence_length << std::endl; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 4128fee220..0b89010ab1 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -187,7 +187,6 @@ RequestManager::RequestGuid request.guid = next_available_guid++; request.max_sequence_length = max_sequence_length; request.peft_model_id = peft_model_id; - if (prompt.size() >= get_max_sequence_length()) { std::cout << "Warning: too many tokens in prompt, only load up to " << get_max_sequence_length() << " tokens, but got " @@ -547,6 +546,9 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; + new_bc.requestsInfo[i].peft_model_id = + old_bc.requestsInfo[i].peft_model_id; + new_bc.requestsInfo[i].peft_bwd = old_bc.requestsInfo[i].peft_bwd; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; if (new_bc.requestsInfo[i].first_token_depth_in_request + 1 == @@ -2093,7 +2095,7 @@ GenerationResult RequestManager::generate_incr_decoding( BatchConfigFuture bcf = prepare_next_batch(next_batch.first, next_batch.second); FutureMap fm = im->inference(llm, 0, bcf); - im->peft_bwd(llm, 0, bcf); + // im->peft_bwd(llm, 0, bcf); assert(fm.get_future_map_domain().get_volume() == 1); InferenceResultFuture irf = fm.get_future(0); batch_pipeline.push(std::make_pair(bcf, irf)); diff --git a/src/runtime/tree_verify_batch_config.cc b/src/runtime/tree_verify_batch_config.cc index f87500db74..5702bb0a56 100644 --- a/src/runtime/tree_verify_batch_config.cc +++ b/src/runtime/tree_verify_batch_config.cc @@ -55,7 +55,8 @@ std::ostream &operator<<(std::ostream &os, TreeVerifyBatchConfig const &bc) { << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; // PEFT values - os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id << std::endl; + os << " PEFT Model ID: " << 
bc.requestsInfo[i].peft_model_id + << std::endl; os << " PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; os << " Max sequence length: " << bc.requestsInfo[i].max_sequence_length << std::endl; From 7c65521e78c4106f80f4632a885d581efee3c8d5 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 6 Nov 2023 14:58:20 +0000 Subject: [PATCH 061/198] update scripts --- tests/peft/fine_tune.sh | 4 +-- tests/peft/hf_finetune.py | 2 +- tests/peft/hf_serve.py | 55 +++++++++++++++++++++++++-------------- 3 files changed, 39 insertions(+), 22 deletions(-) diff --git a/tests/peft/fine_tune.sh b/tests/peft/fine_tune.sh index dbcdb849fa..eddb6139d0 100755 --- a/tests/peft/fine_tune.sh +++ b/tests/peft/fine_tune.sh @@ -7,8 +7,8 @@ cd "${BASH_SOURCE[0]%/*}" python hf_finetune.py --model-name decapoda-research/llama-7b-hf --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-7b-lora-full python hf_finetune.py --model-name decapoda-research/llama-7b-hf --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-7b-lora-half -python hf_finetune.py --model-name JackFram/llama-160m-base --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-160m-lora-full -python hf_finetune.py --model-name JackFram/llama-160m-base --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-160m-lora-half +python hf_finetune.py --model-name JackFram/llama-160m --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-160m-lora-full +python hf_finetune.py --model-name JackFram/llama-160m --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-160m-lora-half python hf_finetune.py --model-name meta-llama/Llama-2-7b-hf --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-2-7b-lora-full python hf_finetune.py --model-name meta-llama/Llama-2-7b-hf --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-2-7b-lora-half diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py index d702d23038..cf157a8913 100644 --- a/tests/peft/hf_finetune.py +++ b/tests/peft/hf_finetune.py @@ -29,7 +29,7 @@ def print_trainable_parameters(model): def main(): parser = argparse.ArgumentParser() - parser.add_argument("--model-name", type=str, default="decapoda-research/llama-7b-hf") + parser.add_argument("--model-name", type=str, default="meta-llama/Llama-2-7b-hf") parser.add_argument("--lora-rank", type=int, default=16) parser.add_argument("--lora-alpha", type=int, default=32) parser.add_argument("--lora-target-modules", type=str, default="down_proj", help="Comma-separated list of layers from the base model to target") diff --git a/tests/peft/hf_serve.py b/tests/peft/hf_serve.py index 6f3753906f..efade301da 100644 --- a/tests/peft/hf_serve.py +++ b/tests/peft/hf_serve.py @@ -2,51 +2,68 @@ import torch import os, sys from peft import PeftModel, PeftConfig -from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, LlamaTokenizer +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + AutoConfig, + LlamaTokenizer, + GenerationConfig, +) + def main(): parser = argparse.ArgumentParser() parser.add_argument("--peft-model-id", type=str, default="./finetuned-llama") - parser.add_argument("--use-full-precision", action="store_true", help="Use full precision") + parser.add_argument( + "--use-full-precision", action="store_true", help="Use full precision" + ) parser.add_argument("--max-new-tokens", type=int, default=50) + 
parser.add_argument("--do-sample", action="store_true", help="Use sampling") args = parser.parse_args() peft_model_id = args.peft_model_id - #peft_model_id = "goliaro/llama-7b-lora-half" - use_full_precision=args.use_full_precision + use_full_precision = args.use_full_precision max_new_tokens = args.max_new_tokens # Change working dir to folder storing this script abspath = os.path.abspath(__file__) dname = os.path.dirname(abspath) os.chdir(dname) - + config = PeftConfig.from_pretrained(peft_model_id) model = AutoModelForCausalLM.from_pretrained( - config.base_model_name_or_path, - return_dict=True, - #load_in_8bit=True, - torch_dtype = torch.float32 if use_full_precision else torch.float16, - device_map='auto', + config.base_model_name_or_path, + return_dict=True, + # load_in_8bit=True, + torch_dtype=torch.float32 if use_full_precision else torch.float16, + device_map="auto", + ) + hf_config = AutoConfig.from_pretrained( + config.base_model_name_or_path, trust_remote_code=True ) - hf_config = AutoConfig.from_pretrained(config.base_model_name_or_path, trust_remote_code=True) hf_arch = getattr(hf_config, "architectures")[0] if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM": tokenizer = LlamaTokenizer.from_pretrained( - config.base_model_name_or_path, use_fast=True, - torch_dtype = torch.float32 if use_full_precision else torch.float16, + config.base_model_name_or_path, + use_fast=True, + torch_dtype=torch.float32 if use_full_precision else torch.float16, ) else: tokenizer = AutoTokenizer.from_pretrained( - config.base_model_name_or_path, - torch_dtype = torch.float32 if use_full_precision else torch.float16, + config.base_model_name_or_path, + torch_dtype=torch.float32 if use_full_precision else torch.float16, ) - + # Generation config + generation_config = GenerationConfig.from_pretrained(config.base_model_name_or_path) + generation_config.do_sample = args.do_sample # Load the Lora model model = PeftModel.from_pretrained(model, peft_model_id) - batch = tokenizer("Two things are infinite: ", return_tensors='pt') + batch = tokenizer("Two things are infinite: ", return_tensors="pt") with torch.cuda.amp.autocast(): - output_tokens = model.generate(**batch, max_new_tokens=max_new_tokens) - print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True)) + output_tokens = model.generate( + **batch, max_new_tokens=max_new_tokens, generation_config=generation_config + ) + print("\n\n", tokenizer.decode(output_tokens[0], skip_special_tokens=False)) + if __name__ == "__main__": main() From f4b3f8f56efac7476c7703b4be9b70b5d5bc9857 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 6 Nov 2023 20:30:41 +0000 Subject: [PATCH 062/198] add code to output peft tensors in hf --- .gitignore | 1 + tests/peft/hf_serve.py | 37 ++++++++++++++++++++++++++++++++++++- 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index a032f80f77..0579eb5a74 100644 --- a/.gitignore +++ b/.gitignore @@ -187,5 +187,6 @@ gpt_tokenizer python/flexflow/version.txt inference_tensors +hf_peft_tensors Untitled-1.ipynb \ No newline at end of file diff --git a/tests/peft/hf_serve.py b/tests/peft/hf_serve.py index efade301da..29baf5842b 100644 --- a/tests/peft/hf_serve.py +++ b/tests/peft/hf_serve.py @@ -1,6 +1,6 @@ import argparse import torch -import os, sys +import os, sys, shutil from peft import PeftModel, PeftConfig from transformers import ( AutoModelForCausalLM, @@ -10,6 +10,18 @@ GenerationConfig, ) +def peft_pre_forward_hook(module, input): + 
print("Pre-forward hook activated on module: ", module.name) + #print("Pre-Input: ", input) + torch.save(input, f"./hf_peft_tensors/{module.name}.input") + print("===") + +def peft_post_forward_hook(module, input, output): + print("Post-forward Hook activated for module: ", module.name) + #print("Post-Output: ", output) + torch.save(input, f"./hf_peft_tensors/{module.name}.output") + print("===") + def main(): parser = argparse.ArgumentParser() @@ -19,10 +31,12 @@ def main(): ) parser.add_argument("--max-new-tokens", type=int, default=50) parser.add_argument("--do-sample", action="store_true", help="Use sampling") + parser.add_argument("--save-peft-tensors", action="store_true", help="Save PEFT hidden states and weights to file") args = parser.parse_args() peft_model_id = args.peft_model_id use_full_precision = args.use_full_precision max_new_tokens = args.max_new_tokens + save_peft_tensors = args.save_peft_tensors # Change working dir to folder storing this script abspath = os.path.abspath(__file__) @@ -57,6 +71,27 @@ def main(): generation_config.do_sample = args.do_sample # Load the Lora model model = PeftModel.from_pretrained(model, peft_model_id) + + # Register hooks to save tensors, if needed + if save_peft_tensors: + shutil.rmtree("./hf_peft_tensors") + # Check that the output folder exists + os.makedirs("./hf_peft_tensors", exist_ok=True) + # Save weights + for name, params in model.named_parameters(): + if "lora" in name: + print(params, type(params)) + torch.save(params, f"./hf_peft_tensors/{name}") + #params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") + # Save hidden states + for name, layer in dict(model.named_modules()).items(): + if "lora_A.default" in name or "lora_B.default" in name: + layer.name = name + print(f"Adding hooks to layer {layer.name}") + layer.register_forward_pre_hook(peft_pre_forward_hook) + layer.register_forward_hook(peft_post_forward_hook) + + batch = tokenizer("Two things are infinite: ", return_tensors="pt") with torch.cuda.amp.autocast(): output_tokens = model.generate( From 9e5fea995d14c8b0c599cf21753ce534594021fa Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 7 Nov 2023 04:17:55 +0000 Subject: [PATCH 063/198] update, fixes --- .../ops/kernels/lora_linear_kernels.h | 2 +- include/flexflow/utils/cuda_helper.h | 7 ++- include/flexflow/utils/hip_helper.h | 7 ++- inference/incr_decoding/incr_decoding.cc | 1 - src/ops/arg_topk.cc | 2 +- src/ops/argmax.cc | 8 ++-- src/ops/beam_topk.cc | 6 +-- src/ops/experts.cu | 16 +++---- src/ops/kernels/lora_linear_kernels.cu | 16 +++++-- src/ops/lora_linear.cc | 47 ++++++++++++++++++- src/ops/sampling.cc | 2 +- src/runtime/cuda_helper.cu | 44 +++++++++++------ src/runtime/hip_helper.cpp | 45 ++++++++++-------- tests/peft/hf_serve.py | 15 ++++-- 14 files changed, 151 insertions(+), 67 deletions(-) diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h index 32a6832e2e..cf03e518fa 100644 --- a/include/flexflow/ops/kernels/lora_linear_kernels.h +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -12,7 +12,7 @@ namespace FlexFlow { struct LoraLinearWeight { void *w0_ptr, *w1_ptr, *w0_grad_ptr, *w1_grad_ptr; void *w0_state_ptr, *w1_state_ptr; - int rank; + int in_dim, out_dim, rank; }; class LoraLinearMeta : public OpMeta { diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index f8bf67b3e1..983c20525e 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ 
-156,10 +156,13 @@ template void save_tensor(T const *ptr, size_t num_elements, char const *file_name); template -T *download_tensor(T const *ptr, size_t num_elements); +T *copy_tensor_dev_to_host(T const *ptr, size_t num_elements); template -bool download_tensor(T const *ptr, T *dst, size_t num_elements); +void copy_tensor_dev_to_host(T const *ptr, T *dst, size_t num_elements); + +template +void copy_tensor_host_to_dev(T *dst, T const *src, size_t num_elements); cudnnStatus_t cudnnSetTensorDescriptorFromDomain(cudnnTensorDescriptor_t tensor, Legion::Domain domain, diff --git a/include/flexflow/utils/hip_helper.h b/include/flexflow/utils/hip_helper.h index 5d3c831d4f..b18567e1e7 100644 --- a/include/flexflow/utils/hip_helper.h +++ b/include/flexflow/utils/hip_helper.h @@ -141,10 +141,13 @@ template void save_tensor(T const *ptr, size_t num_elements, char const *file_name); template -T *download_tensor(T const *ptr, size_t num_elements); +T *copy_tensor_dev_to_host(T const *ptr, size_t num_elements); template -bool download_tensor(T const *ptr, T *dst, size_t num_elements); +void copy_tensor_dev_to_host(T const *ptr, T *dst, size_t num_elements); + +template +void copy_tensor_host_to_dev(T *dst, T const *src, size_t num_elements); miopenStatus_t cudnnSetTensorDescriptorFromDomain(miopenTensorDescriptor_t tensor, diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 7c4cef0973..1921e05323 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -158,7 +158,6 @@ void FlexFlow::top_level_task(Task const *task, max_requests_per_batch, max_tokens_per_batch, max_sequence_length); - assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * ffconfig.pipeline_parallelism_degree == ffconfig.numNodes * ffconfig.workersPerNode); diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc index 19b9bff1f6..b937b35b73 100644 --- a/src/ops/arg_topk.cc +++ b/src/ops/arg_topk.cc @@ -329,7 +329,7 @@ InferenceResult } InferenceResult ir; - download_tensor( + copy_tensor_dev_to_host( indices.get_int32_ptr(), ir.token_ids, batch_size); return ir; } diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index daefaf3b98..e094abbf13 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -354,10 +354,10 @@ BeamInferenceResult ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size); BeamInferenceResult ir; - download_tensor( + copy_tensor_dev_to_host( indices.get_int32_ptr(), ir.token_ids, batch_size); - download_tensor(m->probs, ir.probs, batch_size); - download_tensor(parent.get_int32_ptr(), ir.parent_id, batch_size); + copy_tensor_dev_to_host(m->probs, ir.probs, batch_size); + copy_tensor_dev_to_host(parent.get_int32_ptr(), ir.parent_id, batch_size); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); @@ -398,7 +398,7 @@ InferenceResult ArgMax::save_inference_tensors_to_file( m, shard_id, bc, {}, {}, {input, indices}); } - download_tensor( + copy_tensor_dev_to_host( indices.get_int32_ptr(), ir.token_ids, batch_size); return ir; } diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc index 109937ee0b..d3166af392 100644 --- a/src/ops/beam_topk.cc +++ b/src/ops/beam_topk.cc @@ -393,9 +393,9 @@ BeamInferenceResult BeamInferenceResult ir; - download_tensor(index_ptr, ir.token_ids, batch_size * m->max_beam_width); - download_tensor(value_ptr, ir.probs, batch_size * m->max_beam_width); - download_tensor( + copy_tensor_dev_to_host(index_ptr, ir.token_ids, batch_size * 
m->max_beam_width); + copy_tensor_dev_to_host(value_ptr, ir.probs, batch_size * m->max_beam_width); + copy_tensor_dev_to_host( parent_ptr, ir.parent_id, batch_size * m->max_beam_width); if (m->inference_debugging) { diff --git a/src/ops/experts.cu b/src/ops/experts.cu index 6f0bd8afbb..614d755a35 100644 --- a/src/ops/experts.cu +++ b/src/ops/experts.cu @@ -579,14 +579,14 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, #ifdef INFERENCE_TESTS // Checking // 1. check that m->sorted_indices contains indices sorted - int *indices_cpu = download_tensor(indices, num_indices); + int *indices_cpu = copy_tensor_dev_to_host(indices, num_indices); // assert(indices_cpu != nullptr); std::vector indices_vec(indices_cpu, indices_cpu + num_indices); std::vector indices_vec_sorted(indices_vec.size()); std::copy(indices_vec.begin(), indices_vec.end(), indices_vec_sorted.begin()); std::stable_sort(indices_vec_sorted.begin(), indices_vec_sorted.end()); - int *thrust_sorted_indices_cpu = download_tensor( + int *thrust_sorted_indices_cpu = copy_tensor_dev_to_host( m->sorted_indices, m->num_chosen_experts * m->effective_batch_size); // assert(thrust_sorted_indices_cpu != nullptr); std::vector thrust_sorted_indices_vec( @@ -613,7 +613,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, assert(indices_vec_sorted[i] == thrust_sorted_indices_vec[i]); } // 2. check that indices[m->original_indices[i]] = i - int *thrust_original_indices_cpu = download_tensor( + int *thrust_original_indices_cpu = copy_tensor_dev_to_host( m->original_indices, m->num_chosen_experts * m->effective_batch_size); // assert(thrust_original_indices_cpu != nullptr); std::vector thrust_original_indices_vec( @@ -669,7 +669,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, assert(non_zero_experts_count == non_zero_experts_check.size()); // 7. check exp_local_label_to_index int *non_zero_expert_labels_cpu = - download_tensor(m->non_zero_expert_labels, non_zero_experts_count); + copy_tensor_dev_to_host(m->non_zero_expert_labels, non_zero_experts_count); // assert(non_zero_expert_labels_cpu != nullptr); std::vector non_zero_expert_labels_vec(non_zero_expert_labels_cpu, non_zero_expert_labels_cpu + @@ -685,7 +685,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, assert(non_zero_expert_labels_vec == non_zero_experts_check_vec); int *exp_local_label_to_index = - download_tensor(m->exp_local_label_to_index, non_zero_experts_count); + copy_tensor_dev_to_host(m->exp_local_label_to_index, non_zero_experts_count); // assert(exp_local_label_to_index != nullptr); std::vector exp_local_label_to_index_vec(exp_local_label_to_index, exp_local_label_to_index + @@ -700,7 +700,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, // 8. 
Check expert_start_indexes int *expert_start_indices_thrust = - download_tensor(m->expert_start_indexes, non_zero_experts_count + 1); + copy_tensor_dev_to_host(m->expert_start_indexes, non_zero_experts_count + 1); // assert(expert_start_indices_thrust != nullptr); std::vector expert_start_indices_thrust_vec( expert_start_indices_thrust, @@ -746,7 +746,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, int *num_assignments_per_expert_thrust = (int *)calloc(non_zero_experts_count, sizeof(int)); assert(num_assignments_per_expert_thrust != nullptr); - assert(download_tensor(m->num_assignments_per_expert, + assert(copy_tensor_dev_to_host(m->num_assignments_per_expert, num_assignments_per_expert_thrust, non_zero_experts_count)); assert(num_assignments_per_expert_thrust != nullptr); @@ -759,7 +759,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, int *destination_start_indices_thrust = (int *)calloc(non_zero_experts_count, sizeof(int)); assert(destination_start_indices_thrust != nullptr); - assert(download_tensor(m->destination_start_indices, + assert(copy_tensor_dev_to_host(m->destination_start_indices, destination_start_indices_thrust, non_zero_experts_count)); assert(destination_start_indices_thrust != nullptr); diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 50b6884a5b..2d271efe72 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -148,8 +148,13 @@ void inference_kernel(LoraLinearMeta *m, #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = output_type; #else - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + // For best performance, set the default cublas compute type to + // CUBLAS_COMPUTE_16F for half precision and to + // CUBLAS_COMPUTE_32F_FAST_16F for full precision cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + if (m->input_type[0] == DT_FLOAT) { + compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + } #endif int num_peft_requests = 0; for (int i = 0; i < bc->max_requests_per_batch(); i++) { @@ -263,10 +268,15 @@ void peft_bwd_kernel(LoraLinearMeta *m, cudaDataType_t weight_type = output_type; cudaDataType_t lr_actv_type = output_type; #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cudaDataType_t compute_type = cublas_data_type; + cudaDataType_t compute_type = output_type; #else - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + // For best performance, set the default cublas compute type to + // CUBLAS_COMPUTE_16F for half precision and to + // CUBLAS_COMPUTE_32F_FAST_16F for full precision cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + if (m->output_type[0] == DT_FLOAT) { + compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + } #endif for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 3515a879c9..47d793446d 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -6,6 +6,8 @@ #include "flexflow/utils/hash_utils.h" #include "flexflow/utils/peft_weight_allocator.h" #include "legion/legion_utilities.h" +#include +#include #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include "flexflow/utils/cuda_helper.h" #else @@ -215,6 +217,7 @@ OpMeta *LoraLinear::init_task(Task const *task, LoraLinearMeta *m = new LoraLinearMeta(handle, lora); m->trainable_inputs[0] = lora->trainable_inputs[0]; std::strcpy(m->op_name, lora->name); + m->layer_guid = 
lora->layer_guid; return m; } @@ -290,7 +293,7 @@ void load_peft_from_file(DT *ptr, assert(false); } assert(size == host_array.size()); - copy_kernel(ptr, host_array.data(), target_data_size); + copy_tensor_host_to_dev(ptr, host_array.data(), size); in.close(); } @@ -321,6 +324,9 @@ void LoraLinear::register_model_task(Task const *task, assert(m->model_weights.find(info->model_id) == m->model_weights.end()); LoraLinearWeight weight; + weight.in_dim = in_dim; + weight.out_dim = out_dim; + weight.rank = rank; PEFTWeightAllocator *allocator = m->handle.peft_weight_allocator; weight.w0_ptr = allocator->allocate_local_weights_untyped( info->model_id, w0_num_elements * data_type_size(dt)); @@ -367,7 +373,6 @@ void LoraLinear::register_model_task(Task const *task, assert(false && "Data type not supported"); } - weight.rank = rank; if (lora->inputs[0]->dims[num_dims - 1].degree == 1) { // Input is partitioned (no replication) // w0_grad is local weight gradients @@ -462,6 +467,44 @@ void LoraLinear::inference_task(Task const *task, // int num_infr_tokens = bc->num_active_infr_tokens(); // int num_peft_tokens = bc->num_active_peft_tokens(); inference_kernel_wrapper(m, bc, input, output); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + + // Check if output directory exists, and create it if it does not + char const *folder_path = "./inference_tensors"; + struct stat st = {0}; + if (stat(folder_path, &st) == -1) { + // Directory does not exist, create it + mkdir(folder_path, 0700); + } + // output base filepath, shared by all tensors from the same operator + std::string base_filepath = + "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + + "_decoding-step_" + std::to_string(m->decoding_step) + "_layer-num_" + + std::to_string(m->layer_guid.transformer_layer_id) + "_layer-name_" + + m->op_name + "_shard-id_" + std::to_string(shard_id); + std::cout << "base_filepath: " << base_filepath << std::endl; + std::cout << "m->decoding_step: " << m->decoding_step << std::endl; + if (m->decoding_step == 0) { + for (auto it = m->model_weights.begin(); it != m->model_weights.end(); ++it) { + PEFTModelID peft_model_id = it->first; + LoraLinearWeight weight = m->model_weights[peft_model_id]; + std::string filenameA = base_filepath + "_weight_A"; + std::string filenameB = base_filepath + "_weight_B"; + if (m->input_type[0] == DT_FLOAT) { + save_tensor((float*)weight.w0_ptr, weight.rank * weight.in_dim, filenameA.c_str()); + save_tensor((float*)weight.w1_ptr, weight.rank * weight.out_dim, filenameB.c_str()); + } else if (m->input_type[0] == DT_HALF) { + save_tensor((half*)weight.w0_ptr, weight.rank * weight.in_dim, filenameA.c_str()); + save_tensor((half*)weight.w1_ptr, weight.rank * weight.out_dim, filenameB.c_str()); + } else { + assert(false && "Data type not supported"); + } + } + } + LoraLinear::save_inference_tensors_to_file(m, shard_id, bc, {input}, {}, {output}); + } } FutureMap LoraLinear::peft_bwd(FFModel const &ff, diff --git a/src/ops/sampling.cc b/src/ops/sampling.cc index e98c7f0ec3..4cec9a50b7 100644 --- a/src/ops/sampling.cc +++ b/src/ops/sampling.cc @@ -313,7 +313,7 @@ InferenceResult } InferenceResult ir; - download_tensor( + copy_tensor_dev_to_host( indices.get_int32_ptr(), ir.token_ids, batch_size); return ir; } diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index fa6bf55fe5..74575ea6ba 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -375,7 +375,7 @@ 
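The hunks above rename download_tensor into copy_tensor_dev_to_host and introduce a host-to-device counterpart. A minimal caller-side sketch of the two helpers follows (illustrative only, not part of the patch; the buffer names are hypothetical and the declarations are assumed to come from flexflow/utils/cuda_helper.h):

#include "flexflow/utils/cuda_helper.h"
#include <cstdio>
#include <vector>

void dump_first_value(float const *d_buf, size_t n) {
  std::vector<float> h_buf(n);
  // Device -> host: the helper enqueues an async copy on the Legion stream
  // and no longer returns a bool, so synchronize before reading the host data.
  copy_tensor_dev_to_host<float>(d_buf, h_buf.data(), n);
  checkCUDA(cudaDeviceSynchronize());
  printf("value[0] = %.9f\n", h_buf[0]);
}

void upload_weights(float *d_buf, float const *h_buf, size_t n) {
  // Host -> device: the new direction added by this series
  // (used, e.g., by load_peft_from_file above).
  copy_tensor_host_to_dev<float>(d_buf, h_buf, n);
}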
__host__ void save_tensor(int64_t const *ptr, } template -__host__ T *download_tensor(T const *ptr, size_t num_elements) { +__host__ T *copy_tensor_dev_to_host(T const *ptr, size_t num_elements) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); T *host_ptr; @@ -388,14 +388,23 @@ __host__ T *download_tensor(T const *ptr, size_t num_elements) { } template -__host__ bool download_tensor(T const *ptr, T *dst, size_t num_elements) { +__host__ void copy_tensor_dev_to_host(T const *ptr, T *dst, size_t num_elements) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(dst != nullptr); checkCUDA(cudaMemcpyAsync( dst, ptr, sizeof(T) * num_elements, cudaMemcpyDeviceToHost, stream)); - return true; } + +template +__host__ void copy_tensor_host_to_dev(T *dst, T const *src, size_t num_elements) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(src != nullptr); + checkCUDA(cudaMemcpyAsync( + dst, src, sizeof(T) * num_elements, cudaMemcpyHostToDevice, stream)); +} + cudnnStatus_t cudnnSetTensorDescriptorFromDomain4SoftMax( cudnnTensorDescriptor_t tensor, Domain domain, DataType data_type) { int dims[MAX_TENSOR_DIM]; @@ -700,26 +709,31 @@ template __host__ void save_tensor(int64_t const *ptr, template __host__ void save_tensor(half const *ptr, size_t rect, char const *file_name); -template __host__ float *download_tensor(float const *ptr, +template __host__ float *copy_tensor_dev_to_host(float const *ptr, size_t num_elements); -template __host__ half *download_tensor(half const *ptr, +template __host__ half *copy_tensor_dev_to_host(half const *ptr, size_t num_elements); -template __host__ double *download_tensor(double const *ptr, +template __host__ double *copy_tensor_dev_to_host(double const *ptr, size_t num_elements); -template __host__ int32_t *download_tensor(int32_t const *ptr, +template __host__ int32_t *copy_tensor_dev_to_host(int32_t const *ptr, size_t num_elements); -template __host__ int64_t *download_tensor(int64_t const *ptr, +template __host__ int64_t *copy_tensor_dev_to_host(int64_t const *ptr, size_t num_elements); -template __host__ bool - download_tensor(float const *ptr, float *dst, size_t num_elements); -template __host__ bool - download_tensor(half const *ptr, half *dst, size_t num_elements); -template __host__ bool download_tensor(double const *ptr, +template __host__ void + copy_tensor_dev_to_host(float const *ptr, float *dst, size_t num_elements); +template __host__ void + copy_tensor_dev_to_host(half const *ptr, half *dst, size_t num_elements); +template __host__ void copy_tensor_dev_to_host(double const *ptr, double *dst, size_t num_elements); -template __host__ bool download_tensor(int32_t const *ptr, +template __host__ void copy_tensor_dev_to_host(int32_t const *ptr, int32_t *dst, size_t num_elements); -template __host__ bool download_tensor(int64_t const *ptr, +template __host__ void copy_tensor_dev_to_host(int64_t const *ptr, int64_t *dst, size_t num_elements); +template __host__ void copy_tensor_host_to_dev(float *dst, float const *src, size_t num_elements); +template __host__ void copy_tensor_host_to_dev(half *dst, half const *src, size_t num_elements); +template __host__ void copy_tensor_host_to_dev(double *dst, double const *src, size_t num_elements); +template __host__ void copy_tensor_host_to_dev(int32_t *dst, int32_t const *src, size_t num_elements); +template __host__ void copy_tensor_host_to_dev(int64_t *dst, int64_t const *src, size_t num_elements); diff --git a/src/runtime/hip_helper.cpp 
b/src/runtime/hip_helper.cpp index fb94135c8f..ac0e7c157f 100644 --- a/src/runtime/hip_helper.cpp +++ b/src/runtime/hip_helper.cpp @@ -354,9 +354,7 @@ __host__ void save_tensor(int64_t const *ptr, } template -__host__ T *download_tensor(T const *ptr, size_t num_elements) { - // device synchronize to make sure the data are ready - // checkCUDA(hipDeviceSynchronize()); +__host__ T *copy_tensor_dev_to_host(T const *ptr, size_t num_elements) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); T *host_ptr; @@ -365,21 +363,25 @@ __host__ T *download_tensor(T const *ptr, size_t num_elements) { hipHostMallocPortable | hipHostMallocMapped)); checkCUDA(hipMemcpyAsync( host_ptr, ptr, sizeof(T) * num_elements, hipMemcpyDeviceToHost, stream)); - // checkCUDA(hipDeviceSynchronize()); return host_ptr; } template -__host__ bool download_tensor(T const *ptr, T *dst, size_t num_elements) { - // device synchronize to make sure the data are ready - // checkCUDA(hipDeviceSynchronize()); +__host__ void copy_tensor_dev_to_host(T const *ptr, T *dst, size_t num_elements) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(dst != nullptr); checkCUDA(hipMemcpyAsync( dst, ptr, sizeof(T) * num_elements, hipMemcpyDeviceToHost, stream)); - // checkCUDA(hipDeviceSynchronize()); - return true; +} + +template +__host__ void copy_tensor_host_to_dev(T *dst, T const *src, size_t num_elements) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(src != nullptr); + checkCUDA(hipMemcpyAsync( + dst, src, sizeof(T) * num_elements, hipMemcpyHostToDevice, stream)); } miopenStatus_t cudnnSetTensorDescriptorFromDomain( @@ -610,24 +612,29 @@ template __host__ void save_tensor(int64_t const *ptr, template __host__ void save_tensor(half const *ptr, size_t rect, char const *file_name); -template __host__ float *download_tensor(float const *ptr, +template __host__ float *copy_tensor_dev_to_host(float const *ptr, size_t num_elements); -template __host__ half *download_tensor(half const *ptr, +template __host__ half *copy_tensor_dev_to_host(half const *ptr, size_t num_elements); -template __host__ double *download_tensor(double const *ptr, +template __host__ double *copy_tensor_dev_to_host(double const *ptr, size_t num_elements); -template __host__ int32_t *download_tensor(int32_t const *ptr, +template __host__ int32_t *copy_tensor_dev_to_host(int32_t const *ptr, size_t num_elements); -template __host__ int64_t *download_tensor(int64_t const *ptr, +template __host__ int64_t *copy_tensor_dev_to_host(int64_t const *ptr, size_t num_elements); -template __host__ bool - download_tensor(float const *ptr, float *dst, size_t num_elements); -template __host__ bool download_tensor(double const *ptr, +template __host__ void + copy_tensor_dev_to_host(float const *ptr, float *dst, size_t num_elements); +template __host__ void copy_tensor_dev_to_host(double const *ptr, double *dst, size_t num_elements); -template __host__ bool download_tensor(int32_t const *ptr, +template __host__ void copy_tensor_dev_to_host(int32_t const *ptr, int32_t *dst, size_t num_elements); -template __host__ bool download_tensor(int64_t const *ptr, +template __host__ void copy_tensor_dev_to_host(int64_t const *ptr, int64_t *dst, size_t num_elements); +template __host__ void copy_tensor_host_to_dev(float *dst, float const *src, size_t num_elements); +template __host__ void copy_tensor_host_to_dev(half *dst, half const *src, size_t num_elements); +template __host__ void copy_tensor_host_to_dev(double *dst, double const *src, size_t 
num_elements); +template __host__ void copy_tensor_host_to_dev(int32_t *dst, int32_t const *src, size_t num_elements); +template __host__ void copy_tensor_host_to_dev(int64_t *dst, int64_t const *src, size_t num_elements); \ No newline at end of file diff --git a/tests/peft/hf_serve.py b/tests/peft/hf_serve.py index 29baf5842b..0af515e6a9 100644 --- a/tests/peft/hf_serve.py +++ b/tests/peft/hf_serve.py @@ -11,16 +11,21 @@ ) def peft_pre_forward_hook(module, input): - print("Pre-forward hook activated on module: ", module.name) + assert(module.name is not None and module.decoding_step is not None) + name = module.name.replace("base_model.model.model", "") + print(f"Pre-forward hook activated on module: {name}, decoding step: {module.decoding_step}") #print("Pre-Input: ", input) - torch.save(input, f"./hf_peft_tensors/{module.name}.input") + torch.save(input, f"./hf_peft_tensors/decoding_step_{module.decoding_step}_{name}.input") print("===") def peft_post_forward_hook(module, input, output): - print("Post-forward Hook activated for module: ", module.name) + assert(module.name is not None and module.decoding_step is not None) + name = module.name.replace("base_model.model.model", "") + print(f"Post-forward Hook activated for module: {name}, decoding step: {module.decoding_step}") #print("Post-Output: ", output) - torch.save(input, f"./hf_peft_tensors/{module.name}.output") + torch.save(input, f"./hf_peft_tensors/decoding_step_{module.decoding_step}_{name}.output") print("===") + module.decoding_step += 1 def main(): @@ -80,13 +85,13 @@ def main(): # Save weights for name, params in model.named_parameters(): if "lora" in name: - print(params, type(params)) torch.save(params, f"./hf_peft_tensors/{name}") #params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") # Save hidden states for name, layer in dict(model.named_modules()).items(): if "lora_A.default" in name or "lora_B.default" in name: layer.name = name + layer.decoding_step = 0 print(f"Adding hooks to layer {layer.name}") layer.register_forward_pre_hook(peft_pre_forward_hook) layer.register_forward_hook(peft_post_forward_hook) From 62edfaa92a41093ee6d7cdbd6e2f2dd4b7799f38 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 7 Nov 2023 04:18:09 +0000 Subject: [PATCH 064/198] linting --- src/ops/argmax.cc | 3 +- src/ops/beam_topk.cc | 6 ++-- src/ops/experts.cu | 20 ++++++------ src/ops/lora_linear.cc | 22 ++++++++++---- src/runtime/cuda_helper.cu | 62 +++++++++++++++++++++++--------------- src/runtime/hip_helper.cpp | 57 +++++++++++++++++++++-------------- 6 files changed, 105 insertions(+), 65 deletions(-) diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index e094abbf13..cabb8b204f 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -357,7 +357,8 @@ BeamInferenceResult copy_tensor_dev_to_host( indices.get_int32_ptr(), ir.token_ids, batch_size); copy_tensor_dev_to_host(m->probs, ir.probs, batch_size); - copy_tensor_dev_to_host(parent.get_int32_ptr(), ir.parent_id, batch_size); + copy_tensor_dev_to_host( + parent.get_int32_ptr(), ir.parent_id, batch_size); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc index d3166af392..a660a80301 100644 --- a/src/ops/beam_topk.cc +++ b/src/ops/beam_topk.cc @@ -393,8 +393,10 @@ BeamInferenceResult BeamInferenceResult ir; - copy_tensor_dev_to_host(index_ptr, ir.token_ids, batch_size * m->max_beam_width); - copy_tensor_dev_to_host(value_ptr, ir.probs, batch_size * m->max_beam_width); + 
copy_tensor_dev_to_host( + index_ptr, ir.token_ids, batch_size * m->max_beam_width); + copy_tensor_dev_to_host( + value_ptr, ir.probs, batch_size * m->max_beam_width); copy_tensor_dev_to_host( parent_ptr, ir.parent_id, batch_size * m->max_beam_width); diff --git a/src/ops/experts.cu b/src/ops/experts.cu index 614d755a35..f6f555d1ad 100644 --- a/src/ops/experts.cu +++ b/src/ops/experts.cu @@ -668,8 +668,8 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, } assert(non_zero_experts_count == non_zero_experts_check.size()); // 7. check exp_local_label_to_index - int *non_zero_expert_labels_cpu = - copy_tensor_dev_to_host(m->non_zero_expert_labels, non_zero_experts_count); + int *non_zero_expert_labels_cpu = copy_tensor_dev_to_host( + m->non_zero_expert_labels, non_zero_experts_count); // assert(non_zero_expert_labels_cpu != nullptr); std::vector non_zero_expert_labels_vec(non_zero_expert_labels_cpu, non_zero_expert_labels_cpu + @@ -684,8 +684,8 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, non_zero_experts_check_vec.end())); assert(non_zero_expert_labels_vec == non_zero_experts_check_vec); - int *exp_local_label_to_index = - copy_tensor_dev_to_host(m->exp_local_label_to_index, non_zero_experts_count); + int *exp_local_label_to_index = copy_tensor_dev_to_host( + m->exp_local_label_to_index, non_zero_experts_count); // assert(exp_local_label_to_index != nullptr); std::vector exp_local_label_to_index_vec(exp_local_label_to_index, exp_local_label_to_index + @@ -699,8 +699,8 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, } // 8. Check expert_start_indexes - int *expert_start_indices_thrust = - copy_tensor_dev_to_host(m->expert_start_indexes, non_zero_experts_count + 1); + int *expert_start_indices_thrust = copy_tensor_dev_to_host( + m->expert_start_indexes, non_zero_experts_count + 1); // assert(expert_start_indices_thrust != nullptr); std::vector expert_start_indices_thrust_vec( expert_start_indices_thrust, @@ -747,8 +747,8 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, (int *)calloc(non_zero_experts_count, sizeof(int)); assert(num_assignments_per_expert_thrust != nullptr); assert(copy_tensor_dev_to_host(m->num_assignments_per_expert, - num_assignments_per_expert_thrust, - non_zero_experts_count)); + num_assignments_per_expert_thrust, + non_zero_experts_count)); assert(num_assignments_per_expert_thrust != nullptr); std::vector num_assignments_per_expert_thrust_vec( num_assignments_per_expert_thrust, @@ -760,8 +760,8 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, (int *)calloc(non_zero_experts_count, sizeof(int)); assert(destination_start_indices_thrust != nullptr); assert(copy_tensor_dev_to_host(m->destination_start_indices, - destination_start_indices_thrust, - non_zero_experts_count)); + destination_start_indices_thrust, + non_zero_experts_count)); assert(destination_start_indices_thrust != nullptr); std::vector destination_start_indices_thrust_vec( destination_start_indices_thrust, diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 47d793446d..8115026f02 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -487,23 +487,33 @@ void LoraLinear::inference_task(Task const *task, std::cout << "base_filepath: " << base_filepath << std::endl; std::cout << "m->decoding_step: " << m->decoding_step << std::endl; if (m->decoding_step == 0) { - for (auto it = m->model_weights.begin(); it != m->model_weights.end(); ++it) { + for (auto it = m->model_weights.begin(); it != m->model_weights.end(); + ++it) { 
PEFTModelID peft_model_id = it->first; LoraLinearWeight weight = m->model_weights[peft_model_id]; std::string filenameA = base_filepath + "_weight_A"; std::string filenameB = base_filepath + "_weight_B"; if (m->input_type[0] == DT_FLOAT) { - save_tensor((float*)weight.w0_ptr, weight.rank * weight.in_dim, filenameA.c_str()); - save_tensor((float*)weight.w1_ptr, weight.rank * weight.out_dim, filenameB.c_str()); + save_tensor((float *)weight.w0_ptr, + weight.rank * weight.in_dim, + filenameA.c_str()); + save_tensor((float *)weight.w1_ptr, + weight.rank * weight.out_dim, + filenameB.c_str()); } else if (m->input_type[0] == DT_HALF) { - save_tensor((half*)weight.w0_ptr, weight.rank * weight.in_dim, filenameA.c_str()); - save_tensor((half*)weight.w1_ptr, weight.rank * weight.out_dim, filenameB.c_str()); + save_tensor((half *)weight.w0_ptr, + weight.rank * weight.in_dim, + filenameA.c_str()); + save_tensor((half *)weight.w1_ptr, + weight.rank * weight.out_dim, + filenameB.c_str()); } else { assert(false && "Data type not supported"); } } } - LoraLinear::save_inference_tensors_to_file(m, shard_id, bc, {input}, {}, {output}); + LoraLinear::save_inference_tensors_to_file( + m, shard_id, bc, {input}, {}, {output}); } } diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index 74575ea6ba..58d3dc8012 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -388,7 +388,8 @@ __host__ T *copy_tensor_dev_to_host(T const *ptr, size_t num_elements) { } template -__host__ void copy_tensor_dev_to_host(T const *ptr, T *dst, size_t num_elements) { +__host__ void + copy_tensor_dev_to_host(T const *ptr, T *dst, size_t num_elements) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(dst != nullptr); @@ -397,7 +398,8 @@ __host__ void copy_tensor_dev_to_host(T const *ptr, T *dst, size_t num_elements) } template -__host__ void copy_tensor_host_to_dev(T *dst, T const *src, size_t num_elements) { +__host__ void + copy_tensor_host_to_dev(T *dst, T const *src, size_t num_elements) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(src != nullptr); @@ -710,30 +712,42 @@ template __host__ void save_tensor(half const *ptr, size_t rect, char const *file_name); template __host__ float *copy_tensor_dev_to_host(float const *ptr, - size_t num_elements); + size_t num_elements); template __host__ half *copy_tensor_dev_to_host(half const *ptr, - size_t num_elements); + size_t num_elements); template __host__ double *copy_tensor_dev_to_host(double const *ptr, - size_t num_elements); -template __host__ int32_t *copy_tensor_dev_to_host(int32_t const *ptr, - size_t num_elements); -template __host__ int64_t *copy_tensor_dev_to_host(int64_t const *ptr, - size_t num_elements); -template __host__ void - copy_tensor_dev_to_host(float const *ptr, float *dst, size_t num_elements); -template __host__ void - copy_tensor_dev_to_host(half const *ptr, half *dst, size_t num_elements); + size_t num_elements); +template __host__ int32_t * + copy_tensor_dev_to_host(int32_t const *ptr, size_t num_elements); +template __host__ int64_t * + copy_tensor_dev_to_host(int64_t const *ptr, size_t num_elements); +template __host__ void copy_tensor_dev_to_host(float const *ptr, + float *dst, + size_t num_elements); +template __host__ void copy_tensor_dev_to_host(half const *ptr, + half *dst, + size_t num_elements); template __host__ void copy_tensor_dev_to_host(double const *ptr, - double *dst, - size_t num_elements); + double *dst, + size_t num_elements); template __host__ void 
copy_tensor_dev_to_host(int32_t const *ptr, - int32_t *dst, - size_t num_elements); + int32_t *dst, + size_t num_elements); template __host__ void copy_tensor_dev_to_host(int64_t const *ptr, - int64_t *dst, - size_t num_elements); -template __host__ void copy_tensor_host_to_dev(float *dst, float const *src, size_t num_elements); -template __host__ void copy_tensor_host_to_dev(half *dst, half const *src, size_t num_elements); -template __host__ void copy_tensor_host_to_dev(double *dst, double const *src, size_t num_elements); -template __host__ void copy_tensor_host_to_dev(int32_t *dst, int32_t const *src, size_t num_elements); -template __host__ void copy_tensor_host_to_dev(int64_t *dst, int64_t const *src, size_t num_elements); + int64_t *dst, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(float *dst, + float const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(half *dst, + half const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(double *dst, + double const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(int32_t *dst, + int32_t const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(int64_t *dst, + int64_t const *src, + size_t num_elements); diff --git a/src/runtime/hip_helper.cpp b/src/runtime/hip_helper.cpp index ac0e7c157f..5ab86deaab 100644 --- a/src/runtime/hip_helper.cpp +++ b/src/runtime/hip_helper.cpp @@ -367,7 +367,8 @@ __host__ T *copy_tensor_dev_to_host(T const *ptr, size_t num_elements) { } template -__host__ void copy_tensor_dev_to_host(T const *ptr, T *dst, size_t num_elements) { +__host__ void + copy_tensor_dev_to_host(T const *ptr, T *dst, size_t num_elements) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(dst != nullptr); @@ -376,7 +377,8 @@ __host__ void copy_tensor_dev_to_host(T const *ptr, T *dst, size_t num_elements) } template -__host__ void copy_tensor_host_to_dev(T *dst, T const *src, size_t num_elements) { +__host__ void + copy_tensor_host_to_dev(T *dst, T const *src, size_t num_elements) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(src != nullptr); @@ -613,28 +615,39 @@ template __host__ void save_tensor(half const *ptr, size_t rect, char const *file_name); template __host__ float *copy_tensor_dev_to_host(float const *ptr, - size_t num_elements); + size_t num_elements); template __host__ half *copy_tensor_dev_to_host(half const *ptr, - size_t num_elements); + size_t num_elements); template __host__ double *copy_tensor_dev_to_host(double const *ptr, - size_t num_elements); -template __host__ int32_t *copy_tensor_dev_to_host(int32_t const *ptr, - size_t num_elements); -template __host__ int64_t *copy_tensor_dev_to_host(int64_t const *ptr, - size_t num_elements); -template __host__ void - copy_tensor_dev_to_host(float const *ptr, float *dst, size_t num_elements); + size_t num_elements); +template __host__ int32_t * + copy_tensor_dev_to_host(int32_t const *ptr, size_t num_elements); +template __host__ int64_t * + copy_tensor_dev_to_host(int64_t const *ptr, size_t num_elements); +template __host__ void copy_tensor_dev_to_host(float const *ptr, + float *dst, + size_t num_elements); template __host__ void copy_tensor_dev_to_host(double const *ptr, - double *dst, - size_t num_elements); + double *dst, + size_t num_elements); template __host__ void copy_tensor_dev_to_host(int32_t const *ptr, - int32_t *dst, - size_t num_elements); + int32_t *dst, + size_t num_elements); template 
__host__ void copy_tensor_dev_to_host(int64_t const *ptr, - int64_t *dst, - size_t num_elements); -template __host__ void copy_tensor_host_to_dev(float *dst, float const *src, size_t num_elements); -template __host__ void copy_tensor_host_to_dev(half *dst, half const *src, size_t num_elements); -template __host__ void copy_tensor_host_to_dev(double *dst, double const *src, size_t num_elements); -template __host__ void copy_tensor_host_to_dev(int32_t *dst, int32_t const *src, size_t num_elements); -template __host__ void copy_tensor_host_to_dev(int64_t *dst, int64_t const *src, size_t num_elements); \ No newline at end of file + int64_t *dst, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(float *dst, + float const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(half *dst, + half const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(double *dst, + double const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(int32_t *dst, + int32_t const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(int64_t *dst, + int64_t const *src, + size_t num_elements); \ No newline at end of file From ddb5c2928608e5b489d666747b340872c7bd582e Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 7 Nov 2023 23:32:36 +0000 Subject: [PATCH 065/198] fix printing of tensors for numpy --- src/runtime/cuda_helper.cu | 6 +++++- src/runtime/hip_helper.cpp | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index 58d3dc8012..c2b2affc40 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -287,7 +287,11 @@ __host__ void tensor_file = fopen(file_name, "w"); assert(tensor_file != NULL); for (unsigned i = 0; i < num_elements; i++) { - fprintf(tensor_file, "%.9f, ", host_ptr[i]); + if (i < num_elements - 1) { + fprintf(tensor_file, "%.9f, ", host_ptr[i]); + } else { + fprintf(tensor_file, "%.9f", host_ptr[i]); + } } fclose(tensor_file); diff --git a/src/runtime/hip_helper.cpp b/src/runtime/hip_helper.cpp index 5ab86deaab..0ffc1a895d 100644 --- a/src/runtime/hip_helper.cpp +++ b/src/runtime/hip_helper.cpp @@ -266,7 +266,11 @@ __host__ void tensor_file = fopen(file_name, "w"); assert(tensor_file != NULL); for (unsigned i = 0; i < num_elements; i++) { - fprintf(tensor_file, "%.9f, ", host_ptr[i]); + if (i < num_elements - 1) { + fprintf(tensor_file, "%.9f, ", host_ptr[i]); + } else { + fprintf(tensor_file, "%.9f", host_ptr[i]); + } } fclose(tensor_file); From d276496f705e53c89f90c638ad4cc24cd03dcf53 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 8 Nov 2023 02:16:47 +0000 Subject: [PATCH 066/198] update save_inference_tensors_to_file --- src/ops/lora_linear.cc | 53 ++++++++++++++++++++++++++++++++++++----- src/runtime/operator.cc | 3 ++- 2 files changed, 49 insertions(+), 7 deletions(-) diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 8115026f02..ffd5f6a958 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -467,6 +467,7 @@ void LoraLinear::inference_task(Task const *task, // int num_infr_tokens = bc->num_active_infr_tokens(); // int num_peft_tokens = bc->num_active_peft_tokens(); inference_kernel_wrapper(m, bc, input, output); + if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; @@ -478,17 +479,47 @@ void LoraLinear::inference_task(Task const *task, // Directory does not exist, 
create it mkdir(folder_path, 0700); } + + std::string lora_layername = std::string(m->op_name); + std::string searchString = "lora"; + size_t found = lora_layername.find(searchString); + if (found == std::string::npos) { + std::cout << "LoraLinear layer name not in the right format (does not " + "contain word 'lora')" + << std::endl; + assert(false); + } + std::string lora_layername_substr = + lora_layername.substr(0, found + searchString.length()); + // output base filepath, shared by all tensors from the same operator std::string base_filepath = "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + "_decoding-step_" + std::to_string(m->decoding_step) + "_layer-num_" + std::to_string(m->layer_guid.transformer_layer_id) + "_layer-name_" + - m->op_name + "_shard-id_" + std::to_string(shard_id); - std::cout << "base_filepath: " << base_filepath << std::endl; - std::cout << "m->decoding_step: " << m->decoding_step << std::endl; + lora_layername_substr + "_shard-id_" + std::to_string(shard_id); + + // save batch config, if passed + if (bc != nullptr) { + bc->save_to_file(base_filepath + "_batch-config"); + } + + std::string filename = base_filepath + "_input_" + std::to_string(0); + if (input.data_type == DT_FLOAT) { + save_tensor( + input.get_float_ptr(), input.domain.get_volume(), filename.c_str()); + } else if (input.data_type == DT_HALF) { + save_tensor( + input.get_half_ptr(), input.domain.get_volume(), filename.c_str()); + } else { + assert(false); + } + + // std::cout << "base_filepath: " << base_filepath << std::endl; + // std::cout << "m->decoding_step: " << m->decoding_step << std::endl; if (m->decoding_step == 0) { for (auto it = m->model_weights.begin(); it != m->model_weights.end(); - ++it) { + ++it) { PEFTModelID peft_model_id = it->first; LoraLinearWeight weight = m->model_weights[peft_model_id]; std::string filenameA = base_filepath + "_weight_A"; @@ -512,8 +543,18 @@ void LoraLinear::inference_task(Task const *task, } } } - LoraLinear::save_inference_tensors_to_file( - m, shard_id, bc, {input}, {}, {output}); + + filename = base_filepath + "_output_" + std::to_string(0); + if (output.data_type == DT_FLOAT) { + save_tensor( + output.get_float_ptr(), output.domain.get_volume(), filename.c_str()); + } else if (output.data_type == DT_HALF) { + save_tensor( + output.get_half_ptr(), output.domain.get_volume(), filename.c_str()); + } else { + assert(false); + } + m->decoding_step++; } } diff --git a/src/runtime/operator.cc b/src/runtime/operator.cc index 0b3813f41c..c60fa08814 100644 --- a/src/runtime/operator.cc +++ b/src/runtime/operator.cc @@ -26,8 +26,9 @@ size_t Op::get_params_hash() const { } /*static*/ +template void Op::save_inference_tensors_to_file( - OpMeta *m, + OpMetaType *m, int shard_id, BatchConfig const *bc, std::vector input_tensors, From bc79d3b536cfcb6de0ac5f6dbfacb10492b9d3de Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 8 Nov 2023 02:17:16 +0000 Subject: [PATCH 067/198] linting --- src/ops/lora_linear.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index ffd5f6a958..4054173c2f 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -485,7 +485,7 @@ void LoraLinear::inference_task(Task const *task, size_t found = lora_layername.find(searchString); if (found == std::string::npos) { std::cout << "LoraLinear layer name not in the right format (does not " - "contain word 'lora')" + "contain word 'lora')" << std::endl; assert(false); } @@ -519,7 +519,7 @@ 
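All of the debugging dumps added in this patch share one file-naming convention. As a compact restatement (an illustrative sketch that mirrors the string built in the hunks above; the parameter names are hypothetical):

#include <string>

std::string debug_base_filepath(int model_id, int decoding_step, int layer_num,
                                std::string const &layer_name, int shard_id) {
  return "./inference_tensors/model_" + std::to_string(model_id) +
         "_decoding-step_" + std::to_string(decoding_step) + "_layer-num_" +
         std::to_string(layer_num) + "_layer-name_" + layer_name +
         "_shard-id_" + std::to_string(shard_id);
}

Individual tensors are then written under this prefix with suffixes such as _batch-config, _input_0, _weight_A, _weight_B, and _output_0.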
void LoraLinear::inference_task(Task const *task, // std::cout << "m->decoding_step: " << m->decoding_step << std::endl; if (m->decoding_step == 0) { for (auto it = m->model_weights.begin(); it != m->model_weights.end(); - ++it) { + ++it) { PEFTModelID peft_model_id = it->first; LoraLinearWeight weight = m->model_weights[peft_model_id]; std::string filenameA = base_filepath + "_weight_A"; From 8e34632c94924e0db444cc7a3bbe53fe0a38434d Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 8 Nov 2023 02:17:22 +0000 Subject: [PATCH 068/198] update --- tests/peft/hf_serve.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/peft/hf_serve.py b/tests/peft/hf_serve.py index 0af515e6a9..6e143550c8 100644 --- a/tests/peft/hf_serve.py +++ b/tests/peft/hf_serve.py @@ -12,18 +12,18 @@ def peft_pre_forward_hook(module, input): assert(module.name is not None and module.decoding_step is not None) - name = module.name.replace("base_model.model.model", "") + name = module.name.replace("base_model.model.model.", "") print(f"Pre-forward hook activated on module: {name}, decoding step: {module.decoding_step}") - #print("Pre-Input: ", input) + print("Pre-Input: ", input[0].shape) torch.save(input, f"./hf_peft_tensors/decoding_step_{module.decoding_step}_{name}.input") - print("===") + #print("===") def peft_post_forward_hook(module, input, output): assert(module.name is not None and module.decoding_step is not None) - name = module.name.replace("base_model.model.model", "") + name = module.name.replace("base_model.model.model.", "") print(f"Post-forward Hook activated for module: {name}, decoding step: {module.decoding_step}") - #print("Post-Output: ", output) - torch.save(input, f"./hf_peft_tensors/decoding_step_{module.decoding_step}_{name}.output") + print("Post-Input/Output: ", input[0].shape, output[0].shape) + torch.save(output, f"./hf_peft_tensors/decoding_step_{module.decoding_step}_{name}.output") print("===") module.decoding_step += 1 From b11c5e9d81bfbc84073443ac69eb0376c1aad7c8 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 8 Nov 2023 02:37:41 +0000 Subject: [PATCH 069/198] fix issue with save_inference_tensors_to_file --- include/flexflow/operator.h | 104 +++++++++++++++++++++++++++++++++- src/runtime/operator.cc | 110 ------------------------------------ 2 files changed, 102 insertions(+), 112 deletions(-) diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index b827148a3a..df796a7879 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -9,6 +9,14 @@ #include "flexflow/utils/dot/record_formatter.h" #include +#include +#include +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include "flexflow/utils/cuda_helper.h" +#else +#include "flexflow/utils/hip_helper.h" +#endif + namespace FlexFlow { extern LegionRuntime::Logger::Category log_measure; @@ -234,13 +242,105 @@ class Op { assert(false); } virtual void print_layer(FFModel const &model) = 0; + template static void save_inference_tensors_to_file( - OpMeta *m, + OpMetaType *m, int shard_id, BatchConfig const *bc, std::vector input_tensors, std::vector weight_tensors, - std::vector output_tensors); + std::vector output_tensors) { + // Check if output directory exists, and create it if it does not + char const *folder_path = "./inference_tensors"; + struct stat st = {0}; + if (stat(folder_path, &st) == -1) { + // Directory does not exist, create it + mkdir(folder_path, 0700); + } + // output base filepath, shared by all tensors from the same operator 
+ std::string base_filepath = + "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + + "_decoding-step_" + std::to_string(m->decoding_step) + "_layer-num_" + + std::to_string(m->layer_guid.transformer_layer_id) + "_layer-name_" + + m->op_name + "_shard-id_" + std::to_string(shard_id); + // save batch config, if passed + if (bc != nullptr) { + bc->save_to_file(base_filepath + "_batch-config"); + } + // save all inputs + for (int i = 0; i < input_tensors.size(); i++) { + std::string filename = base_filepath + "_input_" + std::to_string(i); + if (input_tensors[i].data_type == DT_FLOAT) { + save_tensor(input_tensors[i].get_float_ptr(), + input_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (input_tensors[i].data_type == DT_HALF) { + save_tensor(input_tensors[i].get_half_ptr(), + input_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (input_tensors[i].data_type == DT_INT32) { + save_tensor(input_tensors[i].get_int32_ptr(), + input_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (input_tensors[i].data_type == DT_INT64) { + save_tensor(input_tensors[i].get_int64_ptr(), + input_tensors[i].domain.get_volume(), + filename.c_str()); + } else { + assert(false && "Tensor data type not supported"); + } + } + // only dump the weights once + if (m->decoding_step == 0) { + for (int i = 0; i < weight_tensors.size(); i++) { + std::string filename = base_filepath + "_weight_" + std::to_string(i); + if (weight_tensors[i].data_type == DT_FLOAT) { + save_tensor(weight_tensors[i].get_float_ptr(), + weight_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (weight_tensors[i].data_type == DT_HALF) { + save_tensor(weight_tensors[i].get_half_ptr(), + weight_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (weight_tensors[i].data_type == DT_INT32) { + save_tensor(weight_tensors[i].get_int32_ptr(), + weight_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (weight_tensors[i].data_type == DT_INT64) { + save_tensor(weight_tensors[i].get_int64_ptr(), + weight_tensors[i].domain.get_volume(), + filename.c_str()); + } else { + assert(false && "Tensor data type not supported"); + } + } + } + // save all outputs + for (int i = 0; i < output_tensors.size(); i++) { + std::string filename = base_filepath + "_output_" + std::to_string(i); + if (output_tensors[i].data_type == DT_FLOAT) { + save_tensor(output_tensors[i].get_float_ptr(), + output_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (output_tensors[i].data_type == DT_HALF) { + save_tensor(output_tensors[i].get_half_ptr(), + output_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (output_tensors[i].data_type == DT_INT32) { + save_tensor(output_tensors[i].get_int32_ptr(), + output_tensors[i].domain.get_volume(), + filename.c_str()); + } else if (output_tensors[i].data_type == DT_INT64) { + save_tensor(output_tensors[i].get_int64_ptr(), + output_tensors[i].domain.get_volume(), + filename.c_str()); + } else { + assert(false && "Tensor data type not supported"); + } + } + // increase count of decoding steps + m->decoding_step++; + } virtual bool measure_operator_cost(Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const = 0; diff --git a/src/runtime/operator.cc b/src/runtime/operator.cc index c60fa08814..08b1af8ca5 100644 --- a/src/runtime/operator.cc +++ b/src/runtime/operator.cc @@ -3,14 +3,6 @@ #include "flexflow/simulator.h" #include -#include -#include -#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) 
-#include "flexflow/utils/cuda_helper.h" -#else -#include "flexflow/utils/hip_helper.h" -#endif - namespace FlexFlow { size_t Op::get_untyped_params_hash() const { @@ -25,106 +17,4 @@ size_t Op::get_params_hash() const { get_operator_type_name(this->op_type)); } -/*static*/ -template -void Op::save_inference_tensors_to_file( - OpMetaType *m, - int shard_id, - BatchConfig const *bc, - std::vector input_tensors, - std::vector weight_tensors, - std::vector output_tensors) { - - // Check if output directory exists, and create it if it does not - char const *folder_path = "./inference_tensors"; - struct stat st = {0}; - if (stat(folder_path, &st) == -1) { - // Directory does not exist, create it - mkdir(folder_path, 0700); - } - // output base filepath, shared by all tensors from the same operator - std::string base_filepath = - "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + - "_decoding-step_" + std::to_string(m->decoding_step) + "_layer-num_" + - std::to_string(m->layer_guid.transformer_layer_id) + "_layer-name_" + - m->op_name + "_shard-id_" + std::to_string(shard_id); - // save batch config, if passed - if (bc != nullptr) { - bc->save_to_file(base_filepath + "_batch-config"); - } - // save all inputs - for (int i = 0; i < input_tensors.size(); i++) { - std::string filename = base_filepath + "_input_" + std::to_string(i); - if (input_tensors[i].data_type == DT_FLOAT) { - save_tensor(input_tensors[i].get_float_ptr(), - input_tensors[i].domain.get_volume(), - filename.c_str()); - } else if (input_tensors[i].data_type == DT_HALF) { - save_tensor(input_tensors[i].get_half_ptr(), - input_tensors[i].domain.get_volume(), - filename.c_str()); - } else if (input_tensors[i].data_type == DT_INT32) { - save_tensor(input_tensors[i].get_int32_ptr(), - input_tensors[i].domain.get_volume(), - filename.c_str()); - } else if (input_tensors[i].data_type == DT_INT64) { - save_tensor(input_tensors[i].get_int64_ptr(), - input_tensors[i].domain.get_volume(), - filename.c_str()); - } else { - assert(false && "Tensor data type not supported"); - } - } - // only dump the weights once - if (m->decoding_step == 0) { - for (int i = 0; i < weight_tensors.size(); i++) { - std::string filename = base_filepath + "_weight_" + std::to_string(i); - if (weight_tensors[i].data_type == DT_FLOAT) { - save_tensor(weight_tensors[i].get_float_ptr(), - weight_tensors[i].domain.get_volume(), - filename.c_str()); - } else if (weight_tensors[i].data_type == DT_HALF) { - save_tensor(weight_tensors[i].get_half_ptr(), - weight_tensors[i].domain.get_volume(), - filename.c_str()); - } else if (weight_tensors[i].data_type == DT_INT32) { - save_tensor(weight_tensors[i].get_int32_ptr(), - weight_tensors[i].domain.get_volume(), - filename.c_str()); - } else if (weight_tensors[i].data_type == DT_INT64) { - save_tensor(weight_tensors[i].get_int64_ptr(), - weight_tensors[i].domain.get_volume(), - filename.c_str()); - } else { - assert(false && "Tensor data type not supported"); - } - } - } - // save all outputs - for (int i = 0; i < output_tensors.size(); i++) { - std::string filename = base_filepath + "_output_" + std::to_string(i); - if (output_tensors[i].data_type == DT_FLOAT) { - save_tensor(output_tensors[i].get_float_ptr(), - output_tensors[i].domain.get_volume(), - filename.c_str()); - } else if (output_tensors[i].data_type == DT_HALF) { - save_tensor(output_tensors[i].get_half_ptr(), - output_tensors[i].domain.get_volume(), - filename.c_str()); - } else if (output_tensors[i].data_type == DT_INT32) { - 
save_tensor(output_tensors[i].get_int32_ptr(), - output_tensors[i].domain.get_volume(), - filename.c_str()); - } else if (output_tensors[i].data_type == DT_INT64) { - save_tensor(output_tensors[i].get_int64_ptr(), - output_tensors[i].domain.get_volume(), - filename.c_str()); - } else { - assert(false && "Tensor data type not supported"); - } - } - // increase count of decoding steps - m->decoding_step++; -} - }; // namespace FlexFlow \ No newline at end of file From fca16ccf1b446fe89788f24ba0f36c6011891055 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 8 Nov 2023 05:02:32 +0000 Subject: [PATCH 070/198] fix layer names for save_inference_tensors_to_file --- include/flexflow/operator.h | 12 +++++++- .../ops/add_bias_residual_layer_norm_params.h | 1 + include/flexflow/ops/embedding_params.h | 1 + .../ops/inc_multihead_self_attention_params.h | 1 + include/flexflow/ops/linear_params.h | 1 + .../flexflow/ops/residual_layer_norm_params.h | 1 + .../flexflow/ops/residual_rms_norm_params.h | 1 + include/flexflow/ops/rms_norm_params.h | 1 + .../flexflow/ops/sigmoid_silu_multi_params.h | 1 + ...spec_inc_multihead_self_attention_params.h | 1 + ...tree_inc_multihead_self_attention_params.h | 1 + inference/incr_decoding/incr_decoding.cc | 8 ++++-- src/ops/add_bias_residual_layer_norm.cc | 12 +++++++- src/ops/inc_multihead_self_attention.cc | 5 +++- src/ops/linear.cc | 12 +++++++- src/ops/lora_linear.cc | 2 +- src/ops/residual_layer_norm.cc | 12 +++++++- src/ops/residual_rms_norm.cc | 12 +++++++- src/ops/rms_norm.cc | 13 ++++++++- src/ops/sigmoid_silu_multi.cc | 12 +++++++- src/ops/spec_inc_multihead_self_attention.cc | 5 +++- src/ops/tree_inc_multihead_self_attention.cc | 5 +++- src/runtime/graph.cc | 28 +++++++++++++++++++ 23 files changed, 135 insertions(+), 13 deletions(-) diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index df796a7879..388f9dcd6a 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -258,11 +258,21 @@ class Op { mkdir(folder_path, 0700); } // output base filepath, shared by all tensors from the same operator + std::string op_name_without_uid = std::string(m->op_name); + size_t last_underscore = op_name_without_uid.length() - 1; + for (int i = op_name_without_uid.length() - 1; i > 0; i--) { + if (!(std::isdigit(m->op_name[i]) || m->op_name[i] == '_')) { + break; + } else if (m->op_name[i] == '_') { + last_underscore = i; + } + } + op_name_without_uid.erase(last_underscore); std::string base_filepath = "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + "_decoding-step_" + std::to_string(m->decoding_step) + "_layer-num_" + std::to_string(m->layer_guid.transformer_layer_id) + "_layer-name_" + - m->op_name + "_shard-id_" + std::to_string(shard_id); + op_name_without_uid + "_shard-id_" + std::to_string(shard_id); // save batch config, if passed if (bc != nullptr) { bc->save_to_file(base_filepath + "_batch-config"); diff --git a/include/flexflow/ops/add_bias_residual_layer_norm_params.h b/include/flexflow/ops/add_bias_residual_layer_norm_params.h index 6f49983467..87fe2fb562 100644 --- a/include/flexflow/ops/add_bias_residual_layer_norm_params.h +++ b/include/flexflow/ops/add_bias_residual_layer_norm_params.h @@ -12,6 +12,7 @@ struct AddBiasResidualLayerNormParams { bool elementwise_affine; float eps; bool use_bias; + char name[MAX_OPNAME]; bool is_valid( std::pair const &) const; }; diff --git a/include/flexflow/ops/embedding_params.h b/include/flexflow/ops/embedding_params.h index 71e5cc8b20..d813132048 
100644 --- a/include/flexflow/ops/embedding_params.h +++ b/include/flexflow/ops/embedding_params.h @@ -12,6 +12,7 @@ struct EmbeddingParams { LayerID layer_guid; AggrMode aggr; DataType data_type; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/inc_multihead_self_attention_params.h b/include/flexflow/ops/inc_multihead_self_attention_params.h index 7ae39f1cfe..58681069e2 100644 --- a/include/flexflow/ops/inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/inc_multihead_self_attention_params.h @@ -16,6 +16,7 @@ struct IncMultiHeadSelfAttentionParams { scaling_query, qk_prod_scaling, position_bias; DataType quantization_type; bool offload; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/linear_params.h b/include/flexflow/ops/linear_params.h index 563304e89f..9a62ebd857 100644 --- a/include/flexflow/ops/linear_params.h +++ b/include/flexflow/ops/linear_params.h @@ -20,6 +20,7 @@ class LinearParams { float kernel_reg_lambda; DataType quantization_type; bool offload; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &input_shape) const; void solve_dims(const ParallelTensor input, diff --git a/include/flexflow/ops/residual_layer_norm_params.h b/include/flexflow/ops/residual_layer_norm_params.h index 24da4a2c08..949ae0c799 100644 --- a/include/flexflow/ops/residual_layer_norm_params.h +++ b/include/flexflow/ops/residual_layer_norm_params.h @@ -13,6 +13,7 @@ struct ResidualLayerNormParams { float eps; bool use_bias; bool use_two_residuals; + char name[MAX_OPNAME]; bool is_valid(std::tuple const &) const; diff --git a/include/flexflow/ops/residual_rms_norm_params.h b/include/flexflow/ops/residual_rms_norm_params.h index 64751a30b0..a4e4de59ab 100644 --- a/include/flexflow/ops/residual_rms_norm_params.h +++ b/include/flexflow/ops/residual_rms_norm_params.h @@ -11,6 +11,7 @@ struct ResidualRMSNormParams { LayerID layer_guid; float eps; int dim; + char name[MAX_OPNAME]; bool is_valid( std::pair const &input) const; }; diff --git a/include/flexflow/ops/rms_norm_params.h b/include/flexflow/ops/rms_norm_params.h index 81295322f0..2e4ceecf48 100644 --- a/include/flexflow/ops/rms_norm_params.h +++ b/include/flexflow/ops/rms_norm_params.h @@ -11,6 +11,7 @@ struct RMSNormParams { LayerID layer_guid; float eps; int dim; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/sigmoid_silu_multi_params.h b/include/flexflow/ops/sigmoid_silu_multi_params.h index c8182505b3..eb152db5c1 100644 --- a/include/flexflow/ops/sigmoid_silu_multi_params.h +++ b/include/flexflow/ops/sigmoid_silu_multi_params.h @@ -8,6 +8,7 @@ namespace FlexFlow { struct SigmoidSiluMultiParams { LayerID layer_guid; + char name[MAX_OPNAME]; bool is_valid( std::pair const &) const; }; diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h index 2f7a706bf1..4d1d78b1dd 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h @@ -13,6 +13,7 @@ struct SpecIncMultiHeadSelfAttentionParams { float dropout, scaling_factor; bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, scaling_query, qk_prod_scaling, position_bias; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git 
a/include/flexflow/ops/tree_inc_multihead_self_attention_params.h b/include/flexflow/ops/tree_inc_multihead_self_attention_params.h index 14fcde74ba..d1a51b8b8f 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention_params.h @@ -16,6 +16,7 @@ struct TreeIncMultiHeadSelfAttentionParams { scaling_query, qk_prod_scaling, position_bias; DataType quantization_type; bool offload; + char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 1921e05323..7ec574edf1 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -263,8 +263,12 @@ void FlexFlow::top_level_task(Task const *task, peft_model_name.empty() ? LoraLinearConfig::DefaultConfig : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); - PEFTModelID peft_model_id = model.register_peft_model( - LoraLinearConfig::DefaultConfig /*mlp_first*/, mlp_second /*mlp_second*/); + PEFTModelID peft_model_id = + peft_model_name.empty() + ? PEFTModelID::NO_ID + : model.register_peft_model( + LoraLinearConfig::DefaultConfig /*mlp_first*/, + mlp_second /*mlp_second*/); int total_num_requests = 0; { diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index 5d19dffdbc..ed682e81fc 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -58,6 +58,9 @@ AddBiasResidualLayerNormParams AddBiasResidualLayerNorm::get_params() const { params.elementwise_affine = this->elementwise_affine; params.eps = this->eps; params.use_bias = this->use_bias; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -213,7 +216,7 @@ AddBiasResidualLayerNorm::AddBiasResidualLayerNorm( params.use_bias, params.eps, allocate_weights, - name) {} + params.name) {} AddBiasResidualLayerNorm::AddBiasResidualLayerNorm( FFModel &model, @@ -1027,6 +1030,8 @@ void AddBiasResidualLayerNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->elementwise_affine); sez.serialize(this->eps); sez.serialize(this->use_bias); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -1055,6 +1060,10 @@ Node AddBiasResidualLayerNorm::deserialize(FFModel &ff, dez.deserialize(elementwise_affine); dez.deserialize(eps); dez.deserialize(use_bias); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); AddBiasResidualLayerNormParams params; params.layer_guid = layer_guid; @@ -1062,6 +1071,7 @@ Node AddBiasResidualLayerNorm::deserialize(FFModel &ff, params.elementwise_affine = elementwise_affine; params.eps = eps; params.use_bias = use_bias; + strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); } diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 5cf4dbdf7c..5e079bfb7f 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -567,7 +567,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( params.quantization_type, params.offload, params.tensor_parallelism_degree, - name) {} + params.name) {} void IncMultiHeadSelfAttention::init_inference( FFModel const &ff, @@ -1055,6 +1055,9 @@ IncMultiHeadSelfAttentionParams IncMultiHeadSelfAttention::get_params() const { 
params.quantization_type = this->quantization_type; params.offload = this->offload; params.num_kv_heads = this->num_kv_heads; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } diff --git a/src/ops/linear.cc b/src/ops/linear.cc index f8181570ce..2c8afb6eab 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -190,7 +190,7 @@ Linear::Linear(FFModel &model, params.quantization_type, params.offload, allocate_weights, - name) {} + params.name) {} Linear::Linear(FFModel &model, LayerID const &_layer_guid, @@ -1354,6 +1354,8 @@ void Linear::serialize(Legion::Serializer &sez) const { sez.serialize(this->data_type); sez.serialize(this->quantization_type); sez.serialize(this->offload); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } /* static */ @@ -1384,6 +1386,10 @@ Node Linear::deserialize(FFModel &ff, dez.deserialize(data_type); dez.deserialize(quantization_type); dez.deserialize(offload); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); LinearParams params; params.activation = activation; @@ -1395,6 +1401,7 @@ Node Linear::deserialize(FFModel &ff, params.layer_guid = layer_guid; params.quantization_type = quantization_type; params.offload = offload; + strcpy(params.name, name); return ff.get_or_create_node(inputs[0], params); } @@ -1409,6 +1416,9 @@ LinearParams Linear::get_params() const { params.kernel_reg_lambda = this->kernel_reg_lambda; params.quantization_type = this->quantization_type; params.offload = this->offload; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 4054173c2f..bcdf61b54e 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -674,7 +674,7 @@ Node LoraLinear::deserialize(FFModel &ff, size_t id, transformer_layer_id, deserialized_model_id; OperatorType op_type; size_t name_len; - char name[MAX_OPNAME]; + char name[MAX_OPNAME] = {0}; dez.deserialize(id); dez.deserialize(transformer_layer_id); dez.deserialize(deserialized_model_id); diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index 754b6105fa..1bfd52d107 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -63,6 +63,9 @@ ResidualLayerNormParams ResidualLayerNorm::get_params() const { params.eps = this->eps; params.use_bias = this->use_bias; params.use_two_residuals = this->use_two_residuals; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -228,7 +231,7 @@ ResidualLayerNorm::ResidualLayerNorm( params.use_bias, params.eps, allocate_weights, - name) {} + params.name) {} ResidualLayerNorm::ResidualLayerNorm(FFModel &model, LayerID const &_layer_guid, @@ -1069,6 +1072,8 @@ void ResidualLayerNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->eps); sez.serialize(this->use_bias); sez.serialize(this->use_two_residuals); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -1098,6 +1103,10 @@ Node ResidualLayerNorm::deserialize(FFModel &ff, dez.deserialize(eps); dez.deserialize(use_bias); dez.deserialize(use_two_residuals); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); if (use_two_residuals) { assert(num_inputs == 3); } else { @@ -1111,6 +1120,7 @@ Node ResidualLayerNorm::deserialize(FFModel &ff, params.eps = eps; params.use_bias = 
use_bias; params.use_two_residuals = use_two_residuals; + strcpy(params.name, name); if (use_two_residuals) { return ff.get_or_create_node( {inputs[0], inputs[1], inputs[2]}, params); diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index a6ed1dca9b..1f05c9bf4d 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -55,6 +55,9 @@ ResidualRMSNormParams ResidualRMSNorm::get_params() const { params.layer_guid = this->layer_guid; params.eps = this->eps; params.dim = this->dim; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -141,7 +144,7 @@ ResidualRMSNorm::ResidualRMSNorm( params.eps, params.dim, allocate_weights, - name) {} + params.name) {} ResidualRMSNorm::ResidualRMSNorm( FFModel &model, @@ -460,6 +463,8 @@ void ResidualRMSNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.model_id); sez.serialize(this->eps); sez.serialize(this->dim); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -478,10 +483,15 @@ Node ResidualRMSNorm::deserialize(FFModel &ff, LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); dez.deserialize(eps); dez.deserialize(dim); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); ResidualRMSNormParams params; params.layer_guid = layer_guid; params.eps = eps; params.dim = dim; + strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); } diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index 1a9bd7704e..0d7cc3b7af 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -53,6 +53,9 @@ RMSNormParams RMSNorm::get_params() const { params.layer_guid = this->layer_guid; params.eps = this->eps; params.dim = this->dim; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -583,6 +586,8 @@ void RMSNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.model_id); sez.serialize(this->eps); sez.serialize(this->dim); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -602,10 +607,16 @@ Node RMSNorm::deserialize(FFModel &ff, LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); dez.deserialize(eps); dez.deserialize(dim); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); RMSNormParams params; params.layer_guid = layer_guid; params.eps = eps; params.dim = dim; + strcpy(params.name, name); + return ff.get_or_create_node(inputs[0], params); } @@ -613,7 +624,7 @@ Op *RMSNorm::materialize(FFModel &ff, ParallelTensor inputs[], int num_inputs) const { RMSNormParams params = get_params(); - return new RMSNorm(ff, params, inputs[0], true, this->name); + return new RMSNorm(ff, params, inputs[0], true, params.name); } bool RMSNorm::measure_operator_cost(Simulator *sim, diff --git a/src/ops/sigmoid_silu_multi.cc b/src/ops/sigmoid_silu_multi.cc index e36eb36d31..b3771ea267 100644 --- a/src/ops/sigmoid_silu_multi.cc +++ b/src/ops/sigmoid_silu_multi.cc @@ -52,6 +52,9 @@ bool SigmoidSiluMultiParams::is_valid( SigmoidSiluMultiParams SigmoidSiluMulti::get_params() const { SigmoidSiluMultiParams params; params.layer_guid = this->layer_guid; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } @@ -110,7 +113,7 @@ SigmoidSiluMulti::SigmoidSiluMulti( std::pair const &inputs, char const 
*name) : SigmoidSiluMulti( - model, params.layer_guid, inputs.first, inputs.second, name) {} + model, params.layer_guid, inputs.first, inputs.second, params.name) {} SigmoidSiluMulti::SigmoidSiluMulti(FFModel &model, LayerID const &_layer_guid, @@ -532,6 +535,8 @@ void SigmoidSiluMulti::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); sez.serialize(this->layer_guid.model_id); + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); } using PCG::Node; @@ -546,9 +551,14 @@ Node SigmoidSiluMulti::deserialize(FFModel &ff, dez.deserialize(transformer_layer_id); dez.deserialize(deserialized_model_id); LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); SigmoidSiluMultiParams params; params.layer_guid = layer_guid; + strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); } diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc index eb6fd721e6..d4b74f20ae 100644 --- a/src/ops/spec_inc_multihead_self_attention.cc +++ b/src/ops/spec_inc_multihead_self_attention.cc @@ -511,7 +511,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( params.qk_prod_scaling, params.position_bias, allocate_weights, - name) {} + params.name) {} void SpecIncMultiHeadSelfAttention::init_inference( FFModel const &ff, @@ -853,6 +853,9 @@ SpecIncMultiHeadSelfAttentionParams params.scaling_factor = this->scaling_factor; params.qk_prod_scaling = this->qk_prod_scaling; params.position_bias = this->position_bias; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index d5a8a1063d..d0efb01d54 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -562,7 +562,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( params.quantization_type, params.offload, params.tensor_parallelism_degree, - name) {} + params.name) {} void TreeIncMultiHeadSelfAttention::init_inference( FFModel const &ff, @@ -927,6 +927,9 @@ TreeIncMultiHeadSelfAttentionParams params.qk_prod_scaling = this->qk_prod_scaling; params.position_bias = this->position_bias; params.tensor_parallelism_degree = this->tensor_parallelism_degree; + if (this->name != nullptr) { + strcpy(params.name, this->name); + } return params; } diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index b58990d32e..cc626c1b42 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -2298,6 +2298,8 @@ GraphOptimalViewSerialized sez.serialize(embed->out_channels); sez.serialize(embed->aggr); sez.serialize(embed->data_type); + sez.serialize(strlen(embed->name)); + sez.serialize(embed->name, strlen(embed->name)); break; } case OP_MULTIHEAD_ATTENTION: { @@ -2337,6 +2339,8 @@ GraphOptimalViewSerialized sez.serialize(attn->offload); sez.serialize(attn->num_kv_heads); sez.serialize(attn->tensor_parallelism_degree); + sez.serialize(strlen(attn->name)); + sez.serialize(attn->name, strlen(attn->name)); break; } case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { @@ -2359,6 +2363,8 @@ GraphOptimalViewSerialized sez.serialize(attn->qk_prod_scaling); sez.serialize(attn->position_bias); sez.serialize(attn->num_kv_heads); + sez.serialize(strlen(attn->name)); + 
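For reference, the operators changed in the hunks above and below all follow the same name round trip: the operator name is serialized as a length-prefixed byte string and restored into a zero-initialized buffer that is then copied into params.name. A condensed sketch of that pattern, assuming Legion's Serializer/Deserializer byte-copy semantics and FlexFlow's MAX_OPNAME buffer bound (both taken from the surrounding hunks; the deserialized name_len is assumed to fit within MAX_OPNAME, which these hunks do not check):

  // serialize: length first, then the raw bytes (no NUL terminator is written)
  sez.serialize(strlen(this->name));
  sez.serialize(this->name, strlen(this->name));

  // deserialize: the zero-initialized buffer keeps the copied name NUL-terminated
  size_t name_len;
  char name[MAX_OPNAME] = {0};
  dez.deserialize(name_len);
  dez.deserialize(name, name_len);
  strcpy(params.name, name);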
sez.serialize(attn->name, strlen(attn->name)); break; } case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { @@ -2384,6 +2390,8 @@ GraphOptimalViewSerialized sez.serialize(attn->offload); sez.serialize(attn->num_kv_heads); sez.serialize(attn->tensor_parallelism_degree); + sez.serialize(strlen(attn->name)); + sez.serialize(attn->name, strlen(attn->name)); break; } case OP_SOFTMAX: { @@ -2656,6 +2664,10 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(out_channels); dez.deserialize(aggr); dez.deserialize(data_type); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); EmbeddingParams params; params.aggr = aggr; @@ -2663,6 +2675,7 @@ void FFModel::deserialize_graph_optimal_view( params.out_channels = out_channels; params.layer_guid = layer_guid; params.data_type = data_type; + strcpy(params.name, name); node = get_or_create_node(inputs[0], params); break; } @@ -2798,6 +2811,10 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(offload); dez.deserialize(num_kv_heads); dez.deserialize(tensor_parallelism_degree); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); IncMultiHeadSelfAttentionParams params; params.embed_dim = embed_dim; @@ -2818,6 +2835,7 @@ void FFModel::deserialize_graph_optimal_view( params.offload = offload; params.num_kv_heads = num_kv_heads; params.tensor_parallelism_degree = tensor_parallelism_degree; + strcpy(params.name, name); node = get_or_create_node(inputs[0], params); break; } @@ -2846,6 +2864,10 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(qk_prod_scaling); dez.deserialize(position_bias); dez.deserialize(num_kv_heads); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); SpecIncMultiHeadSelfAttentionParams params; params.embed_dim = embed_dim; @@ -2863,6 +2885,7 @@ void FFModel::deserialize_graph_optimal_view( params.qk_prod_scaling = qk_prod_scaling; params.position_bias = position_bias; params.num_kv_heads = num_kv_heads; + strcpy(params.name, name); node = get_or_create_node(inputs[0], params); break; @@ -2897,6 +2920,10 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(offload); dez.deserialize(num_kv_heads); dez.deserialize(tensor_parallelism_degree); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); TreeIncMultiHeadSelfAttentionParams params; params.embed_dim = embed_dim; @@ -2917,6 +2944,7 @@ void FFModel::deserialize_graph_optimal_view( params.offload = offload; params.num_kv_heads = num_kv_heads; params.tensor_parallelism_degree = tensor_parallelism_degree; + strcpy(params.name, name); node = get_or_create_node(inputs[0], params); break; From 9095f2b5ab2c4528581afd5bb8c0371284f5b78f Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 9 Nov 2023 20:39:20 +0000 Subject: [PATCH 071/198] fix peft --- include/flexflow/batch_config.h | 2 +- src/ops/add_bias_residual_layer_norm.cc | 5 +++++ src/ops/fused.cu | 4 ++-- src/ops/inc_multihead_self_attention.cc | 2 +- src/ops/layer_norm.cc | 5 +++++ src/ops/linear.cc | 2 +- src/ops/lora_linear.cc | 2 +- src/ops/residual_layer_norm.cc | 4 ++++ src/ops/residual_rms_norm.cc | 3 +++ src/ops/rms_norm.cc | 3 +++ src/ops/sigmoid_silu_multi.cc | 2 +- src/ops/softmax.cc | 2 +- src/parallel_ops/allreduce.cc | 4 +++- src/runtime/batch_config.cc | 2 +- src/runtime/request_manager.cc | 8 +++++--- 15 files changed, 37 
insertions(+), 13 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 60ca550898..a592674b6e 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -61,7 +61,7 @@ class BatchConfig { static int const MAX_NUM_TOKENS = 1024; // Set by update - int num_tokens; + int num_tokens, num_peft_tokens; struct PerRequestInfo { PerRequestInfo() { diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index ed682e81fc..2ce2056050 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -917,6 +917,7 @@ Legion::FutureMap AddBiasResidualLayerNorm::peft_bwd( false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.add_future(bc); int field_id = 0; // output_grad launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, @@ -963,6 +964,10 @@ void AddBiasResidualLayerNorm::peft_bwd_task( std::vector const ®ions, Context ctx, Runtime *runtime) { + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } assert(task->regions.size() == regions.size()); AddBiasResidualLayerNormMeta const *m = *((AddBiasResidualLayerNormMeta **)task->local_args); diff --git a/src/ops/fused.cu b/src/ops/fused.cu index e44b9df951..1cb17ec20e 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -699,8 +699,8 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, FusedOp const *fused = metas->fused_op; // BatchConfig const *bc = (BatchConfig *)task->args; BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - // Return if no active tokens - if (bc->num_active_tokens() == 0) { + // Return if no active PEFT bwd tokens + if (bc->num_active_peft_tokens() == 0) { return; } diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 5e079bfb7f..d2c1209ade 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -948,7 +948,7 @@ void IncMultiHeadSelfAttention::peft_bwd_task( log_inc_mha.debug("BatchConfig, num_tokens: %d, num_requests: %d", bc->num_tokens, bc->num_active_requests()); - if (bc->num_tokens == 0) { + if (bc->num_active_peft_tokens() == 0) { return; } diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 0a467f0984..ba2d43022f 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -669,6 +669,7 @@ Legion::FutureMap false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.add_future(bc); // regions[0](I): output_grad launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, 0 /*projection id*/, @@ -704,6 +705,10 @@ void LayerNorm::peft_bwd_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } LayerNormMeta const *m = *((LayerNormMeta **)task->local_args); assert(task->regions.size() == regions.size()); diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 2c8afb6eab..86f958a433 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -736,7 +736,7 @@ void Linear::peft_bwd_task(Task const *task, ctx, task->regions[0].region.get_index_space()); LinearMeta const *m = *((LinearMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - if (bc->num_tokens == 0) { + if (bc->num_active_peft_tokens() == 0) { return; } assert(regions.size() == (3 + 
static_cast(m->use_bias))); diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index bcdf61b54e..5870243ade 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -609,7 +609,7 @@ void LoraLinear::peft_bwd_task(Task const *task, ctx, task->regions[0].region.get_index_space()); LoraLinearMeta *m = *((LoraLinearMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - if (bc->num_tokens == 0) { + if (bc->num_active_peft_tokens() == 0) { return; } assert(regions.size() == 6); diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index 1bfd52d107..e3b599d10f 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -758,6 +758,10 @@ void ResidualLayerNorm::peft_bwd_task( std::vector const ®ions, Context ctx, Runtime *runtime) { + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } assert(task->regions.size() == regions.size()); ResidualLayerNormMeta const *m = *((ResidualLayerNormMeta **)task->local_args); diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index 1f05c9bf4d..8013c0e81a 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -672,6 +672,9 @@ void ResidualRMSNorm::peft_bwd_task(Task const *task, assert(regions.size() == 4); ResidualRMSNormMeta const *m = *((ResidualRMSNormMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW residual_input0_grad = diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index 0d7cc3b7af..fe6944aa90 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -571,6 +571,9 @@ void RMSNorm::peft_bwd_task(Task const *task, assert(regions.size() == 3); RMSNormMeta const *m = *((RMSNormMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( diff --git a/src/ops/sigmoid_silu_multi.cc b/src/ops/sigmoid_silu_multi.cc index b3771ea267..14c202f784 100644 --- a/src/ops/sigmoid_silu_multi.cc +++ b/src/ops/sigmoid_silu_multi.cc @@ -411,7 +411,7 @@ void SigmoidSiluMulti::peft_bwd_task(Task const *task, SigmoidSiluMultiMeta *m = *((SigmoidSiluMultiMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - if (bc->num_active_peft_tokens() <= 0) { + if (bc->num_active_peft_tokens() == 0) { return; } diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index d0e38c8017..ae75849f85 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -431,7 +431,7 @@ void Softmax::peft_bwd_task(Task const *task, assert(regions.size() == 2); assert(task->regions.size() == 2); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - if (bc->num_tokens == 0) { + if (bc->num_active_peft_tokens() == 0) { return; } Domain in_domain = runtime->get_index_space_domain( diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc index 62e152b36c..7f147dad6f 100644 --- a/src/parallel_ops/allreduce.cc +++ 
b/src/parallel_ops/allreduce.cc @@ -387,7 +387,9 @@ void AllReduce::peft_bwd_task(Task const *task, AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - + if (bc->num_active_peft_tokens() == 0) { + return; + } GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index 22ab420674..20c0307a58 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -79,7 +79,7 @@ int BatchConfig::num_active_infr_tokens() const { } int BatchConfig::num_active_peft_tokens() const { - return 0; + return num_peft_tokens; } /*static*/ diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index df8d43bc38..e8adfcbded 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -607,17 +607,19 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, for (size_t i = 0; i < request.dataset[0].first.size(); i++) { new_bc.tokensInfo[new_bc.num_tokens].token_id = request.dataset[0].first[i]; - new_bc.tokensInfo[new_bc.num_tokens].request_index = num_peft_tokens; + new_bc.tokensInfo[new_bc.num_tokens].request_index = peft_req_idx; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = i; new_bc.num_tokens++; + new_bc.num_peft_tokens++; } for (size_t i = 0; i < request.dataset[0].second.size(); i++) { new_bc.tokensInfo[new_bc.num_tokens].token_id = request.dataset[0].second[i]; - new_bc.tokensInfo[new_bc.num_tokens].request_index = num_peft_tokens; + new_bc.tokensInfo[new_bc.num_tokens].request_index = peft_req_idx; int depth = request.dataset[0].first.size() + i; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; new_bc.num_tokens++; + new_bc.num_peft_tokens++; } } } @@ -2119,7 +2121,7 @@ GenerationResult RequestManager::generate_incr_decoding( BatchConfigFuture bcf = prepare_next_batch(next_batch.first, next_batch.second); FutureMap fm = im->inference(llm, 0, bcf); - // im->peft_bwd(llm, 0, bcf); + im->peft_bwd(llm, 0, bcf); assert(fm.get_future_map_domain().get_volume() == 1); InferenceResultFuture irf = fm.get_future(0); batch_pipeline.push(std::make_pair(bcf, irf)); From 97696041181d32679e1d1d0a8d7cf3cc2e1b8a97 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 10 Nov 2023 03:43:02 +0000 Subject: [PATCH 072/198] fix bwd bugs --- src/ops/inc_multihead_self_attention.cu | 7 ++++++- src/ops/kernels/linear_kernels.cu | 12 +++++++++--- src/runtime/cuda_helper.cu | 22 ++++++++++++++++++++++ 3 files changed, 37 insertions(+), 4 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index c406435327..1a93251db4 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -469,8 +469,13 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; #else - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + // For best performance, set the default cublas compute type to + // CUBLAS_COMPUTE_16F for half precision and to + // CUBLAS_COMPUTE_32F_FAST_16F for full precision cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + if (m->output_type[0] == DT_FLOAT) { + compute_type = 
CUBLAS_COMPUTE_32F_FAST_16F; + } #endif for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index dad6dc4e00..6f4016f2c2 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -457,14 +457,20 @@ void peft_bwd_kernel(LinearMeta const *m, cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); // update input_grad_ptr and output_grad_ptr offset - input_grad_ptr = static_cast
<DT *>(input_grad_ptr) + num_infr_tokens * in_dim; + int num_infr_only_tokens = num_infr_tokens - num_peft_tokens; + input_grad_ptr = static_cast<DT *>
(input_grad_ptr) + num_infr_only_tokens * in_dim; output_grad_ptr = - static_cast<DT *>
(output_grad_ptr) + num_infr_tokens * out_dim; + static_cast<DT *>
(output_grad_ptr) + num_infr_only_tokens * out_dim; #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = output_type; #else - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + // For best performance, set the default cublas compute type to + // CUBLAS_COMPUTE_16F for half precision and to + // CUBLAS_COMPUTE_32F_FAST_16F for full precision cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + if (m->output_type[0] == DT_FLOAT) { + compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + } #endif int output_size = out_dim * num_peft_tokens; if (m->activation == AC_MODE_RELU) { diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index c2b2affc40..e2078fa663 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -608,6 +608,28 @@ cudnnDataType_t cuda_to_cudnn_datatype(cudaDataType_t type) { return CUDNN_DATA_FLOAT; } +void check_device_vs_host_ptr(void const *maybe_devicePtr) { + cudaPointerAttributes attributes; + cudaError_t cudaStatus = cudaPointerGetAttributes(&attributes, maybe_devicePtr); + + if (cudaStatus == cudaSuccess) { + // Check attributes and perform actions accordingly + if (attributes.type == cudaMemoryTypeDevice) { + printf("Pointer is allocated in device memory.\n"); + } else if (attributes.type == cudaMemoryTypeHost) { + printf("Pointer is allocated in host memory.\n"); + } else if (attributes.type == cudaMemoryTypeUnregistered) { + printf("Pointer is unregistered.\n"); + } else if (attributes.type == cudaMemoryTypeManaged) { + printf("Pointer is managed.\n"); + } else { + printf("Pointer is not allocated in recognized memory type.\n"); + } + } else { + fprintf(stderr, "cudaPointerGetAttributes failed: %s\n", cudaGetErrorString(cudaStatus)); + } +} + template __global__ void assign_kernel(half *ptr, coord_t size, half value); template __global__ void From 880ede8541a26c976dcb57e9e7655fb93ed8d67f Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 10 Nov 2023 03:43:28 +0000 Subject: [PATCH 073/198] linting --- src/ops/kernels/linear_kernels.cu | 3 ++- src/runtime/cuda_helper.cu | 9 ++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 6f4016f2c2..e56c4124d6 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -458,7 +458,8 @@ void peft_bwd_kernel(LinearMeta const *m, cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); // update input_grad_ptr and output_grad_ptr offset int num_infr_only_tokens = num_infr_tokens - num_peft_tokens; - input_grad_ptr = static_cast
<DT *>(input_grad_ptr) + num_infr_only_tokens * in_dim; + input_grad_ptr = + static_cast<DT *>
(input_grad_ptr) + num_infr_only_tokens * in_dim; output_grad_ptr = static_cast<DT *>
(output_grad_ptr) + num_infr_only_tokens * out_dim; #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index e2078fa663..0de6d9bc63 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -610,7 +610,8 @@ cudnnDataType_t cuda_to_cudnn_datatype(cudaDataType_t type) { void check_device_vs_host_ptr(void const *maybe_devicePtr) { cudaPointerAttributes attributes; - cudaError_t cudaStatus = cudaPointerGetAttributes(&attributes, maybe_devicePtr); + cudaError_t cudaStatus = + cudaPointerGetAttributes(&attributes, maybe_devicePtr); if (cudaStatus == cudaSuccess) { // Check attributes and perform actions accordingly @@ -626,9 +627,11 @@ void check_device_vs_host_ptr(void const *maybe_devicePtr) { printf("Pointer is not allocated in recognized memory type.\n"); } } else { - fprintf(stderr, "cudaPointerGetAttributes failed: %s\n", cudaGetErrorString(cudaStatus)); + fprintf(stderr, + "cudaPointerGetAttributes failed: %s\n", + cudaGetErrorString(cudaStatus)); } -} +} template __global__ void assign_kernel(half *ptr, coord_t size, half value); From 818375de38e6fa6d0f0495901ba04a6a592102c5 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 10 Nov 2023 03:53:50 +0000 Subject: [PATCH 074/198] fixes --- inference/incr_decoding/incr_decoding.cc | 8 ++++---- src/ops/inc_multihead_self_attention.cu | 2 +- src/ops/lora_linear.cc | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 7ec574edf1..ed2b4705df 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -280,16 +280,16 @@ void FlexFlow::top_level_task(Task const *task, /*allow_exceptions */ true, /*ignore_comments */ true); std::vector prompts; - // std::vector> dataset; + std::vector> dataset; for (auto &prompt : prompt_json) { std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); total_num_requests++; prompts.push_back(text); - // dataset.push_back(std::make_pair(text, text)); + dataset.push_back(std::make_pair(text, text)); } - // rm->register_new_peft_request(dataset, 256 /*max_sequence_length*/, - // peft_model_id); + rm->register_new_peft_request( + dataset, 256 /*max_sequence_length*/, peft_model_id); // for (auto &prompt : prompts) { // GenerationResult result = model.generate(prompt, 128 // /*max_sequence_length*/); diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 1a93251db4..92a1f37097 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1294,7 +1294,7 @@ void IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( bias_ptr, stream); } else if (input_grad.data_type == DT_FLOAT) { - assert(m->offload); + assert(!m->offload); float const *bias_ptr = use_bias ? 
bias.get_float_ptr() : static_cast(nullptr); Kernels::IncMultiHeadAttention::peft_bwd_kernel(m, diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 5870243ade..eb14517fab 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -612,7 +612,7 @@ void LoraLinear::peft_bwd_task(Task const *task, if (bc->num_active_peft_tokens() == 0) { return; } - assert(regions.size() == 6); + assert(regions.size() == 2); assert(task->regions.size() == regions.size()); assert(m->input_type[0] == m->output_type[0]); From 2990e2054781af0fa67c1114fb192cc798322e9e Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 10 Nov 2023 04:02:06 +0000 Subject: [PATCH 075/198] fix --- src/ops/rms_norm.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index fe6944aa90..1e1de42b9a 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -577,9 +577,9 @@ void RMSNorm::peft_bwd_task(Task const *task, GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( - m->input_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + m->input_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); + m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); peft_bwd_kernel_wrapper(m, bc, output_grad, input_grad, weight); } From 6959e6864b1f5ad184890dd715a7caf58d10d049 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 10 Nov 2023 04:23:04 +0000 Subject: [PATCH 076/198] fix --- src/ops/rms_norm.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index 1e1de42b9a..e6df27d49a 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -523,7 +523,10 @@ Legion::FutureMap ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; - set_argumentmap_for_backward(ff, argmap); + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); IndexLauncher launcher(RMSNORM_PEFT_BWD_TASK_ID, parallel_is, TaskArgument(NULL, 0), @@ -531,7 +534,7 @@ Legion::FutureMap Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); + machine_view_hash); launcher.add_future(bc); // regions[0](I): output_grad launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, From 266368c69fbb7fcd8d02e06bce33cd22414f98f3 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 10 Nov 2023 04:51:54 +0000 Subject: [PATCH 077/198] fix --- include/flexflow/utils/cuda_helper.h | 1 + src/ops/kernels/rms_norm_kernels.cu | 11 ++++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index 983c20525e..999bc27634 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ -182,3 +182,4 @@ ncclDataType_t ff_to_nccl_datatype(DataType type); cudaDataType_t cudnn_to_cuda_datatype(cudnnDataType_t type); cudnnDataType_t cuda_to_cudnn_datatype(cudaDataType_t type); #endif +void check_device_vs_host_ptr(void const *maybe_devicePtr); diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index 8281506cbf..c9e0e02678 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -41,11 +41,15 @@ RMSNormMeta::RMSNormMeta(FFHandler handler, DataType data_type = rms->weights[0]->data_type; size_t rms_ptr_size = batch_size; + size_t c2_ptr_size = rms_ptr_size; size_t norm_ptr_size = num_elements; - size_t totalSize = (rms_ptr_size + norm_ptr_size) * data_type_size(data_type); + size_t totalSize = + (rms_ptr_size + c2_ptr_size + norm_ptr_size) * data_type_size(data_type); gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); rms_ptr = gpu_mem_allocator.allocate_instance_untyped( rms_ptr_size * data_type_size(data_type)); + c2_ptr = gpu_mem_allocator.allocate_instance_untyped( + c2_ptr_size * data_type_size(data_type)); norm_ptr = gpu_mem_allocator.allocate_instance_untyped( norm_ptr_size * data_type_size(data_type)); } @@ -473,6 +477,11 @@ void peft_bwd_kernel(RMSNormMeta const *m, const int64_t M = bc->requestsInfo[i].num_tokens_in_batch; const int64_t N = m->num_elements; + check_device_vs_host_ptr(output_grad_ptr); + check_device_vs_host_ptr(m->input_activation); + check_device_vs_host_ptr(weight_ptr); + check_device_vs_host_ptr(m->rms_ptr); + check_device_vs_host_ptr(m->c2_ptr); ComputeInternalGradientsCUDAKernel <<>>( N, From 06775bdd7af17d111893ed5d3c59fe86c015862d Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 10 Nov 2023 15:01:47 +0000 Subject: [PATCH 078/198] add bc fields for peft training --- include/flexflow/request_manager.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index a955eb0b9f..fa8c8ebeb7 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -57,6 +57,10 @@ struct Request { COMPLETED = 103, // finished and verified FINISHING = 104, // finishing request, but not yet verified }; + enum RequestType { + REQ_INFERENCE = 201, + REQ_FINETUNING = 202 + }; BatchConfig::RequestGuid guid; PEFTModelID peft_model_id; int max_sequence_length; @@ -68,6 +72,9 @@ struct Request { std::vector tokens; std::vector beam_trees; // PEFT field + RequestType 
req_type = REQ_INFERENCE; + int completed_training_steps = 0; + int max_training_steps = 1; std::vector, std::vector>> dataset; From 9f601770949faa84407cbad10bdb03717bef93c0 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 10 Nov 2023 15:31:19 +0000 Subject: [PATCH 079/198] linting --- include/flexflow/request_manager.h | 5 +- src/runtime/request_manager.cc | 113 ++++++++++++++++------------- 2 files changed, 63 insertions(+), 55 deletions(-) diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index fa8c8ebeb7..0aa654f9e7 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -57,10 +57,7 @@ struct Request { COMPLETED = 103, // finished and verified FINISHING = 104, // finishing request, but not yet verified }; - enum RequestType { - REQ_INFERENCE = 201, - REQ_FINETUNING = 202 - }; + enum RequestType { REQ_INFERENCE = 201, REQ_FINETUNING = 202 }; BatchConfig::RequestGuid guid; PEFTModelID peft_model_id; int max_sequence_length; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index f2b9c1ee52..faf99f37e5 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -425,7 +425,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, if (request.req_type == Request::REQ_FINETUNING) { // No new tokens generated when in fine-tuning mode continue; - } else if (old_bc.tokensInfo[i].abs_depth_in_request + 1 < request.tokens.size()) { + } else if (old_bc.tokensInfo[i].abs_depth_in_request + 1 < + request.tokens.size()) { // This is a prompt token continue; } else { @@ -449,31 +450,34 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; if (request.req_type == Request::REQ_FINETUNING) { - // fine-tuning requests don't automatically carry over to the next batch, - // we only do so if there is space left after adding new inference requests + // fine-tuning requests don't automatically carry over to the next + // batch, we only do so if there is space left after adding new + // inference requests request.completed_training_steps += 1; assert(request.completed_training_steps <= request.max_training_steps); if (request.completed_training_steps == request.max_training_steps) { // check if the fine tuning request has completed request.status = Request::COMPLETED; log_req_mgr.print("[Done] guid(%zu) completed_training_steps(%zu)", - old_bc.requestsInfo[i].request_guid, - request.completed_training_steps); + old_bc.requestsInfo[i].request_guid, + request.completed_training_steps); GenerationResult &gr = request_generation_results[request.guid]; assert(gr.guid == request.guid); num_processed_requests++; ProfileInfo profile_info = profiling_requests[request.guid]; - profile_info.finish_time = Realm::Clock::current_time_in_microseconds(); + profile_info.finish_time = + Realm::Clock::current_time_in_microseconds(); total_request_run_time += profile_info.finish_time - profile_info.start_time; profiling_requests[request.guid] = profile_info; - log_req_mgr.print("[Profile] guid(%zu) completed_training_steps(%d) start(%.1lf) " - "finish(%.1lf) latency(%.1lf)", - request.guid, - profile_info.completed_training_steps, - profile_info.start_time, - profile_info.finish_time, - profile_info.finish_time - profile_info.start_time); + log_req_mgr.print( + "[Profile] guid(%zu) completed_training_steps(%d) start(%.1lf) " + 
"finish(%.1lf) latency(%.1lf)", + request.guid, + profile_info.completed_training_steps, + profile_info.start_time, + profile_info.finish_time, + profile_info.finish_time - profile_info.start_time); } } else { int processed_tokens = @@ -482,7 +486,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, assert(processed_tokens < request.tokens.size()); bool request_completed = false; // printf("model_type = %d\n", this->model_type); - if (request.tokens.size() >= old_bc.requestsInfo[i].max_sequence_length) { + if (request.tokens.size() >= + old_bc.requestsInfo[i].max_sequence_length) { request_completed = true; } else if (request.tokens.back() == eos_token_id) { // Encounter EOS token id @@ -511,47 +516,51 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, log_req_mgr.print("Final output: %s", output.c_str()); num_processed_requests++; ProfileInfo profile_info = profiling_requests[request.guid]; - profile_info.finish_time = Realm::Clock::current_time_in_microseconds(); + profile_info.finish_time = + Realm::Clock::current_time_in_microseconds(); total_request_run_time += profile_info.finish_time - profile_info.start_time; profiling_requests[request.guid] = profile_info; - log_req_mgr.print("[Profile] guid(%zu) decoding_steps(%d) start(%.1lf) " - "finish(%.1lf) latency(%.1lf)", - request.guid, - profile_info.decoding_steps, - profile_info.start_time, - profile_info.finish_time, - profile_info.finish_time - profile_info.start_time); + log_req_mgr.print( + "[Profile] guid(%zu) decoding_steps(%d) start(%.1lf) " + "finish(%.1lf) latency(%.1lf)", + request.guid, + profile_info.decoding_steps, + profile_info.start_time, + profile_info.finish_time, + profile_info.finish_time - profile_info.start_time); // Write output to file if needed: if (!output_filepath.empty()) { - std::ofstream outputFile(output_filepath, std::ios::app); - if (outputFile.is_open()) { - outputFile << "end-to-end latency: " << std::fixed - << std::setprecision(3) << total_request_run_time - << std::endl; - outputFile << "num decoding steps: " << profile_info.decoding_steps - << std::endl; - outputFile << "token IDs: "; - for (int i = 0; i < request.tokens.size(); i++) { - outputFile << request.tokens[i]; - if (i < request.tokens.size() - 1) { - outputFile << ","; + std::ofstream outputFile(output_filepath, std::ios::app); + if (outputFile.is_open()) { + outputFile << "end-to-end latency: " << std::fixed + << std::setprecision(3) << total_request_run_time + << std::endl; + outputFile << "num decoding steps: " + << profile_info.decoding_steps << std::endl; + outputFile << "token IDs: "; + for (int i = 0; i < request.tokens.size(); i++) { + outputFile << request.tokens[i]; + if (i < request.tokens.size() - 1) { + outputFile << ","; + } } + outputFile << std::endl; + outputFile << output; + outputFile.close(); + } else { + std::cout << "Unable to open the output file: " << output_filepath + << std::endl; + assert(false); } - outputFile << std::endl; - outputFile << output; - outputFile.close(); - } else { - std::cout << "Unable to open the output file: " << output_filepath - << std::endl; - assert(false); } - } } else { new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens; - new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; + new_bc.requestsInfo[i].first_token_depth_in_request = + processed_tokens; + new_bc.requestsInfo[i].first_token_offset_in_batch = + new_bc.num_tokens; 
new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].peft_model_id = @@ -565,17 +574,18 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].num_tokens_in_batch = 1; } else { // Prompt phase - new_bc.requestsInfo[i].num_tokens_in_batch = - std::min(get_max_tokens_per_batch() - new_bc.num_tokens, - (int)request.tokens.size() - - new_bc.requestsInfo[i].first_token_depth_in_request); + new_bc.requestsInfo[i].num_tokens_in_batch = std::min( + get_max_tokens_per_batch() - new_bc.num_tokens, + (int)request.tokens.size() - + new_bc.requestsInfo[i].first_token_depth_in_request); } for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; new_bc.tokensInfo[new_bc.num_tokens].request_index = i; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; assert(depth < request.tokens.size()); - new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[depth]; + new_bc.tokensInfo[new_bc.num_tokens].token_id = + request.tokens[depth]; new_bc.num_tokens++; } // Update profiling @@ -625,7 +635,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } // Step 4: add PEFT bwd requests, if there is additional space - while(pending_peft_request_queue.size() > 0) { + while (pending_peft_request_queue.size() > 0) { Request &request = pending_peft_request_queue.front(); assert(request.req_type = Request::REQ_FINETUNING); if (request.status == Request::COMPLETED) { @@ -638,7 +648,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, Request &request = pending_peft_request_queue.front(); assert(request.req_type = Request::REQ_FINETUNING); assert(request.dataset.size() > 0); - assert(request.max_training_steps > 0 && request.completed_training_steps < max_training_steps); + assert(request.max_training_steps > 0 && + request.completed_training_steps < max_training_steps); int num_peft_tokens = request.dataset[0].first.size() + request.dataset[0].second.size(); if (num_peft_tokens + new_bc.num_active_tokens() <= From 9442b62c40831fd008a489a946032284d4e2e281 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 10 Nov 2023 15:36:33 +0000 Subject: [PATCH 080/198] fix --- src/runtime/request_manager.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index faf99f37e5..a224f400c6 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -458,7 +458,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, if (request.completed_training_steps == request.max_training_steps) { // check if the fine tuning request has completed request.status = Request::COMPLETED; - log_req_mgr.print("[Done] guid(%zu) completed_training_steps(%zu)", + log_req_mgr.print("[Done] guid(%zu) completed_training_steps(%d)", old_bc.requestsInfo[i].request_guid, request.completed_training_steps); GenerationResult &gr = request_generation_results[request.guid]; @@ -474,7 +474,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, "[Profile] guid(%zu) completed_training_steps(%d) start(%.1lf) " "finish(%.1lf) latency(%.1lf)", request.guid, - profile_info.completed_training_steps, + request.completed_training_steps, profile_info.start_time, profile_info.finish_time, profile_info.finish_time - profile_info.start_time); @@ -599,10 +599,10 @@ BatchConfig 
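For reference, the fine-tuning bookkeeping being reformatted in the hunk above reduces to a per-batch step counter with a completion check; a condensed restatement using the Request fields introduced earlier in this series (completed_training_steps, max_training_steps, status), sketching existing behavior rather than adding any:

  // one training step of this fine-tuning request finished in the previous batch
  request.completed_training_steps += 1;
  assert(request.completed_training_steps <= request.max_training_steps);
  if (request.completed_training_steps == request.max_training_steps) {
    // all steps done: mark the request complete and record profiling info
    request.status = Request::COMPLETED;
  }

Inference requests, by contrast, stay in the batch until they reach max_sequence_length or emit the EOS token, as handled in the else branch above.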
RequestManager::prepare_next_batch(BatchConfig const &old_bc, // Step 3: add new requests to the next batch if there is space for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { if (new_bc.request_completed[i]) { - if (!pending_request_queue.empty() && + if (!pending_infr_request_queue.empty() && new_bc.num_tokens < get_max_tokens_per_batch()) { - Request new_request = pending_request_queue.front(); - pending_request_queue.pop(); + Request new_request = pending_infr_request_queue.front(); + pending_infr_request_queue.pop(); // all_requests[new_request.guid] = new_request; new_bc.requestsInfo[i].first_token_depth_in_request = 0; new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; @@ -649,7 +649,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, assert(request.req_type = Request::REQ_FINETUNING); assert(request.dataset.size() > 0); assert(request.max_training_steps > 0 && - request.completed_training_steps < max_training_steps); + request.completed_training_steps < request.max_training_steps); int num_peft_tokens = request.dataset[0].first.size() + request.dataset[0].second.size(); if (num_peft_tokens + new_bc.num_active_tokens() <= From 11eccb1d269792b390505411bf2d7e83ddb4dd9b Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 10 Nov 2023 15:38:04 +0000 Subject: [PATCH 081/198] remove ptr check --- src/ops/kernels/rms_norm_kernels.cu | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index c9e0e02678..ae6a5d590d 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -477,11 +477,6 @@ void peft_bwd_kernel(RMSNormMeta const *m, const int64_t M = bc->requestsInfo[i].num_tokens_in_batch; const int64_t N = m->num_elements; - check_device_vs_host_ptr(output_grad_ptr); - check_device_vs_host_ptr(m->input_activation); - check_device_vs_host_ptr(weight_ptr); - check_device_vs_host_ptr(m->rms_ptr); - check_device_vs_host_ptr(m->c2_ptr); ComputeInternalGradientsCUDAKernel <<>>( N, From 9bfc557eafd7c7366b258cd76981de8a19734e7c Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 10 Nov 2023 16:01:44 +0000 Subject: [PATCH 082/198] fix --- src/runtime/request_manager.cc | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index a224f400c6..b62172eac3 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -416,7 +416,6 @@ BatchConfig RequestManager::prepare_next_batch_task( BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, InferenceResult const &result) { const std::lock_guard lock(request_queue_mutex); - // Step 1: append result from previous iteration to request's tokens for (int i = 0; i < old_bc.num_tokens; i++) { size_t guid = @@ -638,7 +637,9 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, while (pending_peft_request_queue.size() > 0) { Request &request = pending_peft_request_queue.front(); assert(request.req_type = Request::REQ_FINETUNING); - if (request.status == Request::COMPLETED) { + Request &all_req_handle = all_requests[request.guid]; + assert(all_req_handle.req_type = Request::REQ_FINETUNING); + if (all_req_handle.status == Request::COMPLETED) { pending_peft_request_queue.pop(); } else { break; @@ -648,6 +649,12 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, Request &request = 
pending_peft_request_queue.front(); assert(request.req_type = Request::REQ_FINETUNING); assert(request.dataset.size() > 0); + // update status and training steps + Request &all_req_handle = all_requests[request.guid]; + assert(all_req_handle.req_type = Request::REQ_FINETUNING); + request.completed_training_steps = all_req_handle.completed_training_steps; + request.status = all_req_handle.status; + assert(request.status != Request::COMPLETED); assert(request.max_training_steps > 0 && request.completed_training_steps < request.max_training_steps); int num_peft_tokens = From bcfae08f4f15a4f53473fd5ee8cdc58d3379e8fe Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 12 Nov 2023 21:12:16 +0000 Subject: [PATCH 083/198] implement save_operators for bwd --- include/flexflow/op_meta.h | 1 + include/flexflow/operator.h | 15 ++++++++++----- src/runtime/model.cc | 2 ++ src/runtime/request_manager.cc | 3 +++ 4 files changed, 16 insertions(+), 5 deletions(-) diff --git a/include/flexflow/op_meta.h b/include/flexflow/op_meta.h index dcf070c975..d31c12b16c 100644 --- a/include/flexflow/op_meta.h +++ b/include/flexflow/op_meta.h @@ -17,6 +17,7 @@ class OpMeta { bool profiling; // Measure the run time of the task bool inference_debugging; int decoding_step; + int bwd_step; char op_name[MAX_OPNAME]; LayerID layer_guid; bool trainable_inputs[MAX_NUM_INPUTS]; diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 388f9dcd6a..9d54996bf0 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -249,7 +249,8 @@ class Op { BatchConfig const *bc, std::vector input_tensors, std::vector weight_tensors, - std::vector output_tensors) { + std::vector output_tensors, + bool fwd_pass=true) { // Check if output directory exists, and create it if it does not char const *folder_path = "./inference_tensors"; struct stat st = {0}; @@ -270,7 +271,7 @@ class Op { op_name_without_uid.erase(last_underscore); std::string base_filepath = "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + - "_decoding-step_" + std::to_string(m->decoding_step) + "_layer-num_" + + "_decoding-step_" + (fwd_pass ? 
std::to_string(m->decoding_step) : std::to_string(m->bwd_step)) + "_layer-num_" + std::to_string(m->layer_guid.transformer_layer_id) + "_layer-name_" + op_name_without_uid + "_shard-id_" + std::to_string(shard_id); // save batch config, if passed @@ -300,8 +301,8 @@ class Op { assert(false && "Tensor data type not supported"); } } - // only dump the weights once - if (m->decoding_step == 0) { + // only dump the weights once (in fwd passes) + if (fwd_pass && m->decoding_step == 0) { for (int i = 0; i < weight_tensors.size(); i++) { std::string filename = base_filepath + "_weight_" + std::to_string(i); if (weight_tensors[i].data_type == DT_FLOAT) { @@ -349,7 +350,11 @@ class Op { } } // increase count of decoding steps - m->decoding_step++; + if (fwd_pass) { + m->decoding_step++; + } else { + m->bwd_step++; + } } virtual bool measure_operator_cost(Simulator *sim, MachineView const &mv, diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 4ccfe25a97..2ee4d4bc08 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1482,6 +1482,7 @@ OpMeta::OpMeta(FFHandler _handle) output_type[i] = DT_NONE; } decoding_step = 0; + bwd_step = 0; } #endif @@ -1502,6 +1503,7 @@ OpMeta::OpMeta(FFHandler _handle, Op const *op) output_type[i] = op->outputs[i]->data_type; } decoding_step = 0; + bwd_step = 0; } FFRuntime::FFRuntime(FFConfig &config) { diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index b62172eac3..9e38235bbb 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -601,6 +601,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, if (!pending_infr_request_queue.empty() && new_bc.num_tokens < get_max_tokens_per_batch()) { Request new_request = pending_infr_request_queue.front(); + assert(new_request.req_type == Request::REQ_INFERENCE); pending_infr_request_queue.pop(); // all_requests[new_request.guid] = new_request; new_bc.requestsInfo[i].first_token_depth_in_request = 0; @@ -611,6 +612,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, (int)new_request.tokens.size()); new_bc.requestsInfo[i].max_sequence_length = new_request.max_sequence_length; + new_bc.requestsInfo[i].peft_model_id = new_request.peft_model_id; + new_bc.requestsInfo[i].peft_bwd = false; new_bc.request_completed[i] = false; // add profile_info for the new request ProfileInfo profile_info; From d86272c69c850c50a2b9bbf0fa0038ebf3720c83 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 13 Nov 2023 00:11:41 +0000 Subject: [PATCH 084/198] fix bug --- src/runtime/batch_config.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index 20c0307a58..e37ab9aed3 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -25,7 +25,7 @@ LegionRuntime::Logger::Category log_bc("BatchConfig"); using Legion::Future; using Legion::Memory; -BatchConfig::BatchConfig() : num_tokens(0) { +BatchConfig::BatchConfig() : num_tokens(0), num_peft_tokens(0) { for (int i = 0; i < MAX_NUM_REQUESTS; i++) { requestsInfo[i].first_token_depth_in_request = 0; requestsInfo[i].first_token_offset_in_batch = 0; From 0a3258a932b9c069198182698f2b1fd420589bf9 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 13 Nov 2023 01:17:58 +0000 Subject: [PATCH 085/198] implement save tensors for bwd --- include/flexflow/batch_config.h | 2 +- include/flexflow/operator.h | 13 ++-- inference/incr_decoding/incr_decoding.cc | 60 ++++++++++----- 
src/ops/add_bias_residual_layer_norm.cc | 20 ++++- src/ops/element_unary.cc | 2 +- src/ops/fused.cu | 2 +- src/ops/group_by.cc | 2 +- src/ops/inc_multihead_self_attention.cc | 7 ++ src/ops/linear.cc | 8 +- src/ops/lora_linear.cc | 93 ++++++++++++++++++++++++ src/ops/residual_layer_norm.cc | 25 ++++++- src/ops/residual_rms_norm.cc | 14 +++- src/ops/rms_norm.cc | 8 +- src/ops/sigmoid_silu_multi.cc | 12 +++ src/ops/softmax.cc | 6 ++ 15 files changed, 241 insertions(+), 33 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index a592674b6e..8ddcec7d53 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -61,7 +61,7 @@ class BatchConfig { static int const MAX_NUM_TOKENS = 1024; // Set by update - int num_tokens, num_peft_tokens; + int num_tokens = 0, num_peft_tokens = 0; struct PerRequestInfo { PerRequestInfo() { diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 9d54996bf0..af39412232 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -249,8 +249,8 @@ class Op { BatchConfig const *bc, std::vector input_tensors, std::vector weight_tensors, - std::vector output_tensors, - bool fwd_pass=true) { + std::vector output_tensors, + bool fwd_pass = true) { // Check if output directory exists, and create it if it does not char const *folder_path = "./inference_tensors"; struct stat st = {0}; @@ -271,9 +271,12 @@ class Op { op_name_without_uid.erase(last_underscore); std::string base_filepath = "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + - "_decoding-step_" + (fwd_pass ? std::to_string(m->decoding_step) : std::to_string(m->bwd_step)) + "_layer-num_" + - std::to_string(m->layer_guid.transformer_layer_id) + "_layer-name_" + - op_name_without_uid + "_shard-id_" + std::to_string(shard_id); + (fwd_pass ? "_decoding-step_" : "_bwd-step_") + + (fwd_pass ? 
std::to_string(m->decoding_step) + : std::to_string(m->bwd_step)) + + "_layer-num_" + std::to_string(m->layer_guid.transformer_layer_id) + + "_layer-name_" + op_name_without_uid + "_shard-id_" + + std::to_string(shard_id); // save batch config, if passed if (bc != nullptr) { bc->save_to_file(base_filepath + "_batch-config"); diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index ed2b4705df..045f5de3c8 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -32,6 +32,7 @@ LegionRuntime::Logger::Category log_app("llama"); struct FilePaths { std::string cache_folder_path; std::string prompt_file_path; + std::string peft_dataset_path; std::string output_file_path; }; @@ -74,6 +75,11 @@ void parse_input_args(char **argv, paths.prompt_file_path = std::string(argv[++i]); continue; } + // PEFT dataset + if (!strcmp(argv[i], "-peft-dataset")) { + paths.peft_dataset_path = std::string(argv[++i]); + continue; + } // output file if (!strcmp(argv[i], "-output-file")) { paths.output_file_path = std::string(argv[++i]); @@ -271,29 +277,47 @@ void FlexFlow::top_level_task(Task const *task, mlp_second /*mlp_second*/); int total_num_requests = 0; + int total_dataset_entries = 0; { using json = nlohmann::json; - std::ifstream file_handle(file_paths.prompt_file_path); - assert(file_handle.good() && "Prompt file does not exist."); - json prompt_json = json::parse(file_handle, - /*parser_callback_t */ nullptr, - /*allow_exceptions */ true, - /*ignore_comments */ true); + std::vector prompts; std::vector> dataset; - for (auto &prompt : prompt_json) { - std::string text = prompt.get(); - printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); - total_num_requests++; - prompts.push_back(text); - dataset.push_back(std::make_pair(text, text)); + + // Load prompts for inference + if (!file_paths.prompt_file_path.empty()) { + std::ifstream prompt_file_handle(file_paths.prompt_file_path); + assert(prompt_file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(prompt_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + for (auto &prompt : prompt_json) { + std::string text = prompt.get(); + printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); + total_num_requests++; + prompts.push_back(text); + } + } + // Load HF dataset for PEFT training + if (!file_paths.peft_dataset_path.empty()) { + std::ifstream prompt_file_handle(file_paths.peft_dataset_path); + assert(prompt_file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(prompt_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + for (auto &prompt : prompt_json) { + std::string text = prompt.get(); + printf("Training dataset entry [%d]: %s\n", + total_dataset_entries, + text.c_str()); + total_dataset_entries++; + dataset.push_back(std::make_pair(text, text)); + rm->register_new_peft_request( + dataset, 256 /*max_sequence_length*/, peft_model_id); + } } - rm->register_new_peft_request( - dataset, 256 /*max_sequence_length*/, peft_model_id); - // for (auto &prompt : prompts) { - // GenerationResult result = model.generate(prompt, 128 - // /*max_sequence_length*/); - // } GenerationResult result = model.generate(prompts, 128 /*max_sequence_length*/, peft_model_id); } diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index 2ce2056050..82c71f517f 100644 
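For reference, the base_filepath assembled above resolves to one of two parallel naming schemes depending on fwd_pass; the bracketed placeholders below stand for the corresponding member values and are illustrative only:

  ./inference_tensors/model_[model_id]_decoding-step_[decoding_step]_layer-num_[transformer_layer_id]_layer-name_[op_name]_shard-id_[shard_id]
  ./inference_tensors/model_[model_id]_bwd-step_[bwd_step]_layer-num_[transformer_layer_id]_layer-name_[op_name]_shard-id_[shard_id]

Per-tensor suffixes (for example _batch-config and _weight_0) are appended to this base path, and the earlier operator.h hunk only dumps weight tensors when fwd_pass && m->decoding_step == 0.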
--- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -969,7 +969,7 @@ void AddBiasResidualLayerNorm::peft_bwd_task( return; } assert(task->regions.size() == regions.size()); - AddBiasResidualLayerNormMeta const *m = + AddBiasResidualLayerNormMeta *m = *((AddBiasResidualLayerNormMeta **)task->local_args); assert(regions.size() == 4 + m->elementwise_affine); @@ -1017,6 +1017,24 @@ void AddBiasResidualLayerNorm::peft_bwd_task( } AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( m, output_grad, input_grad, residual_grad, attn_bias_grad, gamma); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + std::vector weights_accessors; + weights_accessors.push_back(attn_bias_grad); + if (m->elementwise_affine) { + weights_accessors.push_back(gamma); + } + AddBiasResidualLayerNorm::save_inference_tensors_to_file( + m, + shard_id, + bc, + {input_grad, residual_grad}, + weights_accessors, + {output_grad}, + false /*fwd_pass*/); + } } bool AddBiasResidualLayerNorm::measure_operator_cost( diff --git a/src/ops/element_unary.cc b/src/ops/element_unary.cc index 844aeb6de3..c643da5625 100644 --- a/src/ops/element_unary.cc +++ b/src/ops/element_unary.cc @@ -557,7 +557,7 @@ void ElementUnary::forward_task_with_type( assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; std::vector input_accessors; - std::vector output_accessors; + std::vector output_accessors; if (m->inplace) { GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( m->data_type, regions[0], task->regions[0], FID_DATA, ctx, runtime); diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 1cb17ec20e..9954a8b43a 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -648,7 +648,7 @@ __host__ void if (metas->meta[op]->inference_debugging) { std::vector input_accessors_to_save; std::vector weight_accessors_to_save; - std::vector output_accessors_to_save; + std::vector output_accessors_to_save; for (int i = 0; i < fused->op_num_inputs[op]; i++) { int my_off = fused->op_input_idx[i + ioff]; if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { diff --git a/src/ops/group_by.cc b/src/ops/group_by.cc index 75960e7dcd..779d0d8f5d 100644 --- a/src/ops/group_by.cc +++ b/src/ops/group_by.cc @@ -396,7 +396,7 @@ void Group_by::forward_task(Task const *task, // Create a vector of n outputs, where n is the number of experts. 
// Each entry in the "outputs" vector points to the Legion tensor that will // contain the tockens dispatched to the corresponding expert - std::vector output_accessors; + std::vector output_accessors; float *outputs[n]; for (int i = 0; i < n; i++) { GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index d2c1209ade..b66d524303 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -998,6 +998,13 @@ void IncMultiHeadSelfAttention::peft_bwd_task( weight, output_grad, biases); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + IncMultiHeadSelfAttention::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {weight}, {output_grad}, false); + } } void IncMultiHeadSelfAttention::backward(FFModel const &ff) { diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 86f958a433..0887b6d35b 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -734,7 +734,7 @@ void Linear::peft_bwd_task(Task const *task, Runtime *runtime) { Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - LinearMeta const *m = *((LinearMeta **)task->local_args); + LinearMeta *m = *((LinearMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_active_peft_tokens() == 0) { return; @@ -765,6 +765,12 @@ void Linear::peft_bwd_task(Task const *task, out_dim, num_infr_tokens, num_peft_tokens); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + Linear::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {weight}, {output_grad}, false); + } } void Linear::forward_task(Task const *task, diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index eb14517fab..05edeab833 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -626,6 +626,99 @@ void LoraLinear::peft_bwd_task(Task const *task, // int num_infr_tokens = bc->num_active_infr_tokens(); // int num_peft_tokens = bc->num_active_peft_tokens(); peft_bwd_kernel_wrapper(m, bc, input_grad, output_grad); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + + // Check if output directory exists, and create it if it does not + char const *folder_path = "./inference_tensors"; + struct stat st = {0}; + if (stat(folder_path, &st) == -1) { + // Directory does not exist, create it + mkdir(folder_path, 0700); + } + + std::string lora_layername = std::string(m->op_name); + std::string searchString = "lora"; + size_t found = lora_layername.find(searchString); + if (found == std::string::npos) { + std::cout << "LoraLinear layer name not in the right format (does not " + "contain word 'lora')" + << std::endl; + assert(false); + } + std::string lora_layername_substr = + lora_layername.substr(0, found + searchString.length()); + + // output base filepath, shared by all tensors from the same operator + std::string base_filepath = + "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + + "_bwd-step_" + std::to_string(m->bwd_step) + "_layer-num_" + + std::to_string(m->layer_guid.transformer_layer_id) + "_layer-name_" + + lora_layername_substr + "_shard-id_" + std::to_string(shard_id); + + // save batch config, if passed + if (bc != nullptr) { + 
bc->save_to_file(base_filepath + "_batch-config"); + } + + std::string filename = base_filepath + "_input_" + std::to_string(0); + if (input_grad.data_type == DT_FLOAT) { + save_tensor(input_grad.get_float_ptr(), + input_grad.domain.get_volume(), + filename.c_str()); + } else if (input_grad.data_type == DT_HALF) { + save_tensor(input_grad.get_half_ptr(), + input_grad.domain.get_volume(), + filename.c_str()); + } else { + assert(false); + } + + // std::cout << "base_filepath: " << base_filepath << std::endl; + // std::cout << "m->decoding_step: " << m->decoding_step << std::endl; + if (m->bwd_step == 0) { + for (auto it = m->model_weights.begin(); it != m->model_weights.end(); + ++it) { + PEFTModelID peft_model_id = it->first; + LoraLinearWeight weight = m->model_weights[peft_model_id]; + std::string filenameA = base_filepath + "_weight_A"; + std::string filenameB = base_filepath + "_weight_B"; + if (m->input_type[0] == DT_FLOAT) { + save_tensor((float *)weight.w0_grad_ptr, + weight.rank * weight.in_dim, + filenameA.c_str()); + save_tensor((float *)weight.w1_grad_ptr, + weight.rank * weight.out_dim, + filenameB.c_str()); + } else if (m->input_type[0] == DT_HALF) { + save_tensor((half *)weight.w0_grad_ptr, + weight.rank * weight.in_dim, + filenameA.c_str()); + save_tensor((half *)weight.w1_grad_ptr, + weight.rank * weight.out_dim, + filenameB.c_str()); + } else { + assert(false && "Data type not supported"); + } + } + } + + filename = base_filepath + "_output_" + std::to_string(0); + if (output_grad.data_type == DT_FLOAT) { + save_tensor(output_grad.get_float_ptr(), + output_grad.domain.get_volume(), + filename.c_str()); + } else if (output_grad.data_type == DT_HALF) { + save_tensor(output_grad.get_half_ptr(), + output_grad.domain.get_volume(), + filename.c_str()); + } else { + assert(false); + } + m->bwd_step++; + } } void LoraLinear::backward(FFModel const &ff) { diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index e3b599d10f..4bee47de6c 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -763,8 +763,7 @@ void ResidualLayerNorm::peft_bwd_task( return; } assert(task->regions.size() == regions.size()); - ResidualLayerNormMeta const *m = - *((ResidualLayerNormMeta **)task->local_args); + ResidualLayerNormMeta *m = *((ResidualLayerNormMeta **)task->local_args); assert(regions.size() == 4 + m->use_two_residuals + (m->elementwise_affine ? (m->use_bias ? 
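The LoraLinear backward-debugging block above dumps gradients under ./inference_tensors using a path built from the model id, backward step, transformer layer number, a layer name truncated after "lora", and the shard id, with "_weight_A" and "_weight_B" suffixes for the rank*in_dim and rank*out_dim adapter gradients. A small Python sketch of that naming scheme, with illustrative values for every id and name:

import os

def debug_tensor_base_path(model_id, bwd_step, layer_num, layer_name, shard_id,
                           folder="./inference_tensors"):
    # Equivalent of the stat()/mkdir() guard in the C++ code
    os.makedirs(folder, exist_ok=True)
    return (f"{folder}/model_{model_id}_bwd-step_{bwd_step}"
            f"_layer-num_{layer_num}_layer-name_{layer_name}"
            f"_shard-id_{shard_id}")

base = debug_tensor_base_path(0, 0, 11, "layers_11_feed_forward_w2_lora", 0)
print(base + "_weight_A")   # where the LoRA A gradient would be written
print(base + "_weight_B")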
3 : 2) : 0)); @@ -814,6 +813,28 @@ void ResidualLayerNorm::peft_bwd_task( } ResidualLayerNorm::peft_bwd_kernel_wrapper( m, output_grad, input_grad, residual1_grad, residual2_grad, gamma); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + std::vector input_accessors; + input_accessors.push_back(input_grad); + input_accessors.push_back(residual1_grad); + if (m->use_two_residuals) { + input_accessors.push_back(residual2_grad); + } + std::vector weights_accessors; + if (m->elementwise_affine) { + weights_accessors.push_back(gamma); + } + ResidualLayerNorm::save_inference_tensors_to_file(m, + shard_id, + bc, + input_accessors, + weights_accessors, + {output_grad}, + false); + } } Op *ResidualLayerNorm::materialize(FFModel &ff, diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index 8013c0e81a..a57b9248c7 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -670,7 +670,7 @@ void ResidualRMSNorm::peft_bwd_task(Task const *task, Runtime *runtime) { assert(task->regions.size() == 4); assert(regions.size() == 4); - ResidualRMSNormMeta const *m = *((ResidualRMSNormMeta **)task->local_args); + ResidualRMSNormMeta *m = *((ResidualRMSNormMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_active_peft_tokens() == 0) { return; @@ -695,6 +695,18 @@ void ResidualRMSNorm::peft_bwd_task(Task const *task, m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); peft_bwd_kernel_wrapper( m, bc, output_grad, residual_input0_grad, residual_input1_grad, weight); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + ResidualRMSNorm::save_inference_tensors_to_file( + m, + shard_id, + bc, + {residual_input0_grad, residual_input1_grad}, + {weight}, + {output_grad}, + false); + } } Op *ResidualRMSNorm::materialize(FFModel &ff, diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index e6df27d49a..5a8cfe8eff 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -572,7 +572,7 @@ void RMSNorm::peft_bwd_task(Task const *task, Runtime *runtime) { assert(task->regions.size() == 3); assert(regions.size() == 3); - RMSNormMeta const *m = *((RMSNormMeta **)task->local_args); + RMSNormMeta *m = *((RMSNormMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_active_peft_tokens() == 0) { return; @@ -584,6 +584,12 @@ void RMSNorm::peft_bwd_task(Task const *task, GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); peft_bwd_kernel_wrapper(m, bc, output_grad, input_grad, weight); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + RMSNorm::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {weight}, {output_grad}, false); + } } void RMSNorm::serialize(Legion::Serializer &sez) const { diff --git a/src/ops/sigmoid_silu_multi.cc b/src/ops/sigmoid_silu_multi.cc index 14c202f784..d064bd0a1c 100644 --- a/src/ops/sigmoid_silu_multi.cc +++ b/src/ops/sigmoid_silu_multi.cc @@ -424,6 +424,18 @@ void SigmoidSiluMulti::peft_bwd_task(Task const *task, SigmoidSiluMulti::peft_bwd_kernel_wrapper( m, bc, output_grad, input1_grad, input2_grad); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = 
task->index_point.point_data[0]; + SigmoidSiluMulti::save_inference_tensors_to_file(m, + shard_id, + nullptr, + {input1_grad, input2_grad}, + {}, + {output_grad}, + false); + } } FutureMap SigmoidSiluMulti::inference( diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index ae75849f85..88ffec3642 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -442,6 +442,12 @@ void Softmax::peft_bwd_task(Task const *task, GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); peft_bwd_kernel_wrapper(m, bc, input_grad, output_grad); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + Softmax::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {}, {output_grad}, false); + } } bool Softmax::get_int_parameter(PMParameter para, int *value) const { From e34c40541e59dc8ff342aa9a228cf6ddb8938c22 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 15 Nov 2023 00:44:50 +0000 Subject: [PATCH 086/198] . --- inference/incr_decoding/incr_decoding.cc | 60 +++------ tests/peft/hf_finetune.py | 152 +++++++++++++++++++---- 2 files changed, 149 insertions(+), 63 deletions(-) diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 045f5de3c8..c76637a62c 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -32,7 +32,6 @@ LegionRuntime::Logger::Category log_app("llama"); struct FilePaths { std::string cache_folder_path; std::string prompt_file_path; - std::string peft_dataset_path; std::string output_file_path; }; @@ -75,11 +74,6 @@ void parse_input_args(char **argv, paths.prompt_file_path = std::string(argv[++i]); continue; } - // PEFT dataset - if (!strcmp(argv[i], "-peft-dataset")) { - paths.peft_dataset_path = std::string(argv[++i]); - continue; - } // output file if (!strcmp(argv[i], "-output-file")) { paths.output_file_path = std::string(argv[++i]); @@ -277,47 +271,29 @@ void FlexFlow::top_level_task(Task const *task, mlp_second /*mlp_second*/); int total_num_requests = 0; - int total_dataset_entries = 0; { using json = nlohmann::json; - + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); std::vector prompts; std::vector> dataset; - - // Load prompts for inference - if (!file_paths.prompt_file_path.empty()) { - std::ifstream prompt_file_handle(file_paths.prompt_file_path); - assert(prompt_file_handle.good() && "Prompt file does not exist."); - json prompt_json = json::parse(prompt_file_handle, - /*parser_callback_t */ nullptr, - /*allow_exceptions */ true, - /*ignore_comments */ true); - for (auto &prompt : prompt_json) { - std::string text = prompt.get(); - printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); - total_num_requests++; - prompts.push_back(text); - } - } - // Load HF dataset for PEFT training - if (!file_paths.peft_dataset_path.empty()) { - std::ifstream prompt_file_handle(file_paths.peft_dataset_path); - assert(prompt_file_handle.good() && "Prompt file does not exist."); - json prompt_json = json::parse(prompt_file_handle, - /*parser_callback_t */ nullptr, - /*allow_exceptions */ true, - /*ignore_comments */ true); - for (auto &prompt : prompt_json) { - std::string text = prompt.get(); - 
printf("Training dataset entry [%d]: %s\n", - total_dataset_entries, - text.c_str()); - total_dataset_entries++; - dataset.push_back(std::make_pair(text, text)); - rm->register_new_peft_request( - dataset, 256 /*max_sequence_length*/, peft_model_id); - } + for (auto &prompt : prompt_json) { + std::string text = prompt.get(); + printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); + total_num_requests++; + //prompts.push_back(text); + dataset.push_back(std::make_pair(text, text)); } + rm->register_new_peft_request( + dataset, 256 /*max_sequence_length*/, peft_model_id); + // for (auto &prompt : prompts) { + // GenerationResult result = model.generate(prompt, 128 + // /*max_sequence_length*/); + // } GenerationResult result = model.generate(prompts, 128 /*max_sequence_length*/, peft_model_id); } diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py index cf157a8913..3fe01db283 100644 --- a/tests/peft/hf_finetune.py +++ b/tests/peft/hf_finetune.py @@ -1,11 +1,17 @@ -import os, sys -#os.environ["CUDA_VISIBLE_DEVICES"]="0" +import os, sys, shutil import torch +# Reproducibility +import random +import numpy as np +torch.manual_seed(0) +random.seed(0) +np.random.seed(0) +#torch.use_deterministic_algorithms(True) import torch.nn as nn #import bitsandbytes as bnb from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, LlamaTokenizer import argparse -from peft import LoraConfig, get_peft_model +from peft import LoraConfig, get_peft_model, PeftModel import transformers from datasets import load_dataset @@ -27,6 +33,75 @@ def print_trainable_parameters(model): f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" ) +def convert_hf_weight_name(name): + return ( + name.replace(".", "_") + .replace("self_attn", "attention") + .replace("q_proj", "wq") + .replace("k_proj", "wk") + .replace("v_proj", "wv") + .replace("o_proj", "wo") + .replace("mlp", "feed_forward") + .replace("gate_proj", "w1") + .replace("down_proj", "w2") + .replace("up_proj", "w3") + .replace("input_layernorm", "attention_norm") + .replace("post_attention_layernorm", "ffn_norm") + .replace("embed_tokens", "tok_embeddings") + .replace("lm_head", "output") + .replace("model_", "") + .replace("base_", "") + .replace("default_", "") + ) + +def peft_backward_hook(module, grad_input, grad_output): + if len(grad_input) == 0 or len(grad_output) == 0: + return + assert(module.name is not None and module.bwd_step is not None) + name = module.name.replace("base_model.model.model.", "") + print(f"Backward Hook activated for module: {name}, bwd step: {module.bwd_step}") + print("Backward GRAD Input:") + for i,gi in enumerate(grad_input): + if type(gi) == torch.Tensor: + print(gi.shape) + torch.save(grad_output, f"./hf_peft_tensors/bwd_step_{module.bwd_step}_{name}.gi_{i}") + else: + print(gi) + print("Backward GRAD Output:") + for i, go in enumerate(grad_output): + if type(go) == torch.Tensor: + print(go.shape) + torch.save(grad_output, f"./hf_peft_tensors/bwd_step_{module.bwd_step}_{name}.go_{i}") + else: + print(go) + + print("===") + module.bwd_step += 1 + +def peft_forward_hook(module, input, output): + if len(input) == 0 or len(output) == 0: + return + assert(module.name is not None and module.fwd_step is not None) + name = module.name.replace("base_model.model.model.", "") + print(f"Forward Hook activated for module: {name}, fwd step: {module.fwd_step}") + print("Input:") + for i,inp in enumerate(input): + if type(inp) == torch.Tensor: + 
print(inp.shape) + torch.save(inp, f"./hf_peft_tensors/fwd_step_{module.fwd_step}_{name}.input_{i}") + else: + print(inp) + print("Output:") + for i, out in enumerate(output): + if type(out) == torch.Tensor: + print(out.shape) + torch.save(out, f"./hf_peft_tensors/fwd_step_{module.fwd_step}_{name}.output_{i}") + else: + print(out) + #print("Forward Input/Output: ", input[0].shape, output[0].shape) + print("===") + module.fwd_step += 1 + def main(): parser = argparse.ArgumentParser() parser.add_argument("--model-name", type=str, default="meta-llama/Llama-2-7b-hf") @@ -37,6 +112,7 @@ def main(): parser.add_argument("--use-full-precision", action="store_true", help="Use full precision") parser.add_argument("--output-dir", type=str, default="") parser.add_argument("--publish-peft-with-id", type=str, default="") + parser.add_argument("--save-peft-tensors", action="store_true", help="Save PEFT hidden states and weights to file") args = parser.parse_args() model_name = args.model_name use_full_precision=args.use_full_precision @@ -46,8 +122,9 @@ def main(): lora_dropout = args.lora_dropout output_dir = args.output_dir publish_peft_with_id = args.publish_peft_with_id - if len(output_dir) == 0 and len(publish_peft_with_id) == 0: - raise ValueError("Please pass either a --output-dir or a --publish-peft-with-id to specify where to store the fine-tuned model") + save_peft_tensors = args.save_peft_tensors + # if len(output_dir) == 0 and len(publish_peft_with_id) == 0: + # raise ValueError("Please pass either a --output-dir or a --publish-peft-with-id to specify where to store the fine-tuned model") # Change working dir to folder storing this script abspath = os.path.abspath(__file__) @@ -71,16 +148,18 @@ def main(): if tokenizer.pad_token is None: tokenizer.pad_token = "[PAD]" tokenizer.padding_side = "left" - + + peft_model_name = "goliaro/llama-2-7b-lora-full" + model = PeftModel.from_pretrained(model, peft_model_name) + for param in model.parameters(): param.requires_grad = False # freeze the model - train adapters later if param.ndim == 1: # cast the small parameters (e.g. 
layernorm) to fp32 for stability param.data = param.data.to(torch.float32) - model.gradient_checkpointing_enable() # reduce number of stored activations + #model.gradient_checkpointing_enable() # reduce number of stored activations model.enable_input_require_grads() - model.lm_head = CastOutputToFloat(model.lm_head) config = LoraConfig( @@ -89,26 +168,51 @@ def main(): #target_modules=["q_proj", "v_proj"], #target_modules=["down_proj"], target_modules=lora_target_modules, - lora_dropout=lora_dropout, + lora_dropout=0.0, bias="none", task_type="CAUSAL_LM" ) + model = get_peft_model(model, config) + print(model) print(model.named_parameters()) - model = get_peft_model(model, config) + #model = get_peft_model(model, config) print_trainable_parameters(model) - data = load_dataset("Abirate/english_quotes") + if save_peft_tensors: + shutil.rmtree("./hf_peft_tensors", ignore_errors=True) + # Check that the output folder exists + os.makedirs("./hf_peft_tensors", exist_ok=True) + # Save hidden states and gradients + for name, layer in dict(model.named_modules()).items(): + layer.name = name + layer.fwd_step = 0 + layer.bwd_step = 0 + print(f"Adding hooks to layer {layer.name}") + layer.register_forward_hook(peft_forward_hook) + layer.register_backward_hook(peft_backward_hook) + # Save weights + for name, params in model.named_parameters(): + if "lora" in name: + torch.save(params, f"./hf_peft_tensors/{name}") + # Overwrite FF cached weight + dst_folder = f"/home/ubuntu/.cache/flexflow/weights/{peft_model_name}/full-precision" + assert(os.path.exists(dst_folder)) + ff_w_name = convert_hf_weight_name(name) + print(f"{dst_folder}/{ff_w_name}") + params.detach().cpu().numpy().tofile(f"{dst_folder}/{ff_w_name}") + + data = load_dataset("/home/ubuntu/english_quotes") data = data.map(lambda samples: tokenizer(samples['quote']), batched=True) trainer = transformers.Trainer( model=model, train_dataset=data['train'], args=transformers.TrainingArguments( - per_device_train_batch_size=4, - gradient_accumulation_steps=4, - warmup_steps=100, - max_steps=200, + per_device_train_batch_size=1, + gradient_accumulation_steps=1, + warmup_steps=0, + max_steps=1, learning_rate=2e-4, fp16=True if not use_full_precision else False, logging_steps=1, @@ -117,15 +221,21 @@ def main(): data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False) ) model.config.use_cache = False # silence the warnings. Please re-enable for inference! + + for batch in trainer.get_train_dataloader(): + print("First batch: ") + print(batch) + break + trainer.train() - if len(output_dir) > 0: - print(f"Done fine-tuning! Saving the model to {output_dir}...") - model.save_pretrained(output_dir) + # if len(output_dir) > 0: + # print(f"Done fine-tuning! Saving the model to {output_dir}...") + # model.save_pretrained(output_dir) - if len(publish_peft_with_id) > 0: - print(f"Done fine-tuning! Uploading the model to HF hub with id: {publish_peft_with_id}...") - model.push_to_hub(publish_peft_with_id, use_auth_token=True) + # if len(publish_peft_with_id) > 0: + # print(f"Done fine-tuning! 
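The weight-overwrite loop above relies on convert_hf_weight_name to turn Hugging Face parameter names into FlexFlow's cached-weight filenames before overwriting the cached FlexFlow weight files. A quick check of what that replacement chain produces for a representative adapter parameter (the layer index and module choice are arbitrary examples):

def convert_hf_weight_name(name):
    # Same replacement chain as the helper defined in hf_finetune.py above
    return (
        name.replace(".", "_")
        .replace("self_attn", "attention")
        .replace("q_proj", "wq")
        .replace("k_proj", "wk")
        .replace("v_proj", "wv")
        .replace("o_proj", "wo")
        .replace("mlp", "feed_forward")
        .replace("gate_proj", "w1")
        .replace("down_proj", "w2")
        .replace("up_proj", "w3")
        .replace("input_layernorm", "attention_norm")
        .replace("post_attention_layernorm", "ffn_norm")
        .replace("embed_tokens", "tok_embeddings")
        .replace("lm_head", "output")
        .replace("model_", "")
        .replace("base_", "")
        .replace("default_", "")
    )

name = "base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight"
print(convert_hf_weight_name(name))   # -> layers_11_feed_forward_w2_lora_A_weight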
Uploading the model to HF hub with id: {publish_peft_with_id}...") + # model.push_to_hub(publish_peft_with_id, use_auth_token=True) if __name__ == "__main__": main() \ No newline at end of file From 87fbadae3e69e29607a4e3f768514ae961dc013b Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 15 Nov 2023 02:48:54 +0000 Subject: [PATCH 087/198] bug fix --- tests/peft/hf_finetune.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py index 3fe01db283..067178808c 100644 --- a/tests/peft/hf_finetune.py +++ b/tests/peft/hf_finetune.py @@ -64,14 +64,14 @@ def peft_backward_hook(module, grad_input, grad_output): for i,gi in enumerate(grad_input): if type(gi) == torch.Tensor: print(gi.shape) - torch.save(grad_output, f"./hf_peft_tensors/bwd_step_{module.bwd_step}_{name}.gi_{i}") + torch.save(gi, f"./hf_peft_tensors/bwd_step_{module.bwd_step}_{name}.gi_{i}") else: print(gi) print("Backward GRAD Output:") for i, go in enumerate(grad_output): if type(go) == torch.Tensor: print(go.shape) - torch.save(grad_output, f"./hf_peft_tensors/bwd_step_{module.bwd_step}_{name}.go_{i}") + torch.save(go, f"./hf_peft_tensors/bwd_step_{module.bwd_step}_{name}.go_{i}") else: print(go) @@ -201,6 +201,8 @@ def main(): ff_w_name = convert_hf_weight_name(name) print(f"{dst_folder}/{ff_w_name}") params.detach().cpu().numpy().tofile(f"{dst_folder}/{ff_w_name}") + if "lm_head" in name: + torch.save(params, f"./hf_peft_tensors/{name}") data = load_dataset("/home/ubuntu/english_quotes") data = data.map(lambda samples: tokenizer(samples['quote']), batched=True) From 52759bdc1e127ef842fb2fbb7f78bc75bf5d8789 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 15 Nov 2023 03:54:49 +0000 Subject: [PATCH 088/198] fix --- tests/peft/hf_finetune.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py index 067178808c..6dcb692f76 100644 --- a/tests/peft/hf_finetune.py +++ b/tests/peft/hf_finetune.py @@ -190,7 +190,7 @@ def main(): layer.bwd_step = 0 print(f"Adding hooks to layer {layer.name}") layer.register_forward_hook(peft_forward_hook) - layer.register_backward_hook(peft_backward_hook) + layer.register_full_backward_hook(peft_backward_hook) # Save weights for name, params in model.named_parameters(): if "lora" in name: From 2a5371da46ac9034ed6d4fe2dc360295f56f1567 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 15 Nov 2023 23:06:32 +0000 Subject: [PATCH 089/198] align linear --- src/ops/kernels/linear_kernels.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index e56c4124d6..0a2b5df06d 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -452,7 +452,6 @@ void peft_bwd_kernel(LinearMeta const *m, checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - DT alpha = 1.0f; cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); @@ -493,6 +492,7 @@ void peft_bwd_kernel(LinearMeta const *m, // Compute data gradient // NOTE: we use alpha=1 for input_grad to accumulate gradients + DT alpha = 1.0f, beta = 0.0f; if (input_grad_ptr != NULL) { checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, @@ -507,7 +507,7 @@ void peft_bwd_kernel(LinearMeta const *m, 
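The two fixes above tighten the hook instrumentation from the previous patch: the backward hook now saves grad_input and grad_output under their own .gi_* and .go_* names instead of writing grad_output twice, and hooks are registered with register_full_backward_hook rather than the deprecated register_backward_hook (a later patch also saves grad_output before grad_input). A condensed sketch of the corrected pattern on a toy module (module name and file names are illustrative; tensors are saved into the current directory):

import torch
import torch.nn as nn

def fwd_hook(module, inputs, outputs):
    outs = outputs if isinstance(outputs, tuple) else (outputs,)
    for i, t in enumerate(outs):
        if torch.is_tensor(t):
            torch.save(t, f"fwd_step_{module.fwd_step}_{module.name}.output_{i}")
    module.fwd_step += 1

def bwd_hook(module, grad_input, grad_output):
    for i, g in enumerate(grad_output):   # upstream gradients
        if torch.is_tensor(g):
            torch.save(g, f"bwd_step_{module.bwd_step}_{module.name}.go_{i}")
    for i, g in enumerate(grad_input):    # gradients w.r.t. the module inputs
        if torch.is_tensor(g):
            torch.save(g, f"bwd_step_{module.bwd_step}_{module.name}.gi_{i}")
    module.bwd_step += 1

layer = nn.Linear(4, 4)
layer.name, layer.fwd_step, layer.bwd_step = "toy_linear", 0, 0
layer.register_forward_hook(fwd_hook)
layer.register_full_backward_hook(bwd_hook)
layer(torch.randn(2, 4)).sum().backward()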
output_grad_ptr, output_type, out_dim, - &alpha, + &beta, input_grad_ptr, input_type, in_dim, From ed0be61ad14f3ae4292c888ca7b088660880d10d Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 15 Nov 2023 22:27:22 -0500 Subject: [PATCH 090/198] fix --- python/flexflow/serve/serve.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 24cf9efb30..3349809670 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -550,7 +550,12 @@ def download_hf_config(self): print(f"Creating directory {self.config_dir} (if it doesn't exist)...") print(f"Saving {self.peft_model_id} configs to file {self.config_path}...") with open(self.config_path, "w") as json_file: - json.dump(self.hf_config.to_dict(), json_file, indent=2) + class SetEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, set): + return list(obj) + return super().default(obj) + json.dump(self.hf_config.to_dict(), json_file, indent=2, cls=SetEncoder) def __get_revision_hashes(self, peft_model_id: str): ff_revision = None From 8a0b6ea7d7cb1ac38a2ed9cfdf610638a3352fa6 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 17 Nov 2023 07:20:27 +0000 Subject: [PATCH 091/198] bwd kernel updates --- src/ops/kernels/linear_kernels.cu | 21 ++-- src/ops/kernels/residual_rms_norm_kernels.cu | 100 ++++++++++--------- 2 files changed, 62 insertions(+), 59 deletions(-) diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 0a2b5df06d..21629ec024 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -461,17 +461,18 @@ void peft_bwd_kernel(LinearMeta const *m, static_cast
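The "align linear" change above hands cuBLAS a separate beta = 0.0 for the input-gradient GEMM instead of reusing alpha. Since cublasGemmEx computes C = alpha * op(A) * op(B) + beta * C, beta = 0 overwrites input_grad rather than accumulating into whatever it already holds (the surviving NOTE comment about accumulating with alpha = 1 no longer matches the code). A small NumPy illustration of the two behaviours:

import numpy as np

def gemm(alpha, A, B, beta, C):
    # The cuBLAS GEMM contract: C <- alpha * A @ B + beta * C
    return alpha * (A @ B) + beta * C

rng = np.random.default_rng(0)
W = rng.standard_normal((3, 2))
dY = rng.standard_normal((2, 4))
dX_prev = rng.standard_normal((3, 4))        # stale contents of input_grad

overwrite = gemm(1.0, W, dY, 0.0, dX_prev)   # beta = 0: previous values ignored
accumulate = gemm(1.0, W, dY, 1.0, dX_prev)  # beta = 1: gradients accumulate
print(np.allclose(accumulate, overwrite + dX_prev))   # True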
(input_grad_ptr) + num_infr_only_tokens * in_dim; output_grad_ptr = static_cast
(output_grad_ptr) + num_infr_only_tokens * out_dim; -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = output_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = output_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif int output_size = out_dim * num_peft_tokens; if (m->activation == AC_MODE_RELU) { relu_backward_kernel(m->output_type[0], diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index de84e50e29..42a8747cbf 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -128,18 +128,13 @@ __global__ void ResidualRMSNormFusedForwardKernel(int64_t N, __shared__ float v_shared[C10_WARP_SIZE]; int64_t const i = blockIdx.x; float sum = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { int64_t const index = i * N + j; X_out[index] = X1[index] + X2[index]; sum += (static_cast(X_out[index]) * static_cast(X_out[index])); } - sum = BlockReduceSum( - sum, - v_shared, - min(blockDim.x, - kCUDABlockReduceNumThreads)); // use BlockReduceSum() to sum X_ij^2 + sum = BlockReduceSum(sum, v_shared); if (threadIdx.x == 0) { rms[i] = static_cast(rsqrt((sum / static_cast(N)) + eps)); @@ -147,11 +142,12 @@ __global__ void ResidualRMSNormFusedForwardKernel(int64_t N, __syncthreads(); - using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; - Y[index] = static_cast(X_out[index]) * static_cast(rms[i]); - output[index] = Y[index] * weights[index % N]; + Y[index] = static_cast(static_cast(X_out[index]) * + static_cast(rms[i])); + output[index] = static_cast(static_cast(Y[index]) * + static_cast(weights[index % N])); } } @@ -164,26 +160,17 @@ void forward_kernel(ResidualRMSNormMeta const *m, T *output_ptr, cudaStream_t stream) { - std::pair kernel1_parallelism = - std::make_pair(m->batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); - ResidualRMSNormFusedForwardKernel - <<>>(m->in_dim, - m->eps, - input1_ptr, - input2_ptr, - residual_output_ptr, - static_cast(m->rms_ptr), - static_cast(m->norm_ptr), - weight_ptr, - output_ptr); + <<batch_size, std::min(CUDA_NUM_THREADS, m->in_dim), 0, stream>>>( + m->in_dim, + m->eps, + input1_ptr, + input2_ptr, + residual_output_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + weight_ptr, + output_ptr); } void forward_kernel_wrapper(ResidualRMSNormMeta const *m, @@ -345,16 
+332,22 @@ void inference_kernel_wrapper(ResidualRMSNormMeta *m, template __global__ void ComputeInternalGradientsCUDAKernel( int64_t N, T const *dY, T const *X, T const *gamma, T const *rrms, T *c2) { - __shared__ T ds_storage[C10_WARP_SIZE]; + __shared__ float ds_storage[C10_WARP_SIZE]; const int64_t i = blockIdx.x; - T ds = 0; + float ds = 0; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { int const index = i * N + j; - ds += dY[index] * X[index] * gamma[j]; + ds += static_cast(dY[index]) * static_cast(X[index]) * + static_cast(gamma[j]); } - ds = BlockReduceSum(ds, ds_storage); + ds = BlockReduceSum(ds, ds_storage); if (threadIdx.x == 0) { - c2[i] = -ds * (rrms[i] * rrms[i] * rrms[i]) / static_cast((int)N); + float const c2_val = + -ds * + (static_cast(rrms[i]) * static_cast(rrms[i]) * + static_cast(rrms[i])) / + static_cast((int)N); + c2[i] = static_cast(c2_val); } } @@ -370,9 +363,14 @@ __global__ void RMSNormBackwardCUDAKernel(int64_t N, const int64_t i = blockIdx.x; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; - T dX_val = c1[i] * dY[index] * gamma[j] + c2[i] * X[index]; - dX1[index] += dX_val; - dX2[index] += dX_val; + float const dX_val = + static_cast(c1[i]) * static_cast(dY[index]) * + static_cast(gamma[j]) + + static_cast(c2[i]) * static_cast(X[index]); + // dX1[index] += dX_val; + // dX2[index] += dX_val; + dX1[index] = static_cast(dX_val); + dX2[index] = static_cast(dX_val); } } @@ -452,12 +450,15 @@ void peft_bwd_kernel(ResidualRMSNormMeta const *m, continue; } - const int64_t M = bc->requestsInfo[i].num_tokens_in_batch; - const int64_t N = m->num_elements; + int M = m->batch_size; // TODO: replace with + // m->requestsInfo[i].num_tokens_in_batch; + int N = m->in_dim; + T const *residual_output_rms_input_ptr = static_cast(m->input_activation); + ComputeInternalGradientsCUDAKernel - <<>>( + <<>>( N, output_grad_ptr, residual_output_rms_input_ptr, @@ -466,14 +467,15 @@ void peft_bwd_kernel(ResidualRMSNormMeta const *m, static_cast(m->norm_ptr)); RMSNormBackwardCUDAKernel - <<>>(N, - output_grad_ptr, - residual_output_rms_input_ptr, - weight_ptr, - static_cast(m->rms_ptr), - static_cast(m->norm_ptr), - residual_input0_grad_ptr, - residual_input1_grad_ptr); + <<>>( + m->in_dim, + output_grad_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + residual_input0_grad_ptr, + residual_input1_grad_ptr); } } From b0e686d3273014cb2fec9f2eeea104a1c4c649fb Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 17 Nov 2023 22:18:20 +0000 Subject: [PATCH 092/198] undo use of CUBLAS_COMPUTE_32F_FAST_16F for now --- src/ops/inc_multihead_self_attention.cpp | 26 ++++---- src/ops/inc_multihead_self_attention.cu | 63 ++++++++++--------- src/ops/kernels/linear_kernels.cpp | 39 ++++++------ src/ops/kernels/linear_kernels.cu | 47 +++++++------- src/ops/kernels/lora_linear_kernels.cu | 42 +++++++------ src/ops/spec_inc_multihead_self_attention.cpp | 13 ++-- src/ops/spec_inc_multihead_self_attention.cu | 21 ++++--- src/ops/tree_inc_multihead_self_attention.cpp | 13 ++-- src/ops/tree_inc_multihead_self_attention.cu | 21 ++++--- tests/peft/hf_finetune.py | 43 +++++++++---- 10 files changed, 182 insertions(+), 146 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index 4495f66844..188659bea0 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -257,13 +257,14 @@ void 
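The rewritten residual RMS-norm backward kernels above accumulate their reductions in float and express the input gradient through two per-row coefficients: c1 is the 1/RMS value saved by the forward pass, and c2 = -sum(dY * X * gamma) * rrms^3 / N; the same dX is then written to both residual input gradients. A quick double-precision PyTorch check that this closed form matches autograd for a single row of plain RMS norm (sizes and eps are arbitrary):

import torch

torch.manual_seed(0)
N, eps = 8, 1e-6
x = torch.randn(N, dtype=torch.double, requires_grad=True)
gamma = torch.randn(N, dtype=torch.double)
dY = torch.randn(N, dtype=torch.double)       # upstream gradient

rrms = torch.rsqrt(x.pow(2).mean() + eps)     # 1 / RMS(x)
y = x * rrms * gamma
y.backward(dY)

c1 = rrms.detach()
c2 = -(dY * x.detach() * gamma).sum() * c1**3 / N
dX = c1 * dY * gamma + c2 * x.detach()
print(torch.allclose(x.grad, dX))             # True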
compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, DT alpha = 1.0f, beta = 0.0f; assert(m->qSize == m->vSize && m->qSize == m->kSize); hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) hipblasDatatype_t compute_type = hipblas_data_type; -#else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - hipblasDatatype_t compute_type = hipblas_data_type; -#endif +// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +// hipblasDatatype_t compute_type = hipblas_data_type; +// #else +// // TODO: currently use the hipblas_data_type +// // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +// hipblasDatatype_t compute_type = hipblas_data_type; +// #endif // Compute (W^T)x matmul: einsum(ijkl,im->jmkl) // Weights: qSize x qProjSize x 3 x num_q_heads // Input: qSize x num_tokens @@ -510,13 +511,14 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - hipblasDatatype_t compute_type = hipblas_data_type; -#else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; hipblasDatatype_t compute_type = hipblas_data_type; -#endif +// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +// hipblasDatatype_t compute_type = hipblas_data_type; +// #else +// // TODO: currently use the hipblas_data_type +// // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +// hipblasDatatype_t compute_type = hipblas_data_type; +// #endif // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_infr_tokens(); int tokens_previous_requests = 0; diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 92a1f37097..e597c7de97 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -238,17 +238,18 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, DT alpha = 1.0f, beta = 0.0f; assert(m->qSize == m->vSize && m->qSize == m->kSize); cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif +// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +// cudaDataType_t compute_type = cublas_data_type; +// #else +// // For best performance, set the default cublas compute type to +// // CUBLAS_COMPUTE_16F for half precision and to +// // CUBLAS_COMPUTE_32F_FAST_16F for full precision +// cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +// if (m->output_type[0] == DT_FLOAT) { +// compute_type = CUBLAS_COMPUTE_32F_FAST_16F; +// } +// #endif // Compute (W^T)x matmul: einsum(ijkl,im->jmkl) // Weights: qSize x qProjSize x 3 x num_q_heads // Input: qSize x num_tokens @@ -466,17 +467,18 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = 
ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif +// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +// cudaDataType_t compute_type = cublas_data_type; +// #else +// // For best performance, set the default cublas compute type to +// // CUBLAS_COMPUTE_16F for half precision and to +// // CUBLAS_COMPUTE_32F_FAST_16F for full precision +// cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +// if (m->output_type[0] == DT_FLOAT) { +// compute_type = CUBLAS_COMPUTE_32F_FAST_16F; +// } +// #endif for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; @@ -883,17 +885,18 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif +// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +// cudaDataType_t compute_type = cublas_data_type; +// #else +// // For best performance, set the default cublas compute type to +// // CUBLAS_COMPUTE_16F for half precision and to +// // CUBLAS_COMPUTE_32F_FAST_16F for full precision +// cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +// if (m->output_type[0] == DT_FLOAT) { +// compute_type = CUBLAS_COMPUTE_32F_FAST_16F; +// } +// #endif // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_tokens(); int tokens_previous_requests = 0; diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp index 2e7ae68314..4fa8ab244f 100644 --- a/src/ops/kernels/linear_kernels.cpp +++ b/src/ops/kernels/linear_kernels.cpp @@ -274,13 +274,14 @@ void forward_kernel(LinearMeta const *m, hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) hipblasDatatype_t compute_type = output_type; -#else - // TODO: currently use the output_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - hipblasDatatype_t compute_type = output_type; -#endif +// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +// hipblasDatatype_t compute_type = output_type; +// #else +// // TODO: currently use the output_type +// // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +// hipblasDatatype_t compute_type = output_type; +// #endif checkCUDA(hipblasGemmEx(m->handle.blas, HIPBLAS_OP_T, HIPBLAS_OP_N, 
@@ -370,13 +371,14 @@ void peft_bwd_kernel(LinearMeta const *m, hipDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); // update input_grad_ptr offset input_grad_ptr = static_cast
(input_grad_ptr) + num_infr_tokens; -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) hipblasDatatype_t compute_type = hipblas_data_type; -#else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - hipblasDatatype_t compute_type = output_type; -#endif +// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +// hipblasDatatype_t compute_type = hipblas_data_type; +// #else +// // TODO: currently use the hipblas_data_type +// // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +// hipblasDatatype_t compute_type = output_type; +// #endif int output_size = out_dim * num_peft_tokens; if (m->activation == AC_MODE_RELU) { relu_backward_kernel(m->output_type[0], @@ -440,13 +442,14 @@ void backward_kernel(LinearMeta const *m, hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - hipblasDatatype_t compute_type = output_type; -#else - // TODO: currently use output_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; hipblasDatatype_t compute_type = output_type; -#endif +// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +// hipblasDatatype_t compute_type = output_type; +// #else +// // TODO: currently use output_type +// // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +// hipblasDatatype_t compute_type = output_type; +// #endif int output_size = out_dim * batch_size; if (m->activation == AC_MODE_RELU) { relu_backward_kernel( diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 21629ec024..248e59bdeb 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -365,17 +365,18 @@ void forward_kernel(LinearMeta const *m, : ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); assert(input_type == weight_type && weight_type == output_type); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif + cudaDataType_t compute_type = output_type; +// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +// cudaDataType_t compute_type = cublas_data_type; +// #else +// // For best performance, set the default cublas compute type to +// // CUBLAS_COMPUTE_16F for half precision and to +// // CUBLAS_COMPUTE_32F_FAST_16F for full precision +// cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +// if (m->output_type[0] == DT_FLOAT) { +// compute_type = CUBLAS_COMPUTE_32F_FAST_16F; +// } +// #endif checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, @@ -538,17 +539,19 @@ void backward_kernel(LinearMeta const *m, cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for 
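The "undo use of CUBLAS_COMPUTE_32F_FAST_16F" patch above replaces the version-conditional CUBLAS_COMPUTE_16F / CUBLAS_COMPUTE_32F_FAST_16F selection with the plain output data type across the attention, linear, and LoRA kernels, so fp32 runs no longer take the fp16 tensor-core fast path; a plausible reason, given the surrounding alignment work, is to keep results closely comparable with the Hugging Face reference. PyTorch exposes an analogous reduced-precision matmul switch (TF32), and a sketch like the following shows the kind of numerical drift such fast paths introduce (requires a CUDA device; on GPUs without TF32 support the difference is simply zero):

import torch

assert torch.cuda.is_available(), "this illustration needs a CUDA device"
a = torch.randn(512, 512, device="cuda")
b = torch.randn(512, 512, device="cuda")

torch.backends.cuda.matmul.allow_tf32 = True    # reduced-precision fast path
fast = a @ b
torch.backends.cuda.matmul.allow_tf32 = False   # full fp32 accumulation
exact = a @ b
print((fast - exact).abs().max())               # often non-zero on Ampere or newer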
half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif + cudaDataType_t compute_type = output_type; +// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +// cudaDataType_t compute_type = cublas_data_type; +// #else +// // For best performance, set the default cublas compute type to +// // CUBLAS_COMPUTE_16F for half precision and to +// // CUBLAS_COMPUTE_32F_FAST_16F for full precision +// cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +// if (m->output_type[0] == DT_FLOAT) { +// compute_type = CUBLAS_COMPUTE_32F_FAST_16F; +// } +// #endif + int output_size = out_dim * batch_size; if (m->activation == AC_MODE_RELU) { relu_backward_kernel( diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 2d271efe72..85a5d9990f 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -145,17 +145,18 @@ void inference_kernel(LoraLinearMeta *m, cudaDataType_t lr_actv_type = output_type; assert(input_type == output_type); cudaDataType_t weight_type = output_type; -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = output_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->input_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif +// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +// cudaDataType_t compute_type = output_type; +// #else +// // For best performance, set the default cublas compute type to +// // CUBLAS_COMPUTE_16F for half precision and to +// // CUBLAS_COMPUTE_32F_FAST_16F for full precision +// cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +// if (m->input_type[0] == DT_FLOAT) { +// compute_type = CUBLAS_COMPUTE_32F_FAST_16F; +// } +// #endif int num_peft_requests = 0; for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { @@ -267,17 +268,18 @@ void peft_bwd_kernel(LoraLinearMeta *m, assert(input_type == output_type); cudaDataType_t weight_type = output_type; cudaDataType_t lr_actv_type = output_type; -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = output_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif +// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +// cudaDataType_t compute_type = output_type; +// #else +// // For best performance, set the default cublas compute type to +// // CUBLAS_COMPUTE_16F for half precision and to +// // CUBLAS_COMPUTE_32F_FAST_16F for full precision +// cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +// if (m->output_type[0] == DT_FLOAT) { +// compute_type = CUBLAS_COMPUTE_32F_FAST_16F; +// } +// #endif for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index 6252693d1a..d827a79c22 100644 --- 
a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -200,13 +200,14 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) hipblasDatatype_t compute_type = hipblas_data_type; -#else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - hipblasDatatype_t compute_type = hipblas_data_type; -#endif +// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +// hipblasDatatype_t compute_type = hipblas_data_type; +// #else +// // TODO: currently use the hipblas_data_type +// // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +// hipblasDatatype_t compute_type = hipblas_data_type; +// #endif // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_infr_tokens(); int tokens_previous_requests = 0; diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index fb96862b81..999492f7c3 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -215,17 +215,18 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif +// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +// cudaDataType_t compute_type = cublas_data_type; +// #else +// // For best performance, set the default cublas compute type to +// // CUBLAS_COMPUTE_16F for half precision and to +// // CUBLAS_COMPUTE_32F_FAST_16F for full precision +// cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +// if (m->output_type[0] == DT_FLOAT) { +// compute_type = CUBLAS_COMPUTE_32F_FAST_16F; +// } +// #endif // int num_requests = bc->num_active_requests(); // int tokens_previous_requests = 0; int tokens_prev_requests_squares = 0; diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 61117ce6df..d385880a74 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -157,13 +157,14 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) hipblasDatatype_t compute_type = hipblas_data_type; -#else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - hipblasDatatype_t compute_type = hipblas_data_type; -#endif +// #if defined(CUDA_VERSION) && 
(CUDA_VERSION < 11000) +// hipblasDatatype_t compute_type = hipblas_data_type; +// #else +// // TODO: currently use the hipblas_data_type +// // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +// hipblasDatatype_t compute_type = hipblas_data_type; +// #endif // int num_requests = bc->num_active_requests(); int processed_tokens_in_batch = 0; // int qkv_block_size = diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 8c2ee24132..fc3d1fda72 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -158,17 +158,18 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif +// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +// cudaDataType_t compute_type = cublas_data_type; +// #else +// // For best performance, set the default cublas compute type to +// // CUBLAS_COMPUTE_16F for half precision and to +// // CUBLAS_COMPUTE_32F_FAST_16F for full precision +// cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; +// if (m->output_type[0] == DT_FLOAT) { +// compute_type = CUBLAS_COMPUTE_32F_FAST_16F; +// } +// #endif // int num_requests = bc->num_active_requests(); int processed_tokens_in_batch = 0; // int qkv_block_size = diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py index 6dcb692f76..5650eff3e9 100644 --- a/tests/peft/hf_finetune.py +++ b/tests/peft/hf_finetune.py @@ -54,26 +54,41 @@ def convert_hf_weight_name(name): .replace("default_", "") ) +def pre_peft_backward_hook(module, grad_output): + assert (len(grad_output) == 1) + assert ("lm_head" in module.name) + name = module.name.replace("base_model.model.model.", "") + print(f"PRE-Backward Hook activated for module: {name}, bwd step: {module.bwd_step}") + print(grad_output[0].shape) + dev = grad_output[0].device + new_grad_output = torch.full(grad_output[0].shape, 0.5).to(dev) + assert(new_grad_output.shape == grad_output[0].shape) + return (new_grad_output,) + def peft_backward_hook(module, grad_input, grad_output): if len(grad_input) == 0 or len(grad_output) == 0: return assert(module.name is not None and module.bwd_step is not None) name = module.name.replace("base_model.model.model.", "") print(f"Backward Hook activated for module: {name}, bwd step: {module.bwd_step}") - print("Backward GRAD Input:") - for i,gi in enumerate(grad_input): - if type(gi) == torch.Tensor: - print(gi.shape) - torch.save(gi, f"./hf_peft_tensors/bwd_step_{module.bwd_step}_{name}.gi_{i}") - else: - print(gi) print("Backward GRAD Output:") for i, go in enumerate(grad_output): if type(go) == torch.Tensor: - print(go.shape) - torch.save(go, f"./hf_peft_tensors/bwd_step_{module.bwd_step}_{name}.go_{i}") + dst_filepath = f"./hf_peft_tensors/bwd_step_{module.bwd_step}_{name}.go_{i}" + print("\t", go.shape) + print(f"\t\tSaving to {dst_filepath}") + torch.save(go, dst_filepath) else: 
print(go) + print("Backward GRAD Input:") + for i,gi in enumerate(grad_input): + if type(gi) == torch.Tensor: + dst_filepath = f"./hf_peft_tensors/bwd_step_{module.bwd_step}_{name}.gi_{i}" + print("\t", gi.shape) + print(f"\t\tSaving to {dst_filepath}") + torch.save(gi, dst_filepath) + else: + print(gi) print("===") module.bwd_step += 1 @@ -106,7 +121,7 @@ def main(): parser = argparse.ArgumentParser() parser.add_argument("--model-name", type=str, default="meta-llama/Llama-2-7b-hf") parser.add_argument("--lora-rank", type=int, default=16) - parser.add_argument("--lora-alpha", type=int, default=32) + parser.add_argument("--lora-alpha", type=int, default=16) parser.add_argument("--lora-target-modules", type=str, default="down_proj", help="Comma-separated list of layers from the base model to target") parser.add_argument("--lora-dropout", type=float, default=0.05) parser.add_argument("--use-full-precision", action="store_true", help="Use full precision") @@ -149,7 +164,8 @@ def main(): tokenizer.pad_token = "[PAD]" tokenizer.padding_side = "left" - peft_model_name = "goliaro/llama-2-7b-lora-full" + #peft_model_name = "goliaro/llama-2-7b-lora-full" + peft_model_name = "goliaro/llama-160m-lora-full" model = PeftModel.from_pretrained(model, peft_model_name) for param in model.parameters(): @@ -191,6 +207,9 @@ def main(): print(f"Adding hooks to layer {layer.name}") layer.register_forward_hook(peft_forward_hook) layer.register_full_backward_hook(peft_backward_hook) + # base_model.model.base_model.model.lm_head + if "lm_head" in name: + layer.register_full_backward_pre_hook(pre_peft_backward_hook) # Save weights for name, params in model.named_parameters(): if "lora" in name: @@ -201,7 +220,7 @@ def main(): ff_w_name = convert_hf_weight_name(name) print(f"{dst_folder}/{ff_w_name}") params.detach().cpu().numpy().tofile(f"{dst_folder}/{ff_w_name}") - if "lm_head" in name: + if "lm_head" in name or "norm" in name: torch.save(params, f"./hf_peft_tensors/{name}") data = load_dataset("/home/ubuntu/english_quotes") From 0daf2329303402c66a0f2967879a34738aeb5b30 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 19 Nov 2023 20:15:04 +0000 Subject: [PATCH 093/198] only send dataset entry once --- inference/incr_decoding/incr_decoding.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index c76637a62c..5375acb355 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -286,7 +286,7 @@ void FlexFlow::top_level_task(Task const *task, printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); total_num_requests++; //prompts.push_back(text); - dataset.push_back(std::make_pair(text, text)); + dataset.push_back(std::make_pair(text, "")); } rm->register_new_peft_request( dataset, 256 /*max_sequence_length*/, peft_model_id); From ec131c71d5bbe37eaeb464efd2c2b0a52ed7f7c8 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 20 Nov 2023 13:15:45 -0500 Subject: [PATCH 094/198] update peft test scripts --- tests/peft/fine_tune.sh | 20 ++-- tests/peft/hf_finetune.py | 236 +++++++++++++++++++++----------------- tests/peft/hf_serve.py | 45 ++++++-- tests/peft/hf_train.py | 161 ++++++++++++++++++++++++++ 4 files changed, 334 insertions(+), 128 deletions(-) create mode 100644 tests/peft/hf_train.py diff --git a/tests/peft/fine_tune.sh b/tests/peft/fine_tune.sh index eddb6139d0..309d87130a 100755 --- a/tests/peft/fine_tune.sh +++ b/tests/peft/fine_tune.sh @@ -5,15 
+5,15 @@ set -x # Cd into directory holding this script cd "${BASH_SOURCE[0]%/*}" -python hf_finetune.py --model-name decapoda-research/llama-7b-hf --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-7b-lora-full -python hf_finetune.py --model-name decapoda-research/llama-7b-hf --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-7b-lora-half -python hf_finetune.py --model-name JackFram/llama-160m --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-160m-lora-full -python hf_finetune.py --model-name JackFram/llama-160m --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-160m-lora-half +python hf_train.py --model-name decapoda-research/llama-7b-hf --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-7b-lora-full +python hf_train.py --model-name decapoda-research/llama-7b-hf --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-7b-lora-half +python hf_train.py --model-name JackFram/llama-160m --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-160m-lora-full +python hf_train.py --model-name JackFram/llama-160m --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-160m-lora-half -python hf_finetune.py --model-name meta-llama/Llama-2-7b-hf --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-2-7b-lora-full -python hf_finetune.py --model-name meta-llama/Llama-2-7b-hf --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-2-7b-lora-half +python hf_train.py --model-name meta-llama/Llama-2-7b-hf --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-2-7b-lora-full +python hf_train.py --model-name meta-llama/Llama-2-7b-hf --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-2-7b-lora-half -python hf_finetune.py --model-name facebook/opt-6.7b --lora-target-modules fc2 --use-full-precision --publish-peft-with-id goliaro/opt-6.7b-lora-full -python hf_finetune.py --model-name facebook/opt-6.7b --lora-target-modules fc2 --publish-peft-with-id goliaro/opt-6.7b-lora-half -python hf_finetune.py --model-name facebook/opt-125m --lora-target-modules fc2 --use-full-precision --publish-peft-with-id goliaro/opt-125m-lora-full -python hf_finetune.py --model-name facebook/opt-125m --lora-target-modules fc2 --publish-peft-with-id goliaro/opt-125m-lora-half +python hf_train.py --model-name facebook/opt-6.7b --lora-target-modules fc2 --use-full-precision --publish-peft-with-id goliaro/opt-6.7b-lora-full +python hf_train.py --model-name facebook/opt-6.7b --lora-target-modules fc2 --publish-peft-with-id goliaro/opt-6.7b-lora-half +python hf_train.py --model-name facebook/opt-125m --lora-target-modules fc2 --use-full-precision --publish-peft-with-id goliaro/opt-125m-lora-full +python hf_train.py --model-name facebook/opt-125m --lora-target-modules fc2 --publish-peft-with-id goliaro/opt-125m-lora-half diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py index 5650eff3e9..7836633b30 100644 --- a/tests/peft/hf_finetune.py +++ b/tests/peft/hf_finetune.py @@ -1,24 +1,34 @@ import os, sys, shutil import torch + # Reproducibility import random import numpy as np + torch.manual_seed(0) random.seed(0) np.random.seed(0) -#torch.use_deterministic_algorithms(True) +# torch.use_deterministic_algorithms(True) import torch.nn as nn -#import bitsandbytes as bnb + +# import bitsandbytes as bnb from transformers 
import AutoTokenizer, AutoConfig, AutoModelForCausalLM, LlamaTokenizer import argparse -from peft import LoraConfig, get_peft_model, PeftModel +from peft import LoraConfig, get_peft_model, PeftModel, PeftConfig import transformers -from datasets import load_dataset + +if transformers.__version__ < "4.31.0": + raise RuntimeError( + "Please update the transformers library version to 4.31.0 or above" + ) +from datasets import load_dataset, DatasetDict + class CastOutputToFloat(nn.Sequential): - def forward(self, x): + def forward(self, x): return super().forward(x).to(torch.float32) + def print_trainable_parameters(model): """ Prints the number of trainable parameters in the model. @@ -33,42 +43,26 @@ def print_trainable_parameters(model): f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" ) -def convert_hf_weight_name(name): - return ( - name.replace(".", "_") - .replace("self_attn", "attention") - .replace("q_proj", "wq") - .replace("k_proj", "wk") - .replace("v_proj", "wv") - .replace("o_proj", "wo") - .replace("mlp", "feed_forward") - .replace("gate_proj", "w1") - .replace("down_proj", "w2") - .replace("up_proj", "w3") - .replace("input_layernorm", "attention_norm") - .replace("post_attention_layernorm", "ffn_norm") - .replace("embed_tokens", "tok_embeddings") - .replace("lm_head", "output") - .replace("model_", "") - .replace("base_", "") - .replace("default_", "") - ) -def pre_peft_backward_hook(module, grad_output): - assert (len(grad_output) == 1) - assert ("lm_head" in module.name) +def lm_head_pre_backward_hook(module, grad_output): + # Fill grad input tensor with 0.5 to align other layers without having to align loss + assert len(grad_output) == 1 + assert "lm_head" in module.name name = module.name.replace("base_model.model.model.", "") - print(f"PRE-Backward Hook activated for module: {name}, bwd step: {module.bwd_step}") + print( + f"PRE-Backward Hook activated for module: {name}, bwd step: {module.bwd_step}" + ) print(grad_output[0].shape) dev = grad_output[0].device new_grad_output = torch.full(grad_output[0].shape, 0.5).to(dev) - assert(new_grad_output.shape == grad_output[0].shape) + assert new_grad_output.shape == grad_output[0].shape return (new_grad_output,) + def peft_backward_hook(module, grad_input, grad_output): if len(grad_input) == 0 or len(grad_output) == 0: return - assert(module.name is not None and module.bwd_step is not None) + assert module.name is not None and module.bwd_step is not None name = module.name.replace("base_model.model.model.", "") print(f"Backward Hook activated for module: {name}, bwd step: {module.bwd_step}") print("Backward GRAD Output:") @@ -81,7 +75,7 @@ def peft_backward_hook(module, grad_input, grad_output): else: print(go) print("Backward GRAD Input:") - for i,gi in enumerate(grad_input): + for i, gi in enumerate(grad_input): if type(gi) == torch.Tensor: dst_filepath = f"./hf_peft_tensors/bwd_step_{module.bwd_step}_{name}.gi_{i}" print("\t", gi.shape) @@ -89,110 +83,125 @@ def peft_backward_hook(module, grad_input, grad_output): torch.save(gi, dst_filepath) else: print(gi) - + print("===") module.bwd_step += 1 + def peft_forward_hook(module, input, output): if len(input) == 0 or len(output) == 0: return - assert(module.name is not None and module.fwd_step is not None) + assert module.name is not None and module.fwd_step is not None name = module.name.replace("base_model.model.model.", "") print(f"Forward Hook activated for module: {name}, fwd step: {module.fwd_step}") 
print("Input:") - for i,inp in enumerate(input): + for i, inp in enumerate(input): if type(inp) == torch.Tensor: print(inp.shape) - torch.save(inp, f"./hf_peft_tensors/fwd_step_{module.fwd_step}_{name}.input_{i}") + torch.save( + inp, f"./hf_peft_tensors/fwd_step_{module.fwd_step}_{name}.input_{i}" + ) else: print(inp) print("Output:") for i, out in enumerate(output): if type(out) == torch.Tensor: print(out.shape) - torch.save(out, f"./hf_peft_tensors/fwd_step_{module.fwd_step}_{name}.output_{i}") + torch.save( + out, f"./hf_peft_tensors/fwd_step_{module.fwd_step}_{name}.output_{i}" + ) else: print(out) - #print("Forward Input/Output: ", input[0].shape, output[0].shape) + # print("Forward Input/Output: ", input[0].shape, output[0].shape) print("===") module.fwd_step += 1 + def main(): parser = argparse.ArgumentParser() - parser.add_argument("--model-name", type=str, default="meta-llama/Llama-2-7b-hf") - parser.add_argument("--lora-rank", type=int, default=16) + parser.add_argument( + "--peft-model-id", type=str, default="goliaro/llama-160m-lora-full" + ) parser.add_argument("--lora-alpha", type=int, default=16) - parser.add_argument("--lora-target-modules", type=str, default="down_proj", help="Comma-separated list of layers from the base model to target") - parser.add_argument("--lora-dropout", type=float, default=0.05) - parser.add_argument("--use-full-precision", action="store_true", help="Use full precision") + parser.add_argument("--lora-dropout", type=float, default=0.0) + parser.add_argument( + "--use-full-precision", action="store_true", help="Use full precision" + ) parser.add_argument("--output-dir", type=str, default="") parser.add_argument("--publish-peft-with-id", type=str, default="") - parser.add_argument("--save-peft-tensors", action="store_true", help="Save PEFT hidden states and weights to file") + parser.add_argument( + "--save-peft-tensors", + action="store_true", + help="Save PEFT hidden states and weights to file", + ) args = parser.parse_args() - model_name = args.model_name - use_full_precision=args.use_full_precision - lora_rank = args.lora_rank + peft_model_id = args.peft_model_id + use_full_precision = args.use_full_precision lora_alpha = args.lora_alpha - lora_target_modules = args.lora_target_modules.split(",") lora_dropout = args.lora_dropout output_dir = args.output_dir publish_peft_with_id = args.publish_peft_with_id save_peft_tensors = args.save_peft_tensors - # if len(output_dir) == 0 and len(publish_peft_with_id) == 0: - # raise ValueError("Please pass either a --output-dir or a --publish-peft-with-id to specify where to store the fine-tuned model") # Change working dir to folder storing this script abspath = os.path.abspath(__file__) dname = os.path.dirname(abspath) os.chdir(dname) + # Get PEFT layer, edit any configs as needed + peft_config = PeftConfig.from_pretrained(peft_model_id) + if peft_config.peft_type != "LORA": + raise ValueError(f"PEFT type {peft_config.peft_type} not supported yet") + peft_config.lora_alpha = lora_alpha + peft_config.lora_dropout = lora_dropout + peft_config.init_lora_weights = ( + False + ) # prevent HF from re-inizialing the weights randomly + model_name = peft_config.base_model_name_or_path + # Load base model, and apply the PEFT layer model = AutoModelForCausalLM.from_pretrained( model_name, - #load_in_8bit=True, - torch_dtype = torch.float32 if use_full_precision else torch.float16, - device_map='auto', + torch_dtype=torch.float32 if use_full_precision else torch.float16, + device_map="auto", ) + model = 
PeftModel.from_pretrained(model, peft_model_id, config=peft_config) # Get Tokenizer hf_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) hf_arch = getattr(hf_config, "architectures")[0] if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM": - tokenizer = LlamaTokenizer.from_pretrained(model_name, use_fast=True, torch_dtype = torch.float32 if use_full_precision else torch.float16,) + tokenizer = LlamaTokenizer.from_pretrained( + model_name, + use_fast=True, + torch_dtype=torch.float32 if use_full_precision else torch.float16, + ) else: - tokenizer = AutoTokenizer.from_pretrained(model_name, torch_dtype = torch.float32 if use_full_precision else torch.float16,) + tokenizer = AutoTokenizer.from_pretrained( + model_name, + torch_dtype=torch.float32 if use_full_precision else torch.float16, + ) if tokenizer.pad_token is None: tokenizer.pad_token = "[PAD]" tokenizer.padding_side = "left" - - #peft_model_name = "goliaro/llama-2-7b-lora-full" - peft_model_name = "goliaro/llama-160m-lora-full" - model = PeftModel.from_pretrained(model, peft_model_name) - - for param in model.parameters(): - param.requires_grad = False # freeze the model - train adapters later + + # Freeze all layers except the LORA ones. Cast small layers to full precision for stability + for name, param in model.named_parameters(): + if "lora" not in name: + param.requires_grad = False # freeze the model - train adapters later + else: + param.requires_grad = True if param.ndim == 1: # cast the small parameters (e.g. layernorm) to fp32 for stability param.data = param.data.to(torch.float32) - - #model.gradient_checkpointing_enable() # reduce number of stored activations + if not save_peft_tensors: + model.gradient_checkpointing_enable() # reduce number of stored activations model.enable_input_require_grads() model.lm_head = CastOutputToFloat(model.lm_head) - config = LoraConfig( - r=lora_rank, - lora_alpha=lora_alpha, - #target_modules=["q_proj", "v_proj"], - #target_modules=["down_proj"], - target_modules=lora_target_modules, - lora_dropout=0.0, - bias="none", - task_type="CAUSAL_LM" - ) - model = get_peft_model(model, config) - + # Print model with PEFT print(model) - print(model.named_parameters()) - #model = get_peft_model(model, config) + for name, params in model.named_parameters(): + print(name) print_trainable_parameters(model) if save_peft_tensors: @@ -207,28 +216,34 @@ def main(): print(f"Adding hooks to layer {layer.name}") layer.register_forward_hook(peft_forward_hook) layer.register_full_backward_hook(peft_backward_hook) - # base_model.model.base_model.model.lm_head + # TODO: remove hard-coding of lm head grad input after aligning the loss if "lm_head" in name: - layer.register_full_backward_pre_hook(pre_peft_backward_hook) - # Save weights + layer.register_full_backward_pre_hook(lm_head_pre_backward_hook) + # Save any weights of interest for name, params in model.named_parameters(): if "lora" in name: torch.save(params, f"./hf_peft_tensors/{name}") - # Overwrite FF cached weight - dst_folder = f"/home/ubuntu/.cache/flexflow/weights/{peft_model_name}/full-precision" - assert(os.path.exists(dst_folder)) - ff_w_name = convert_hf_weight_name(name) - print(f"{dst_folder}/{ff_w_name}") - params.detach().cpu().numpy().tofile(f"{dst_folder}/{ff_w_name}") if "lm_head" in name or "norm" in name: torch.save(params, f"./hf_peft_tensors/{name}") - data = load_dataset("/home/ubuntu/english_quotes") - data = data.map(lambda samples: tokenizer(samples['quote']), batched=True) + # Load fine-tuning 
dataset + data = load_dataset("Abirate/english_quotes") + + # TODO: remove using of a single row + key_to_filter = "quote" + desired_value = "“Two things are infinite: the universe and human stupidity; and I'm not sure about the universe.”" + filtered_dataset_dict = DatasetDict() + for split, dataset in data.items(): + filtered_dataset = dataset.filter( + lambda example: example[key_to_filter] == desired_value + ) + filtered_dataset_dict[split] = filtered_dataset + data = filtered_dataset_dict + data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True) trainer = transformers.Trainer( model=model, - train_dataset=data['train'], + train_dataset=data["train"], args=transformers.TrainingArguments( per_device_train_batch_size=1, gradient_accumulation_steps=1, @@ -237,26 +252,33 @@ def main(): learning_rate=2e-4, fp16=True if not use_full_precision else False, logging_steps=1, - output_dir=os.path.join(output_dir if len(output_dir) > 0 else "./", "lora_training_logs"), + output_dir=os.path.join( + output_dir if len(output_dir) > 0 else "./", "lora_training_logs" + ), + ), + data_collator=transformers.DataCollatorForLanguageModeling( + tokenizer, mlm=False ), - data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False) ) - model.config.use_cache = False # silence the warnings. Please re-enable for inference! - - for batch in trainer.get_train_dataloader(): - print("First batch: ") - print(batch) - break - + model.config.use_cache = ( + False + ) # silence the warnings. Please re-enable for inference! + + # for batch in trainer.get_train_dataloader(): + # print("First batch: ") + # print(batch) + # break + trainer.train() - # if len(output_dir) > 0: - # print(f"Done fine-tuning! Saving the model to {output_dir}...") - # model.save_pretrained(output_dir) - - # if len(publish_peft_with_id) > 0: - # print(f"Done fine-tuning! 
Uploading the model to HF hub with id: {publish_peft_with_id}...") - # model.push_to_hub(publish_peft_with_id, use_auth_token=True) + if len(output_dir) > 0: + print(f"Saving the model to {output_dir}...") + model.save_pretrained(output_dir) + + if len(publish_peft_with_id) > 0: + print(f"Uploading the model to HF hub with id: {publish_peft_with_id}...") + model.push_to_hub(publish_peft_with_id, use_auth_token=True) + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/tests/peft/hf_serve.py b/tests/peft/hf_serve.py index 6e143550c8..ad1f903cfb 100644 --- a/tests/peft/hf_serve.py +++ b/tests/peft/hf_serve.py @@ -10,20 +10,30 @@ GenerationConfig, ) + def peft_pre_forward_hook(module, input): - assert(module.name is not None and module.decoding_step is not None) + assert module.name is not None and module.decoding_step is not None name = module.name.replace("base_model.model.model.", "") - print(f"Pre-forward hook activated on module: {name}, decoding step: {module.decoding_step}") + print( + f"Pre-forward hook activated on module: {name}, decoding step: {module.decoding_step}" + ) print("Pre-Input: ", input[0].shape) - torch.save(input, f"./hf_peft_tensors/decoding_step_{module.decoding_step}_{name}.input") - #print("===") + torch.save( + input, f"./hf_peft_tensors/decoding_step_{module.decoding_step}_{name}.input" + ) + # print("===") + def peft_post_forward_hook(module, input, output): - assert(module.name is not None and module.decoding_step is not None) + assert module.name is not None and module.decoding_step is not None name = module.name.replace("base_model.model.model.", "") - print(f"Post-forward Hook activated for module: {name}, decoding step: {module.decoding_step}") + print( + f"Post-forward Hook activated for module: {name}, decoding step: {module.decoding_step}" + ) print("Post-Input/Output: ", input[0].shape, output[0].shape) - torch.save(output, f"./hf_peft_tensors/decoding_step_{module.decoding_step}_{name}.output") + torch.save( + output, f"./hf_peft_tensors/decoding_step_{module.decoding_step}_{name}.output" + ) print("===") module.decoding_step += 1 @@ -36,7 +46,11 @@ def main(): ) parser.add_argument("--max-new-tokens", type=int, default=50) parser.add_argument("--do-sample", action="store_true", help="Use sampling") - parser.add_argument("--save-peft-tensors", action="store_true", help="Save PEFT hidden states and weights to file") + parser.add_argument( + "--save-peft-tensors", + action="store_true", + help="Save PEFT hidden states and weights to file", + ) args = parser.parse_args() peft_model_id = args.peft_model_id use_full_precision = args.use_full_precision @@ -76,7 +90,17 @@ def main(): generation_config.do_sample = args.do_sample # Load the Lora model model = PeftModel.from_pretrained(model, peft_model_id) - + + print(model) + for name, params in model.named_parameters(): + print(name) + if ( + name + == "base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight" + ): + print(params) + assert False + # Register hooks to save tensors, if needed if save_peft_tensors: shutil.rmtree("./hf_peft_tensors") @@ -86,7 +110,7 @@ def main(): for name, params in model.named_parameters(): if "lora" in name: torch.save(params, f"./hf_peft_tensors/{name}") - #params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") + # params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") # Save hidden states for name, layer in dict(model.named_modules()).items(): if "lora_A.default" in name or "lora_B.default" in name: @@ -96,7 
+120,6 @@ def main(): layer.register_forward_pre_hook(peft_pre_forward_hook) layer.register_forward_hook(peft_post_forward_hook) - batch = tokenizer("Two things are infinite: ", return_tensors="pt") with torch.cuda.amp.autocast(): output_tokens = model.generate( diff --git a/tests/peft/hf_train.py b/tests/peft/hf_train.py new file mode 100644 index 0000000000..707fc9d0ae --- /dev/null +++ b/tests/peft/hf_train.py @@ -0,0 +1,161 @@ +import os, sys + +# os.environ["CUDA_VISIBLE_DEVICES"]="0" +import torch +import torch.nn as nn + +# import bitsandbytes as bnb +from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, LlamaTokenizer +import argparse +from peft import LoraConfig, get_peft_model +import transformers +from datasets import load_dataset + + +class CastOutputToFloat(nn.Sequential): + def forward(self, x): + return super().forward(x).to(torch.float32) + + +def print_trainable_parameters(model): + """ + Prints the number of trainable parameters in the model. + """ + trainable_params = 0 + all_param = 0 + for _, param in model.named_parameters(): + all_param += param.numel() + if param.requires_grad: + trainable_params += param.numel() + print( + f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" + ) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--model-name", type=str, default="meta-llama/Llama-2-7b-hf") + parser.add_argument("--lora-rank", type=int, default=16) + parser.add_argument("--lora-alpha", type=int, default=32) + parser.add_argument( + "--lora-target-modules", + type=str, + default="down_proj", + help="Comma-separated list of layers from the base model to target", + ) + parser.add_argument("--lora-dropout", type=float, default=0.05) + parser.add_argument( + "--use-full-precision", action="store_true", help="Use full precision" + ) + parser.add_argument("--output-dir", type=str, default="") + parser.add_argument("--publish-peft-with-id", type=str, default="") + args = parser.parse_args() + model_name = args.model_name + use_full_precision = args.use_full_precision + lora_rank = args.lora_rank + lora_alpha = args.lora_alpha + lora_target_modules = args.lora_target_modules.split(",") + lora_dropout = args.lora_dropout + output_dir = args.output_dir + publish_peft_with_id = args.publish_peft_with_id + if len(output_dir) == 0 and len(publish_peft_with_id) == 0: + raise ValueError( + "Please pass either a --output-dir or a --publish-peft-with-id to specify where to store the trained model" + ) + + # Change working dir to folder storing this script + abspath = os.path.abspath(__file__) + dname = os.path.dirname(abspath) + os.chdir(dname) + + model = AutoModelForCausalLM.from_pretrained( + model_name, + # load_in_8bit=True, + torch_dtype=torch.float32 if use_full_precision else torch.float16, + device_map="auto", + ) + + # Get Tokenizer + hf_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) + hf_arch = getattr(hf_config, "architectures")[0] + if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM": + tokenizer = LlamaTokenizer.from_pretrained( + model_name, + use_fast=True, + torch_dtype=torch.float32 if use_full_precision else torch.float16, + ) + else: + tokenizer = AutoTokenizer.from_pretrained( + model_name, + torch_dtype=torch.float32 if use_full_precision else torch.float16, + ) + if tokenizer.pad_token is None: + tokenizer.pad_token = "[PAD]" + tokenizer.padding_side = "left" + + for param in model.parameters(): + 
param.requires_grad = False # freeze the model - train adapters later + if param.ndim == 1: + # cast the small parameters (e.g. layernorm) to fp32 for stability + param.data = param.data.to(torch.float32) + + model.gradient_checkpointing_enable() # reduce number of stored activations + model.enable_input_require_grads() + + model.lm_head = CastOutputToFloat(model.lm_head) + + config = LoraConfig( + r=lora_rank, + lora_alpha=lora_alpha, + # target_modules=["q_proj", "v_proj"], + # target_modules=["down_proj"], + target_modules=lora_target_modules, + lora_dropout=lora_dropout, + bias="none", + task_type="CAUSAL_LM", + ) + print(model) + print(model.named_parameters()) + model = get_peft_model(model, config) + print_trainable_parameters(model) + + data = load_dataset("Abirate/english_quotes") + data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True) + + trainer = transformers.Trainer( + model=model, + train_dataset=data["train"], + args=transformers.TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=100, + max_steps=200, + learning_rate=2e-4, + fp16=True if not use_full_precision else False, + logging_steps=1, + output_dir=os.path.join( + output_dir if len(output_dir) > 0 else "./", "lora_training_logs" + ), + ), + data_collator=transformers.DataCollatorForLanguageModeling( + tokenizer, mlm=False + ), + ) + model.config.use_cache = ( + False + ) # silence the warnings. Please re-enable for inference! + trainer.train() + + if len(output_dir) > 0: + print(f"Done training! Saving the model to {output_dir}...") + model.save_pretrained(output_dir) + + if len(publish_peft_with_id) > 0: + print( + f"Done training! Uploading the model to HF hub with id: {publish_peft_with_id}..." + ) + model.push_to_hub(publish_peft_with_id, use_auth_token=True) + + +if __name__ == "__main__": + main() From 0431c739970a5ebda5bc592f3b8b62eb5ee141e6 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Mon, 20 Nov 2023 20:43:58 +0000 Subject: [PATCH 095/198] loss --- include/flexflow/batch_config.h | 3 ++- include/flexflow/utils/cuda_helper.h | 3 ++- inference/incr_decoding/incr_decoding.cc | 2 +- src/loss_functions/loss_functions.cu | 8 ++++---- src/ops/kernels/softmax.cu | 21 ++++++++++++++++++++- src/runtime/cuda_helper.cu | 10 +++++++++- src/runtime/request_manager.cc | 13 ++++++------- 7 files changed, 44 insertions(+), 16 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 8ddcec7d53..492502ac50 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -61,7 +61,7 @@ class BatchConfig { static int const MAX_NUM_TOKENS = 1024; // Set by update - int num_tokens = 0, num_peft_tokens = 0; + int num_tokens = 0, num_peft_tokens = 0, num_peft_label_tokens = 0; struct PerRequestInfo { PerRequestInfo() { @@ -89,6 +89,7 @@ class BatchConfig { }; PerRequestInfo requestsInfo[MAX_NUM_REQUESTS]; PerTokenInfo tokensInfo[MAX_NUM_TOKENS]; + PerTokenInfo labelsInfo[MAX_NUM_TOKENS]; bool request_completed[MAX_NUM_REQUESTS]; bool request_running[MAX_NUM_REQUESTS]; diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index 999bc27634..f6a393a9ff 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ -75,8 +75,9 @@ inline int GET_BLOCKS(int const N) { return (ret > BLOCK_SIZE_LIMIT) ? 
BLOCK_SIZE_LIMIT : ret; } +template __global__ void - scale_kernel(float *ptr, Legion::coord_t size, float a, float b); + scale_kernel(DT *ptr, Legion::coord_t size, DT a, DT b); __global__ void ones_kernel(float *ptr, Legion::coord_t size); diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 5375acb355..c76637a62c 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -286,7 +286,7 @@ void FlexFlow::top_level_task(Task const *task, printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); total_num_requests++; //prompts.push_back(text); - dataset.push_back(std::make_pair(text, "")); + dataset.push_back(std::make_pair(text, text)); } rm->register_new_peft_request( dataset, 256 /*max_sequence_length*/, peft_model_id); diff --git a/src/loss_functions/loss_functions.cu b/src/loss_functions/loss_functions.cu index f78311980c..636ef9c4c3 100644 --- a/src/loss_functions/loss_functions.cu +++ b/src/loss_functions/loss_functions.cu @@ -81,7 +81,7 @@ void Loss::sparse_categorical_crossentropy_loss_backward_kernel_wrapper( logit_grad_ptr, label_ptr, num_samples, num_classes, k); // Scale logit gradients by op->scale_factor scale_kernel<<>>( - logit_grad_ptr, logit_grad_volume, 0, scale_factor * k); + logit_grad_ptr, logit_grad_volume, 0.0f, scale_factor * k); } void Loss::categorical_crossentropy_loss_backward_kernel_wrapper( @@ -100,7 +100,7 @@ void Loss::categorical_crossentropy_loss_backward_kernel_wrapper( logit_grad_ptr, logit_ptr, label_ptr, logit_volume); // Scale logit gradients by loss->scale_factor scale_kernel<<>>( - logit_grad_ptr, logit_grad_volume, 0, scale_factor); + logit_grad_ptr, logit_grad_volume, 0.0f, scale_factor); } void Loss::mean_squared_error_avg_loss_backward_kernel_wrapper( @@ -119,7 +119,7 @@ void Loss::mean_squared_error_avg_loss_backward_kernel_wrapper( logit_grad_ptr, logit_ptr, label_ptr, logit_volume); // Scale logit gradients by loss->scale_factor scale_kernel<<>>( - logit_grad_ptr, logit_grad_volume, 0, scale_factor); + logit_grad_ptr, logit_grad_volume, 0.0f, scale_factor); } void Loss::identity_loss_backward_kernel_wrapper(float *loss_grad_ptr, @@ -135,7 +135,7 @@ void Loss::identity_loss_backward_kernel_wrapper(float *loss_grad_ptr, stream>>>(loss_grad_ptr, loss_ptr, loss_volume); // Scale logit gradients by loss->scale_factor scale_kernel<<>>( - loss_grad_ptr, loss_grad_volume, 0, scale_factor); + loss_grad_ptr, loss_grad_volume, 0.0f, scale_factor); } }; // namespace FlexFlow diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index 96d50e1ca4..0fc827319d 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -306,8 +306,18 @@ void peft_bwd_kernel(SoftmaxMeta const *m, } int num_bwd_tokens = bc->requestsInfo[i].num_tokens_in_batch; for (int j = 0; j < num_bwd_tokens; j++) { - token_ids[j] = bc->tokensInfo[j + tokens_previous_requests].token_id; + token_ids[j] = bc->labelsInfo[j + tokens_previous_requests].token_id; } + + DT scale_factor = 1.0 / (bc->requestsInfo[i].num_tokens_in_batch - 1); + // ignore last token + checkCUDA(cudaMemsetAsync( + input_grad_ptr + (tokens_previous_requests + + bc->requestsInfo[i].num_tokens_in_batch - 1) * + num_classes, + 0, + num_classes * sizeof(DT), + stream)); checkCUDA(cudaMemcpyAsync(m->handle.workSpace, token_ids, sizeof(BatchConfig::TokenId) * num_bwd_tokens, @@ -323,6 +333,15 @@ void peft_bwd_kernel(SoftmaxMeta const *m, static_cast(m->handle.workSpace), num_bwd_tokens, 
num_classes); + // scale + scale_kernel<<>>(input_grad_ptr + + tokens_previous_requests * num_classes, + num_bwd_tokens * num_classes, + DT(0.0), + scale_factor); tokens_previous_requests += num_bwd_tokens; } diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index 0de6d9bc63..935404ad42 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -36,7 +36,8 @@ cudaError_t get_legion_stream(cudaStream_t *stream) { using FlexFlow::get_legion_stream; -__global__ void scale_kernel(float *ptr, coord_t size, float a, float b) { +template +__global__ void scale_kernel(DT *ptr, coord_t size, DT a, DT b) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = (b - a) * ptr[i] + a; } @@ -644,6 +645,13 @@ template __global__ void template __global__ void assign_kernel(int64_t *ptr, coord_t size, int64_t value); +template __global__ void + scale_kernel(half *ptr, coord_t size, half a, half b); +template __global__ void + scale_kernel(float *ptr, coord_t size, float a, float b); +template __global__ void + scale_kernel(double *ptr, coord_t size, double a, double b); + template __global__ void add_kernel(half *dst, half const *src, size_t size); template __global__ void diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 9e38235bbb..3a520213f5 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -660,8 +660,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, assert(request.status != Request::COMPLETED); assert(request.max_training_steps > 0 && request.completed_training_steps < request.max_training_steps); - int num_peft_tokens = - request.dataset[0].first.size() + request.dataset[0].second.size(); + int num_peft_tokens = request.dataset[0].first.size(); + int num_peft_label_tokens = request.dataset[0].second.size(); if (num_peft_tokens + new_bc.num_active_tokens() <= get_max_tokens_per_batch()) { // The last request slot is reserved for PEFT request @@ -686,13 +686,12 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.num_peft_tokens++; } for (size_t i = 0; i < request.dataset[0].second.size(); i++) { - new_bc.tokensInfo[new_bc.num_tokens].token_id = + new_bc.labelsInfo[new_bc.num_peft_label_tokens].token_id = request.dataset[0].second[i]; - new_bc.tokensInfo[new_bc.num_tokens].request_index = peft_req_idx; + new_bc.labelsInfo[new_bc.num_peft_label_tokens].request_index = peft_req_idx; int depth = request.dataset[0].first.size() + i; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; - new_bc.num_tokens++; - new_bc.num_peft_tokens++; + new_bc.labelsInfo[new_bc.num_peft_label_tokens].abs_depth_in_request = depth; + new_bc.num_peft_label_tokens++; } } } From 371dffdf06dc0ca62a464f890c0cf80cfc88c33d Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Mon, 20 Nov 2023 20:45:59 +0000 Subject: [PATCH 096/198] . 
--- inference/incr_decoding/incr_decoding.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index c76637a62c..2313eca385 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -285,7 +285,7 @@ void FlexFlow::top_level_task(Task const *task, std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); total_num_requests++; - //prompts.push_back(text); + // prompts.push_back(text); dataset.push_back(std::make_pair(text, text)); } rm->register_new_peft_request( From da690ff1c0c8ccccd67cad995948bca8ff5667bb Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 20 Nov 2023 22:13:47 -0500 Subject: [PATCH 097/198] update generate/request api to take both inference and fine-tuning prompts --- include/flexflow/model.h | 10 +- include/flexflow/request_manager.h | 31 ++--- include/flexflow/utils/cuda_helper.h | 3 +- inference/incr_decoding/incr_decoding.cc | 29 +++-- inference/spec_infer/spec_infer.cc | 11 +- src/c/flexflow_c.cc | 12 +- src/runtime/request_manager.cc | 140 +++++++---------------- 7 files changed, 88 insertions(+), 148 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index b4d2fe53af..7232cb3f0b 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -367,6 +367,8 @@ class AllReduce; class FusedParallelOp; class ParallelOpInfo; +struct Request; + // TODO: Move to an appropriate place /* This is used to create a type that recursively replaces value type @@ -830,13 +832,9 @@ class FFModel { // ======================================== // Inference APIs // ======================================== - GenerationResult generate(std::string const &prompts, - int max_seq_length, - PEFTModelID peft_model_id = PEFTModelID::NO_ID); + GenerationResult generate(Request const &request); - GenerationResult generate(std::vector const &prompts, - int max_seq_length, - PEFTModelID peft_model_id = PEFTModelID::NO_ID); + GenerationResult generate(std::vector const &request); PEFTModelID register_peft_model( LoraLinearConfig const mlp_first = LoraLinearConfig::DefaultConfig, diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 0aa654f9e7..8e7a829627 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -59,19 +59,21 @@ struct Request { }; enum RequestType { REQ_INFERENCE = 201, REQ_FINETUNING = 202 }; BatchConfig::RequestGuid guid; - PEFTModelID peft_model_id; - int max_sequence_length; + PEFTModelID peft_model_id = PEFTModelID::NO_ID; + int max_sequence_length = 128; int initial_len; int ssm_cache_size = 0; int llm_cache_size = 0; Status status = PENDING; std::vector tokens; + std::string prompt; std::vector beam_trees; // PEFT field RequestType req_type = REQ_INFERENCE; int completed_training_steps = 0; int max_training_steps = 1; + std::vector> dataset_text; std::vector, std::vector>> dataset; @@ -119,26 +121,13 @@ class RequestManager { FFModel *get_model(int model_id); - GenerationResult - generate_incr_decoding(FFModel *model, - std::vector const &prompts, - int max_seq_length, - PEFTModelID peft_model_id); - GenerationResult generate_spec_infer(FFModel *model, - std::vector const &prompts, - int max_seq_length, - PEFTModelID peft_model_id); + GenerationResult generate_incr_decoding(FFModel *llm, + std::vector const &requests); + GenerationResult generate_spec_infer(FFModel *llm, + std::vector 
const &requests); GenerationResult get_generation_result(RequestGuid const &guid); - RequestGuid register_new_request(std::string const &prompt, - int max_sequence_length, - PEFTModelID peft_model_id); - RequestGuid register_new_request(std::vector const &prompt, - int max_sequence_length, - PEFTModelID peft_model_id); - RequestGuid register_new_peft_request( - std::vector> const &dataset, - int max_sequence_length, - PEFTModelID peft_model_id); + RequestGuid register_new_request(Request const &request_); + RequestGuid register_new_peft_request(Request const &request_); bool is_request_completed(RequestGuid const &guid); BatchConfig prepare_next_batch(BatchConfig const &bc, InferenceResult const &result); diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index f6a393a9ff..caaa54683a 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ -76,8 +76,7 @@ inline int GET_BLOCKS(int const N) { } template -__global__ void - scale_kernel(DT *ptr, Legion::coord_t size, DT a, DT b); +__global__ void scale_kernel(DT *ptr, Legion::coord_t size, DT a, DT b); __global__ void ones_kernel(float *ptr, Legion::coord_t size); diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 2313eca385..01bbdc3d2b 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -279,23 +279,28 @@ void FlexFlow::top_level_task(Task const *task, /*parser_callback_t */ nullptr, /*allow_exceptions */ true, /*ignore_comments */ true); - std::vector prompts; - std::vector> dataset; + + std::vector requests; for (auto &prompt : prompt_json) { std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); + // // Add inference request + // Request inference_req; + // inference_req.prompt = text; + // inference_req.max_sequence_length = 128; + // inference_req.peft_model_id = peft_model_id; + // requests.push_back(inference_req); + // total_num_requests++; + // Add fine-tuning request + Request fine_tuning_req; + fine_tuning_req.req_type = Request::RequestType::REQ_FINETUNING; + fine_tuning_req.max_sequence_length = 128; + fine_tuning_req.peft_model_id = peft_model_id; + fine_tuning_req.dataset_text.push_back(std::make_pair(text, text)); + requests.push_back(fine_tuning_req); total_num_requests++; - // prompts.push_back(text); - dataset.push_back(std::make_pair(text, text)); } - rm->register_new_peft_request( - dataset, 256 /*max_sequence_length*/, peft_model_id); - // for (auto &prompt : prompts) { - // GenerationResult result = model.generate(prompt, 128 - // /*max_sequence_length*/); - // } - GenerationResult result = - model.generate(prompts, 128 /*max_sequence_length*/, peft_model_id); + GenerationResult result = model.generate(requests); } // Execution fence diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 8b0eb926d9..f6de22a376 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -393,15 +393,18 @@ void FlexFlow::top_level_task(Task const *task, /*allow_exceptions */ true, /*ignore_comments */ true); - std::vector prompts; + std::vector requests; for (auto &prompt : prompt_json) { std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); + // Add inference request + Request inference_req; + inference_req.prompt = text; + inference_req.max_sequence_length = 128; + requests.push_back(inference_req); 
total_num_requests++; - prompts.push_back(text); - // tree_model.generate(text, 128 /*max_sequence_length*/); } - tree_model.generate(prompts, 128 /*max_sequence_length*/); + tree_model.generate(requests); } // Execution fence diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 80202f6f99..8f5d197eb3 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1588,10 +1588,16 @@ flexflow_generation_result_t int max_seq_length, int *output_length_and_tokens) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - std::vector prompts; + std::string const text_str(input_text); - prompts.push_back(input_text); - GenerationResult result = handle->generate(prompts, max_seq_length); + + std::vector requests; + Request inference_req; + inference_req.prompt = text_str; + inference_req.max_sequence_length = max_seq_length; + requests.push_back(inference_req); + + GenerationResult result = handle->generate(requests); DEBUG_PRINT( "[Model] generate %p %s %i", handle, text_str.c_str(), max_seq_length); assert(result.output_tokens.size() <= max_seq_length); diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 3a520213f5..13e829a823 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -176,81 +176,22 @@ size_t RequestManager::get_num_ssms() { } RequestManager::RequestGuid - RequestManager::register_new_request(std::vector const &prompt, - int max_sequence_length, - PEFTModelID peft_model_id) { - const std::lock_guard lock(request_queue_mutex); - - // Add a new request - Request request; - request.status = Request::PENDING; - request.guid = next_available_guid++; - request.max_sequence_length = max_sequence_length; - request.peft_model_id = peft_model_id; - if (prompt.size() >= get_max_sequence_length()) { - std::cout << "Warning: too many tokens in prompt, only load up to " - << get_max_sequence_length() << " tokens, but got " - << prompt.size() << ".\n"; - - printf("tokens size: %zu\n", request.tokens.size()); - return 0; - } else { - request.initial_len = prompt.size(); - request.tokens = prompt; - } - - if (get_num_ssms() == 0) { - std::cout << "No small speculative model registered, using incremental " - "decoding." 
- << std::endl; - } else { - std::cout << "Num of models: " << get_num_ssms() << std::endl; - for (int i = 0; i < get_num_ssms(); i++) { - BeamTree beam_tree = BeamTree{}; - request.beam_trees.push_back(beam_tree); - } - } - - pending_infr_request_queue.push(request); - all_requests[request.guid] = request; - - if (verbose) { - std::cout << "new req: " << request.tokens.size() << std::endl; - for (int i = 0; i < request.tokens.size(); i++) { - std::cout << i << " : " << request.tokens[i] << std::endl; - } - } - - GenerationResult gr; - gr.guid = request.guid; - gr.input_text = ""; - gr.input_tokens = prompt; - gr.output_text = ""; - gr.output_tokens = prompt; - request_generation_results[request.guid] = gr; - - return request.guid; -} - -RequestManager::RequestGuid - RequestManager::register_new_request(std::string const &prompt, - int max_sequence_length, - PEFTModelID peft_model_id) { + RequestManager::register_new_request(Request const &request_) { const std::lock_guard lock(request_queue_mutex); // Add a new request Request request; request.status = Request::PENDING; request.guid = next_available_guid++; - request.max_sequence_length = max_sequence_length; - request.peft_model_id = peft_model_id; + request.max_sequence_length = request_.max_sequence_length; + request.peft_model_id = request_.peft_model_id; if (bos_token_id >= 0 && model_type != ModelType::FALCON) { request.tokens.push_back(bos_token_id); } - std::vector tokens = this->tokenizer_->Encode(prompt); + std::vector tokens = this->tokenizer_->Encode(request_.prompt); if (tokens.size() >= get_max_sequence_length()) { std::cout << "Warning: too many tokens in prompt, only load up to " << get_max_sequence_length() << " tokens, but got " - << tokens.size() << ".\n"; + << request_.tokens.size() << ".\n"; printf("tokens size: %zu\n", tokens.size()); return 0; @@ -286,29 +227,27 @@ RequestManager::RequestGuid GenerationResult gr; gr.guid = request.guid; - gr.input_text = prompt; + gr.input_text = request_.prompt; gr.input_tokens = request.tokens; - gr.output_text = prompt; + gr.output_text = request_.prompt; gr.output_tokens = request.tokens; request_generation_results[request.guid] = gr; return request.guid; } -RequestManager::RequestGuid RequestManager::register_new_peft_request( - std::vector> const &dataset, - int max_sequence_length, - PEFTModelID peft_model_id) { +RequestManager::RequestGuid + RequestManager::register_new_peft_request(Request const &request_) { const std::lock_guard lock(request_queue_mutex); // Add a new request Request request; request.status = Request::PENDING; request.guid = next_available_guid++; - request.max_sequence_length = max_sequence_length; - request.peft_model_id = peft_model_id; + request.max_sequence_length = request_.max_sequence_length; + request.peft_model_id = request_.peft_model_id; request.req_type = Request::REQ_FINETUNING; request.completed_training_steps = 0; request.max_training_steps = 1; // TODO: let user set this - for (auto const &sample : dataset) { + for (auto const &sample : request_.dataset_text) { std::vector input_tokens; input_tokens = this->tokenizer_->Encode(sample.first); if (bos_token_id >= 0 && model_type != ModelType::FALCON) { @@ -321,6 +260,7 @@ RequestManager::RequestGuid RequestManager::register_new_peft_request( std::cout << "Warning: too many tokens in sample, only load up to " << get_max_sequence_length() << " tokens, but got " << input_tokens.size() + output_tokens.size() << ".\n"; + return 0; } else { request.dataset.push_back(std::make_pair(input_tokens, 
output_tokens)); } @@ -688,9 +628,11 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, for (size_t i = 0; i < request.dataset[0].second.size(); i++) { new_bc.labelsInfo[new_bc.num_peft_label_tokens].token_id = request.dataset[0].second[i]; - new_bc.labelsInfo[new_bc.num_peft_label_tokens].request_index = peft_req_idx; + new_bc.labelsInfo[new_bc.num_peft_label_tokens].request_index = + peft_req_idx; int depth = request.dataset[0].first.size() + i; - new_bc.labelsInfo[new_bc.num_peft_label_tokens].abs_depth_in_request = depth; + new_bc.labelsInfo[new_bc.num_peft_label_tokens].abs_depth_in_request = + depth; new_bc.num_peft_label_tokens++; } } @@ -2086,26 +2028,20 @@ std::vector> return merged_tree; } -GenerationResult FFModel::generate(std::string const &prompt, - int max_seq_length, - PEFTModelID peft_model_id) { - std::vector prompts; - prompts.push_back(prompt); - return generate(prompts, max_seq_length, peft_model_id); +GenerationResult FFModel::generate(Request const &request) { + std::vector requests; + requests.push_back(request); + return generate(requests); } -GenerationResult FFModel::generate(std::vector const &prompts, - int max_seq_length, - PEFTModelID peft_model_id) { +GenerationResult FFModel::generate(std::vector const &requests) { RequestManager *rm = RequestManager::get_request_manager(); if (rm->get_num_ssms() == 0) { // No SSMs: perform incremental decoding - return rm->generate_incr_decoding( - this, prompts, max_seq_length, peft_model_id); + return rm->generate_incr_decoding(this, requests); } else { // Registered SSMs: perform speculative inference - return rm->generate_spec_infer( - this, prompts, max_seq_length, peft_model_id); + return rm->generate_spec_infer(this, requests); } } @@ -2213,14 +2149,15 @@ PEFTModelID FFModel::register_peft_model(LoraLinearConfig const mlp_first, /*static*/ GenerationResult RequestManager::generate_incr_decoding( - FFModel *llm, - std::vector const &prompts, - int max_seq_length, - PEFTModelID peft_model_id) { + FFModel *llm, std::vector const &requests) { InferenceManager *im = InferenceManager::get_inference_manager(); RequestGuid guid; - for (int i = 0; i < prompts.size(); i++) { - guid = register_new_request(prompts.at(i), max_seq_length, peft_model_id); + for (int i = 0; i < requests.size(); i++) { + if (requests.at(i).req_type == Request::REQ_INFERENCE) { + guid = register_new_request(requests.at(i)); + } else { + guid = register_new_peft_request(requests.at(i)); + } } if (guid == 0) { @@ -2230,7 +2167,8 @@ GenerationResult RequestManager::generate_incr_decoding( return GenerationResult(); } - int tokens_to_generate = max_seq_length - all_requests[guid].tokens.size(); + int tokens_to_generate = + all_requests[guid].max_sequence_length - all_requests[guid].tokens.size(); std::queue> batch_pipeline; { batch_pipeline.push(std::make_pair(last_bcf, last_irf)); } @@ -2275,13 +2213,15 @@ GenerationResult RequestManager::generate_incr_decoding( /*static*/ GenerationResult RequestManager::generate_spec_infer(FFModel *llm, - std::vector const &prompts, - int max_seq_length, - PEFTModelID peft_model_id) { + std::vector const &requests) { InferenceManager *im = InferenceManager::get_inference_manager(); RequestGuid guid; - for (int i = 0; i < prompts.size(); i++) { - guid = register_new_request(prompts.at(i), max_seq_length, peft_model_id); + for (int i = 0; i < requests.size(); i++) { + if (requests.at(i).req_type == Request::REQ_INFERENCE) { + guid = register_new_request(requests.at(i)); + } else { + guid = 
register_new_peft_request(requests.at(i)); + } } if (guid == 0) { std::cout From 1e5bb7202228831f469f06f469a02ab0439bfc84 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 20 Nov 2023 22:14:01 -0500 Subject: [PATCH 098/198] linting --- src/ops/inc_multihead_self_attention.cpp | 28 ++++---- src/ops/inc_multihead_self_attention.cu | 66 +++++++++---------- src/ops/kernels/linear_kernels.cpp | 42 ++++++------ src/ops/kernels/linear_kernels.cu | 44 ++++++------- src/ops/kernels/lora_linear_kernels.cu | 44 ++++++------- src/ops/spec_inc_multihead_self_attention.cpp | 14 ++-- src/ops/spec_inc_multihead_self_attention.cu | 22 +++---- src/ops/tree_inc_multihead_self_attention.cpp | 14 ++-- src/ops/tree_inc_multihead_self_attention.cu | 22 +++---- 9 files changed, 148 insertions(+), 148 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index 188659bea0..d38f93558e 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -258,13 +258,13 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, assert(m->qSize == m->vSize && m->qSize == m->kSize); hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); hipblasDatatype_t compute_type = hipblas_data_type; -// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) -// hipblasDatatype_t compute_type = hipblas_data_type; -// #else -// // TODO: currently use the hipblas_data_type -// // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -// hipblasDatatype_t compute_type = hipblas_data_type; -// #endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipblasDatatype_t compute_type = hipblas_data_type; + // #else + // // TODO: currently use the hipblas_data_type + // // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // hipblasDatatype_t compute_type = hipblas_data_type; + // #endif // Compute (W^T)x matmul: einsum(ijkl,im->jmkl) // Weights: qSize x qProjSize x 3 x num_q_heads // Input: qSize x num_tokens @@ -512,13 +512,13 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); hipblasDatatype_t compute_type = hipblas_data_type; -// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) -// hipblasDatatype_t compute_type = hipblas_data_type; -// #else -// // TODO: currently use the hipblas_data_type -// // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -// hipblasDatatype_t compute_type = hipblas_data_type; -// #endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipblasDatatype_t compute_type = hipblas_data_type; + // #else + // // TODO: currently use the hipblas_data_type + // // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // hipblasDatatype_t compute_type = hipblas_data_type; + // #endif // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_infr_tokens(); int tokens_previous_requests = 0; diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index e597c7de97..54713769a0 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -239,17 +239,17 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, assert(m->qSize == m->vSize && m->qSize == m->kSize); cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudaDataType_t compute_type = cublas_data_type; -// #if 
defined(CUDA_VERSION) && (CUDA_VERSION < 11000) -// cudaDataType_t compute_type = cublas_data_type; -// #else -// // For best performance, set the default cublas compute type to -// // CUBLAS_COMPUTE_16F for half precision and to -// // CUBLAS_COMPUTE_32F_FAST_16F for full precision -// cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -// if (m->output_type[0] == DT_FLOAT) { -// compute_type = CUBLAS_COMPUTE_32F_FAST_16F; -// } -// #endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif // Compute (W^T)x matmul: einsum(ijkl,im->jmkl) // Weights: qSize x qProjSize x 3 x num_q_heads // Input: qSize x num_tokens @@ -468,17 +468,17 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); cudaDataType_t compute_type = cublas_data_type; -// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) -// cudaDataType_t compute_type = cublas_data_type; -// #else -// // For best performance, set the default cublas compute type to -// // CUBLAS_COMPUTE_16F for half precision and to -// // CUBLAS_COMPUTE_32F_FAST_16F for full precision -// cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -// if (m->output_type[0] == DT_FLOAT) { -// compute_type = CUBLAS_COMPUTE_32F_FAST_16F; -// } -// #endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; @@ -886,17 +886,17 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta *m, cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); cudaDataType_t compute_type = cublas_data_type; -// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) -// cudaDataType_t compute_type = cublas_data_type; -// #else -// // For best performance, set the default cublas compute type to -// // CUBLAS_COMPUTE_16F for half precision and to -// // CUBLAS_COMPUTE_32F_FAST_16F for full precision -// cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -// if (m->output_type[0] == DT_FLOAT) { -// compute_type = CUBLAS_COMPUTE_32F_FAST_16F; -// } -// #endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif // int num_requests = 
bc->num_active_requests(); int num_tokens = bc->num_active_tokens(); int tokens_previous_requests = 0; diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp index 4fa8ab244f..40533805d3 100644 --- a/src/ops/kernels/linear_kernels.cpp +++ b/src/ops/kernels/linear_kernels.cpp @@ -275,13 +275,13 @@ void forward_kernel(LinearMeta const *m, hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); hipblasDatatype_t compute_type = output_type; -// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) -// hipblasDatatype_t compute_type = output_type; -// #else -// // TODO: currently use the output_type -// // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -// hipblasDatatype_t compute_type = output_type; -// #endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipblasDatatype_t compute_type = output_type; + // #else + // // TODO: currently use the output_type + // // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // hipblasDatatype_t compute_type = output_type; + // #endif checkCUDA(hipblasGemmEx(m->handle.blas, HIPBLAS_OP_T, HIPBLAS_OP_N, @@ -372,13 +372,13 @@ void peft_bwd_kernel(LinearMeta const *m, // update input_grad_ptr offset input_grad_ptr = static_cast
(input_grad_ptr) + num_infr_tokens; hipblasDatatype_t compute_type = hipblas_data_type; -// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) -// hipblasDatatype_t compute_type = hipblas_data_type; -// #else -// // TODO: currently use the hipblas_data_type -// // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -// hipblasDatatype_t compute_type = output_type; -// #endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipblasDatatype_t compute_type = hipblas_data_type; + // #else + // // TODO: currently use the hipblas_data_type + // // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // hipblasDatatype_t compute_type = output_type; + // #endif int output_size = out_dim * num_peft_tokens; if (m->activation == AC_MODE_RELU) { relu_backward_kernel(m->output_type[0], @@ -443,13 +443,13 @@ void backward_kernel(LinearMeta const *m, hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); hipblasDatatype_t compute_type = output_type; -// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) -// hipblasDatatype_t compute_type = output_type; -// #else -// // TODO: currently use output_type -// // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -// hipblasDatatype_t compute_type = output_type; -// #endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipblasDatatype_t compute_type = output_type; + // #else + // // TODO: currently use output_type + // // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // hipblasDatatype_t compute_type = output_type; + // #endif int output_size = out_dim * batch_size; if (m->activation == AC_MODE_RELU) { relu_backward_kernel( diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 248e59bdeb..b41f5b3213 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -366,17 +366,17 @@ void forward_kernel(LinearMeta const *m, cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); assert(input_type == weight_type && weight_type == output_type); cudaDataType_t compute_type = output_type; -// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) -// cudaDataType_t compute_type = cublas_data_type; -// #else -// // For best performance, set the default cublas compute type to -// // CUBLAS_COMPUTE_16F for half precision and to -// // CUBLAS_COMPUTE_32F_FAST_16F for full precision -// cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -// if (m->output_type[0] == DT_FLOAT) { -// compute_type = CUBLAS_COMPUTE_32F_FAST_16F; -// } -// #endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, @@ -540,17 +540,17 @@ void backward_kernel(LinearMeta const *m, cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); cudaDataType_t compute_type = output_type; -// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) -// cudaDataType_t compute_type = cublas_data_type; -// #else -// // For best performance, set the 
default cublas compute type to -// // CUBLAS_COMPUTE_16F for half precision and to -// // CUBLAS_COMPUTE_32F_FAST_16F for full precision -// cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -// if (m->output_type[0] == DT_FLOAT) { -// compute_type = CUBLAS_COMPUTE_32F_FAST_16F; -// } -// #endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif int output_size = out_dim * batch_size; if (m->activation == AC_MODE_RELU) { diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 85a5d9990f..7be949a0d3 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -146,17 +146,17 @@ void inference_kernel(LoraLinearMeta *m, assert(input_type == output_type); cudaDataType_t weight_type = output_type; cudaDataType_t compute_type = output_type; -// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) -// cudaDataType_t compute_type = output_type; -// #else -// // For best performance, set the default cublas compute type to -// // CUBLAS_COMPUTE_16F for half precision and to -// // CUBLAS_COMPUTE_32F_FAST_16F for full precision -// cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -// if (m->input_type[0] == DT_FLOAT) { -// compute_type = CUBLAS_COMPUTE_32F_FAST_16F; -// } -// #endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = output_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->input_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif int num_peft_requests = 0; for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { @@ -269,17 +269,17 @@ void peft_bwd_kernel(LoraLinearMeta *m, cudaDataType_t weight_type = output_type; cudaDataType_t lr_actv_type = output_type; cudaDataType_t compute_type = output_type; -// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) -// cudaDataType_t compute_type = output_type; -// #else -// // For best performance, set the default cublas compute type to -// // CUBLAS_COMPUTE_16F for half precision and to -// // CUBLAS_COMPUTE_32F_FAST_16F for full precision -// cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -// if (m->output_type[0] == DT_FLOAT) { -// compute_type = CUBLAS_COMPUTE_32F_FAST_16F; -// } -// #endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = output_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; diff --git a/src/ops/spec_inc_multihead_self_attention.cpp 
b/src/ops/spec_inc_multihead_self_attention.cpp index d827a79c22..aebd5e8892 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -201,13 +201,13 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); hipblasDatatype_t compute_type = hipblas_data_type; -// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) -// hipblasDatatype_t compute_type = hipblas_data_type; -// #else -// // TODO: currently use the hipblas_data_type -// // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -// hipblasDatatype_t compute_type = hipblas_data_type; -// #endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipblasDatatype_t compute_type = hipblas_data_type; + // #else + // // TODO: currently use the hipblas_data_type + // // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // hipblasDatatype_t compute_type = hipblas_data_type; + // #endif // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_infr_tokens(); int tokens_previous_requests = 0; diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 999492f7c3..10c544f2a9 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -216,17 +216,17 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); cudaDataType_t compute_type = cublas_data_type; -// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) -// cudaDataType_t compute_type = cublas_data_type; -// #else -// // For best performance, set the default cublas compute type to -// // CUBLAS_COMPUTE_16F for half precision and to -// // CUBLAS_COMPUTE_32F_FAST_16F for full precision -// cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -// if (m->output_type[0] == DT_FLOAT) { -// compute_type = CUBLAS_COMPUTE_32F_FAST_16F; -// } -// #endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif // int num_requests = bc->num_active_requests(); // int tokens_previous_requests = 0; int tokens_prev_requests_squares = 0; diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index d385880a74..03e0ac6441 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -158,13 +158,13 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); hipblasDatatype_t compute_type = hipblas_data_type; -// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) -// hipblasDatatype_t compute_type = hipblas_data_type; -// #else -// // TODO: currently use the hipblas_data_type -// // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -// 
hipblasDatatype_t compute_type = hipblas_data_type; -// #endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipblasDatatype_t compute_type = hipblas_data_type; + // #else + // // TODO: currently use the hipblas_data_type + // // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // hipblasDatatype_t compute_type = hipblas_data_type; + // #endif // int num_requests = bc->num_active_requests(); int processed_tokens_in_batch = 0; // int qkv_block_size = diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index fc3d1fda72..6b38f99b87 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -159,17 +159,17 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); cudaDataType_t compute_type = cublas_data_type; -// #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) -// cudaDataType_t compute_type = cublas_data_type; -// #else -// // For best performance, set the default cublas compute type to -// // CUBLAS_COMPUTE_16F for half precision and to -// // CUBLAS_COMPUTE_32F_FAST_16F for full precision -// cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -// if (m->output_type[0] == DT_FLOAT) { -// compute_type = CUBLAS_COMPUTE_32F_FAST_16F; -// } -// #endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif // int num_requests = bc->num_active_requests(); int processed_tokens_in_batch = 0; // int qkv_block_size = From f3ff40b49abdbc802810224dc57377fd6d5c06be Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 20 Nov 2023 22:40:08 -0500 Subject: [PATCH 099/198] alignment fixes in lora & linear layer --- include/flexflow/operator.h | 41 +++++++++++++++--------- inference/incr_decoding/incr_decoding.cc | 2 +- src/ops/kernels/linear_kernels.cu | 5 +-- src/ops/kernels/lora_linear_kernels.cu | 12 +++---- src/ops/linear.cc | 6 ++++ src/ops/sigmoid_silu_multi.cu | 7 ++-- 6 files changed, 46 insertions(+), 27 deletions(-) diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index af39412232..e3f28756ec 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -243,6 +243,20 @@ class Op { } virtual void print_layer(FFModel const &model) = 0; template + static std::string get_op_name_without_uid(OpMetaType *m) { + std::string op_name_without_uid = std::string(m->op_name); + size_t last_underscore = op_name_without_uid.length() - 1; + for (int i = op_name_without_uid.length() - 1; i > 0; i--) { + if (!(std::isdigit(m->op_name[i]) || m->op_name[i] == '_')) { + break; + } else if (m->op_name[i] == '_') { + last_underscore = i; + } + } + op_name_without_uid.erase(last_underscore); + return op_name_without_uid; + } + template static void save_inference_tensors_to_file( OpMetaType *m, int shard_id, @@ -250,7 +264,8 @@ class Op { std::vector input_tensors, std::vector weight_tensors, std::vector output_tensors, - bool fwd_pass = true) { + bool fwd_pass = true, + bool before_kernel = 
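For reference, a standalone sketch of the uid-stripping behavior of get_op_name_without_uid introduced above: it walks backwards over the trailing run of digits/underscores and erases from the last underscore it saw, so "<layer-name>_<uid>" becomes "<layer-name>". The op name in the check below is made up for illustration.

#include <cassert>
#include <cctype>
#include <string>
static std::string strip_uid(std::string name) {
  size_t last_underscore = name.length() - 1;
  for (int i = (int)name.length() - 1; i > 0; i--) {
    if (!(std::isdigit((unsigned char)name[i]) || name[i] == '_')) {
      break;
    } else if (name[i] == '_') {
      last_underscore = i;
    }
  }
  name.erase(last_underscore); // drop "_<uid>" (and any trailing digit/underscore run)
  return name;
}
int main() {
  assert(strip_uid("layers_11_attn_qkv_proj_2060") == "layers_11_attn_qkv_proj");
  return 0;
}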
false) { // Check if output directory exists, and create it if it does not char const *folder_path = "./inference_tensors"; struct stat st = {0}; @@ -259,16 +274,7 @@ class Op { mkdir(folder_path, 0700); } // output base filepath, shared by all tensors from the same operator - std::string op_name_without_uid = std::string(m->op_name); - size_t last_underscore = op_name_without_uid.length() - 1; - for (int i = op_name_without_uid.length() - 1; i > 0; i--) { - if (!(std::isdigit(m->op_name[i]) || m->op_name[i] == '_')) { - break; - } else if (m->op_name[i] == '_') { - last_underscore = i; - } - } - op_name_without_uid.erase(last_underscore); + std::string op_name_without_uid = get_op_name_without_uid(m); std::string base_filepath = "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + (fwd_pass ? "_decoding-step_" : "_bwd-step_") + @@ -277,6 +283,9 @@ class Op { "_layer-num_" + std::to_string(m->layer_guid.transformer_layer_id) + "_layer-name_" + op_name_without_uid + "_shard-id_" + std::to_string(shard_id); + if (before_kernel) { + base_filepath += "_pre"; + } // save batch config, if passed if (bc != nullptr) { bc->save_to_file(base_filepath + "_batch-config"); @@ -353,10 +362,12 @@ class Op { } } // increase count of decoding steps - if (fwd_pass) { - m->decoding_step++; - } else { - m->bwd_step++; + if (!before_kernel) { + if (fwd_pass) { + m->decoding_step++; + } else { + m->bwd_step++; + } } } virtual bool measure_operator_cost(Simulator *sim, diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 01bbdc3d2b..cf92e6834a 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -296,7 +296,7 @@ void FlexFlow::top_level_task(Task const *task, fine_tuning_req.req_type = Request::RequestType::REQ_FINETUNING; fine_tuning_req.max_sequence_length = 128; fine_tuning_req.peft_model_id = peft_model_id; - fine_tuning_req.dataset_text.push_back(std::make_pair(text, text)); + fine_tuning_req.dataset_text.push_back(std::make_pair(text, "")); requests.push_back(fine_tuning_req); total_num_requests++; } diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index b41f5b3213..8cf5db3f11 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -493,8 +493,9 @@ void peft_bwd_kernel(LinearMeta const *m, } // Compute data gradient - // NOTE: we use alpha=1 for input_grad to accumulate gradients - DT alpha = 1.0f, beta = 0.0f; + // NOTE: we use beta=1 for input_grad to accumulate gradients when needed + DT alpha = 1.0f; + DT beta = m->reset_input_grads[0] ? 
0.0f : 1.0f; if (input_grad_ptr != NULL) { checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 7be949a0d3..8fb502bf10 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -262,7 +262,6 @@ void peft_bwd_kernel(LoraLinearMeta *m, ffStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - DT alpha = 1.0f; cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); assert(input_type == output_type); @@ -300,7 +299,7 @@ void peft_bwd_kernel(LoraLinearMeta *m, m->model_weights[bc->requestsInfo[i].peft_model_id]; int rank = weight.rank; // Compute w1's gradient - // NOTE: we use alpha=1 for w1_grad to accumulate gradients + DT alpha = 1.0f, beta = 0.0f; checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, CUBLAS_OP_T, @@ -314,7 +313,7 @@ void peft_bwd_kernel(LoraLinearMeta *m, output_grad_ptr, output_type, out_dim, - &alpha, + &beta, weight.w1_grad_ptr, weight_type, rank, @@ -322,7 +321,6 @@ void peft_bwd_kernel(LoraLinearMeta *m, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); // Compute gradients w.r.t. low_rank activation // and save the results to low_rank_activation - // NOTE: we use alpha=1 for input_grad to accumulate gradients checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, CUBLAS_OP_N, @@ -336,14 +334,13 @@ void peft_bwd_kernel(LoraLinearMeta *m, output_grad_ptr, output_type, out_dim, - &alpha, + &beta, m->low_rank_activation, lr_actv_type, rank, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); // Compute w0's gradient - // NOTE: we use alpha=1 for kernel_grad to accumulate gradients checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, CUBLAS_OP_T, @@ -364,8 +361,9 @@ void peft_bwd_kernel(LoraLinearMeta *m, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); // Compute input gradient - // NOTE: we use alpha=1 for input_grad to accumulate gradients + // NOTE: we use beta=1 for input_grad to accumulate gradients when needed if (input_grad_ptr != nullptr) { + beta = m->reset_input_grads[0] ? 
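The functional change in these backward GEMMs is the beta operand: cublasGemmEx computes C = alpha * op(A) * op(B) + beta * C, so beta = 0 overwrites the destination gradient buffer while beta = 1 accumulates into it, which is exactly what reset_input_grads selects when a tensor receives gradients from more than one consumer. A minimal per-element illustration (values arbitrary):

#include <cassert>
int main() {
  float const existing_grad = 2.0f, new_grad = 3.0f, alpha = 1.0f;
  bool const reset_flags[2] = {true, false};
  for (bool reset : reset_flags) {
    float beta = reset ? 0.0f : 1.0f;
    float c = alpha * new_grad + beta * existing_grad; // what each GEMM element does
    assert(c == (reset ? 3.0f : 5.0f));                // overwrite vs. accumulate
  }
  return 0;
}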
0.0f : 1.0f; checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, CUBLAS_OP_N, diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 0887b6d35b..fa74e22fc6 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -757,6 +757,12 @@ void Linear::peft_bwd_task(Task const *task, int num_infr_tokens = bc->num_active_infr_tokens(); int num_peft_tokens = bc->num_active_peft_tokens(); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + Linear::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {weight}, {output_grad}, false, true); + } peft_bwd_kernel_wrapper(m, input_grad.ptr, output_grad.ptr, diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu index bb78973f70..60eb699496 100644 --- a/src/ops/sigmoid_silu_multi.cu +++ b/src/ops/sigmoid_silu_multi.cu @@ -57,9 +57,12 @@ __global__ void SigmoidSiluMultiBackwardKernel(int num_elements, sigmoid_val = 1.0f / (1.0f + exp(-sigmoid_val)); T ss_grad_val = output_grad_ptr[i] * input2_ptr[i]; - input2_grad_ptr[i] += output_grad_ptr[i] * input1_ptr[i] * T(sigmoid_val); + // input2_grad_ptr[i] += output_grad_ptr[i] * input1_ptr[i] * + // T(sigmoid_val); + input2_grad_ptr[i] = output_grad_ptr[i] * input1_ptr[i] * T(sigmoid_val); - input1_grad_ptr[i] += ss_grad_val * T(sigmoid_val); + // input1_grad_ptr[i] += ss_grad_val * T(sigmoid_val); + input1_grad_ptr[i] = ss_grad_val * T(sigmoid_val); T sig_grad = ss_grad_val * input1_ptr[i]; float x1_grad_val = static_cast(sig_grad); From 7efd3a7ce7b708154a739f6648293609e1049c21 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 21 Nov 2023 00:46:43 -0500 Subject: [PATCH 100/198] alignment fix --- src/ops/kernels/lora_linear_kernels.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 8fb502bf10..9cd5d2ecfa 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -292,7 +292,7 @@ void peft_bwd_kernel(LoraLinearMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + // int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; assert(m->model_weights.find(bc->requestsInfo[i].peft_model_id) != m->model_weights.end()); LoraLinearWeight weight = @@ -354,7 +354,7 @@ void peft_bwd_kernel(LoraLinearMeta *m, m->low_rank_activation, lr_actv_type, rank, - &alpha, + &beta, weight.w0_grad_ptr, weight_type, in_dim, @@ -377,7 +377,7 @@ void peft_bwd_kernel(LoraLinearMeta *m, m->low_rank_activation, lr_actv_type, rank, - &alpha, + &beta, input_grad_ptr, input_type, in_dim, From b6fe334c4364851b4dbf89c981973e454d802d88 Mon Sep 17 00:00:00 2001 From: xinhaoc Date: Wed, 22 Nov 2023 05:20:08 +0000 Subject: [PATCH 101/198] diagonal --- src/ops/inc_multihead_self_attention.cu | 48 ++++++++++++++++--------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 54713769a0..28b94fe805 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -223,6 +223,23 @@ __global__ void } } +template +__global__ void fill_entries_above_diagonal(DT *matrix, + size_t num_rows, + size_t num_cols, + size_t num_q_heads, + size_t entries_above_diagonal, + DT value) { + CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_q_heads) { + size_t 
head_idx = i / entries_above_diagonal; + size_t entry_idx = i % entries_above_diagonal; + size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; + size_t x = entry_idx - y * (y + 1) / 2; + y += (num_cols - num_rows) + 1; + matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; + } +} + template void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, @@ -658,6 +675,20 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->qk_tensor, m->qk_prods)); // TODO: fill all elements above diagonal to force causal attention + size_t entries_above_diagonal = num_tokens * (num_tokens - 1) / 2; + if (entries_above_diagonal > 0) { + size_t parallelism = m->num_q_heads * entries_above_diagonal; + fill_entries_above_diagonal<<>>(static_cast
(m->qk_prods), + num_tokens, + num_tokens, + m->num_q_heads, + entries_above_diagonal, + DT(0.0f)); + } } // Step 5: compute gradients w.r.t. key { @@ -855,23 +886,6 @@ __global__ void store_query_cache(DT const *devQKVProjArray, } } -template -__global__ void fill_entries_above_diagonal(DT *matrix, - size_t num_rows, - size_t num_cols, - size_t num_q_heads, - size_t entries_above_diagonal, - DT value) { - CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_q_heads) { - size_t head_idx = i / entries_above_diagonal; - size_t entry_idx = i % entries_above_diagonal; - size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; - size_t x = entry_idx - y * (y + 1) / 2; - y += (num_cols - num_rows) + 1; - matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; - } -} - template void compute_attention_kernel(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, From bcf8b1930f165901855b345737b54bb3b9da83f3 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 22 Nov 2023 16:44:18 -0500 Subject: [PATCH 102/198] fix --- inference/incr_decoding/incr_decoding.cc | 28 ++++++++++++------------ 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index cf92e6834a..f1a51aa670 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -284,21 +284,21 @@ void FlexFlow::top_level_task(Task const *task, for (auto &prompt : prompt_json) { std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); - // // Add inference request - // Request inference_req; - // inference_req.prompt = text; - // inference_req.max_sequence_length = 128; - // inference_req.peft_model_id = peft_model_id; - // requests.push_back(inference_req); - // total_num_requests++; - // Add fine-tuning request - Request fine_tuning_req; - fine_tuning_req.req_type = Request::RequestType::REQ_FINETUNING; - fine_tuning_req.max_sequence_length = 128; - fine_tuning_req.peft_model_id = peft_model_id; - fine_tuning_req.dataset_text.push_back(std::make_pair(text, "")); - requests.push_back(fine_tuning_req); + // Add inference request + Request inference_req; + inference_req.prompt = text; + inference_req.max_sequence_length = 128; + inference_req.peft_model_id = peft_model_id; + requests.push_back(inference_req); total_num_requests++; + // Add fine-tuning request + // Request fine_tuning_req; + // fine_tuning_req.req_type = Request::RequestType::REQ_FINETUNING; + // fine_tuning_req.max_sequence_length = 128; + // fine_tuning_req.peft_model_id = peft_model_id; + // fine_tuning_req.dataset_text.push_back(std::make_pair(text, "")); + // requests.push_back(fine_tuning_req); + // total_num_requests++; } GenerationResult result = model.generate(requests); } From 4bfee967f4c19b3427c2db5928baa66570dcca75 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 22 Nov 2023 17:20:45 -0500 Subject: [PATCH 103/198] alignment fix ssm --- src/ops/sigmoid_silu_multi.cu | 41 ++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu index 60eb699496..21940fd7d0 100644 --- a/src/ops/sigmoid_silu_multi.cu +++ b/src/ops/sigmoid_silu_multi.cu @@ -51,23 +51,30 @@ __global__ void SigmoidSiluMultiBackwardKernel(int num_elements, T const *input1_ptr, T const *input2_ptr, T *input1_grad_ptr, - T *input2_grad_ptr) { + T *input2_grad_ptr, + bool reset_input_grad1, + bool reset_input_grad2) { 
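The index math in fill_entries_above_diagonal inverts the triangular-number formula: a flat index e over the masked entries maps to y = floor((-1 + sqrt(8e + 1)) / 2) and x = e - y(y + 1) / 2, after which y is shifted past the diagonal so each (x, y) pair lands strictly on one side of it and the corresponding score can be overwritten with the supplied value. A small host-side check of that mapping (matrix size chosen arbitrarily):

#include <cassert>
#include <cmath>
#include <cstddef>
int main() {
  size_t const num_rows = 8, num_cols = 8;
  size_t const entries_above_diagonal = num_rows * (num_rows - 1) / 2;
  for (size_t e = 0; e < entries_above_diagonal; e++) {
    size_t y = (size_t)((-1 + std::sqrt(8 * (double)e + 1)) / 2);
    size_t x = e - y * (y + 1) / 2;
    y += (num_cols - num_rows) + 1;                     // same shift as the kernel
    assert(x < y && y < num_rows && x < num_cols);      // strictly off-diagonal, in bounds
  }
  return 0;
}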
CUDA_KERNEL_LOOP(i, num_elements) { float sigmoid_val = static_cast(input1_ptr[i]); sigmoid_val = 1.0f / (1.0f + exp(-sigmoid_val)); + if (reset_input_grad2) { + input2_grad_ptr[i] = + output_grad_ptr[i] * (input1_ptr[i] * T(sigmoid_val)); + } else { + input2_grad_ptr[i] += + output_grad_ptr[i] * (input1_ptr[i] * T(sigmoid_val)); + } T ss_grad_val = output_grad_ptr[i] * input2_ptr[i]; - // input2_grad_ptr[i] += output_grad_ptr[i] * input1_ptr[i] * - // T(sigmoid_val); - input2_grad_ptr[i] = output_grad_ptr[i] * input1_ptr[i] * T(sigmoid_val); - - // input1_grad_ptr[i] += ss_grad_val * T(sigmoid_val); - input1_grad_ptr[i] = ss_grad_val * T(sigmoid_val); + if (reset_input_grad1) { + input1_grad_ptr[i] = ss_grad_val * T(sigmoid_val); + } else { + input1_grad_ptr[i] += ss_grad_val * T(sigmoid_val); + } T sig_grad = ss_grad_val * input1_ptr[i]; float x1_grad_val = static_cast(sig_grad); - x1_grad_val = exp(-x1_grad_val) / - ((1.0f + exp(-sigmoid_val)) * (1.0f + exp(-sigmoid_val))); + x1_grad_val = x1_grad_val * sigmoid_val * (1.0f - sigmoid_val); input1_grad_ptr[i] += T(x1_grad_val); } } @@ -226,7 +233,9 @@ void SigmoidSiluMulti::backward_kernel_wrapper( input1.get_float_ptr(), input2.get_float_ptr(), input1_grad.get_float_ptr(), - input1_grad.get_float_ptr()); + input1_grad.get_float_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); } else if (m->input_type[0] == DT_HALF) { SigmoidSiluMultiBackwardKernel<<reset_input_grads[0], + m->reset_input_grads[1]); } else { assert(false && "unsupport datatype in SigmoidSiluMulti"); } @@ -307,7 +318,9 @@ void SigmoidSiluMulti::peft_bwd_kernel_wrapper( static_cast(m->input_activation) + num_peft_tokens * in_dim, input1_grad.get_float_ptr(), - input1_grad.get_float_ptr()); + input1_grad.get_float_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); } else if (m->input_type[0] == DT_HALF) { SigmoidSiluMultiBackwardKernel<<(m->input_activation) + num_peft_tokens * in_dim, input1_grad.get_half_ptr(), - input2_grad.get_half_ptr()); + input2_grad.get_half_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); } else { assert(false && "unsupport datatype in SigmoidSiluMulti"); } From efd19769d7d6734aadfb2ba2ddd15caed01a008e Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 24 Nov 2023 09:44:51 -0500 Subject: [PATCH 104/198] sigmoid-silu-multi now fully aligned --- src/ops/sigmoid_silu_multi.cu | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu index 21940fd7d0..ec88042a1d 100644 --- a/src/ops/sigmoid_silu_multi.cu +++ b/src/ops/sigmoid_silu_multi.cu @@ -272,9 +272,8 @@ void SigmoidSiluMulti::peft_bwd_kernel_wrapper( cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - int num_elements = output_grad.domain.get_volume(); - assert(input1_grad.domain.get_volume() == num_elements); - assert(input2_grad.domain.get_volume() == num_elements); + assert(input1_grad.domain.get_volume() == output_grad.domain.get_volume()); + assert(input2_grad.domain.get_volume() == input1_grad.domain.get_volume()); cudaEvent_t t_start, t_end; if (m->profiling) { @@ -306,19 +305,20 @@ void SigmoidSiluMulti::peft_bwd_kernel_wrapper( assert(num_peft_tokens >= 1); } int in_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + int num_elements = in_dim * num_peft_tokens; if (m->input_type[0] == DT_FLOAT) { SigmoidSiluMultiBackwardKernel<<>>( - output_grad.domain.get_volume(), + num_elements, output_grad.get_float_ptr(), 
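These formulas correspond to a forward of output = input2 * input1 * sigmoid(input1), i.e. input2 * SiLU(input1), and use sigmoid'(z) = sigmoid(z) * (1 - sigmoid(z)). A quick finite-difference check of the two analytic gradients (sample points arbitrary):

#include <cassert>
#include <cmath>
int main() {
  auto sig = [](double z) { return 1.0 / (1.0 + std::exp(-z)); };
  auto f = [&](double a, double b) { return b * a * sig(a); }; // b * silu(a)
  double const x1 = 0.7, x2 = -1.3, h = 1e-6;
  // analytic gradients, in the same form as the kernel above
  double const d1 = x2 * (sig(x1) + x1 * sig(x1) * (1.0 - sig(x1)));
  double const d2 = x1 * sig(x1);
  assert(std::fabs(d1 - (f(x1 + h, x2) - f(x1 - h, x2)) / (2 * h)) < 1e-6);
  assert(std::fabs(d2 - (f(x1, x2 + h) - f(x1, x2 - h)) / (2 * h)) < 1e-6);
  return 0;
}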
static_cast(m->input_activation), static_cast(m->input_activation) + num_peft_tokens * in_dim, input1_grad.get_float_ptr(), - input1_grad.get_float_ptr(), + input2_grad.get_float_ptr(), m->reset_input_grads[0], m->reset_input_grads[1]); } else if (m->input_type[0] == DT_HALF) { @@ -326,7 +326,7 @@ void SigmoidSiluMulti::peft_bwd_kernel_wrapper( min(CUDA_NUM_THREADS, num_elements), 0, stream>>>( - output_grad.domain.get_volume(), + num_elements, output_grad.get_half_ptr(), static_cast(m->input_activation), static_cast(m->input_activation) + From 7ae195ac12baf62c82ce81d7872a485c4f867618 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 24 Nov 2023 11:49:44 -0500 Subject: [PATCH 105/198] rms norm kernel updates --- .../ops/kernels/residual_rms_norm_kernels.h | 3 - .../flexflow/ops/kernels/rms_norm_kernels.h | 4 - src/ops/kernels/residual_rms_norm_kernels.cu | 92 +++++----- src/ops/kernels/rms_norm_kernels.cu | 161 ++++++++---------- 4 files changed, 108 insertions(+), 152 deletions(-) diff --git a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h index 3091f83675..691f8ef8c1 100644 --- a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h @@ -32,9 +32,6 @@ class ResidualRMSNormMeta : public OpMeta { void *rms_ptr; void *norm_ptr; - float alpha; - float beta; - int in_dim; int batch_size; int num_elements; diff --git a/include/flexflow/ops/kernels/rms_norm_kernels.h b/include/flexflow/ops/kernels/rms_norm_kernels.h index 92e5e04af3..46297764ec 100644 --- a/include/flexflow/ops/kernels/rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/rms_norm_kernels.h @@ -31,10 +31,6 @@ class RMSNormMeta : public OpMeta { float eps; void *rms_ptr; void *norm_ptr; - void *c2_ptr; - - float alpha; - float beta; int in_dim; int batch_size; diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index 42a8747cbf..9ffbf1b3ba 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -24,16 +24,12 @@ namespace FlexFlow { using Legion::coord_t; #define C10_WARP_SIZE 32 -constexpr int kCUDABlockReduceNumThreads = 512; -constexpr int kCUDANumThreads = 256; ResidualRMSNormMeta::ResidualRMSNormMeta(FFHandler handler, ResidualRMSNorm const *rms, MemoryAllocator &gpu_mem_allocator) : OpMeta(handler, rms) { eps = rms->eps; - alpha = 1.0f; - beta = 0.0f; in_dim = rms->data_dim; batch_size = rms->effective_batch_size; @@ -96,25 +92,6 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { return val; } -template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { - int const lid = threadIdx.x % C10_WARP_SIZE; - int const wid = threadIdx.x / C10_WARP_SIZE; - val = WarpReduceSum(val); - __syncthreads(); - if (lid == 0) { - shared[wid] = val; - } - __syncthreads(); - val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) - ? 
shared[lid] - : T(0); - if (wid == 0) { - val = WarpReduceSum(val); - } - return val; -} - template __global__ void ResidualRMSNormFusedForwardKernel(int64_t N, float eps, @@ -359,7 +336,9 @@ __global__ void RMSNormBackwardCUDAKernel(int64_t N, T const *c1, T const *c2, T *dX1, - T *dX2) { + T *dX2, + bool reset_input_grad1, + bool reset_input_grad2) { const int64_t i = blockIdx.x; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; @@ -367,10 +346,16 @@ __global__ void RMSNormBackwardCUDAKernel(int64_t N, static_cast(c1[i]) * static_cast(dY[index]) * static_cast(gamma[j]) + static_cast(c2[i]) * static_cast(X[index]); - // dX1[index] += dX_val; - // dX2[index] += dX_val; - dX1[index] = static_cast(dX_val); - dX2[index] = static_cast(dX_val); + if (reset_input_grad1) { + dX1[index] = static_cast(dX_val); + } else { + dX1[index] += dX_val; + } + if (reset_input_grad2) { + dX2[index] = static_cast(dX_val); + } else { + dX2[index] += dX_val; + } } } @@ -399,10 +384,10 @@ void backward_kernel(ResidualRMSNormMeta const *m, T const *weight_ptr, T *weight_grad_ptr, cudaStream_t stream) { - const int64_t M = m->batch_size; - const int64_t N = m->num_elements; + int M = m->batch_size; + int N = m->in_dim; ComputeInternalGradientsCUDAKernel - <<>>( + <<>>( N, output_grad_ptr, residual_output_rms_input_ptr, @@ -410,23 +395,25 @@ void backward_kernel(ResidualRMSNormMeta const *m, static_cast(m->rms_ptr), static_cast(m->norm_ptr)); - RMSNormBackwardCUDAKernel - <<>>(N, - output_grad_ptr, - residual_output_rms_input_ptr, - weight_ptr, - static_cast(m->rms_ptr), - static_cast(m->norm_ptr), - residual_input0_grad_ptr, - residual_input1_grad_ptr); - const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; - GammaBackwardCUDAKernel - <<>>(M, - N, - output_grad_ptr, - residual_output_rms_input_ptr, - static_cast(m->rms_ptr), - weight_grad_ptr); + RMSNormBackwardCUDAKernel<<>>( + N, + output_grad_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + residual_input0_grad_ptr, + residual_input1_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1]); + + GammaBackwardCUDAKernel<<>>( + M, + N, + output_grad_ptr, + residual_output_rms_input_ptr, + static_cast(m->rms_ptr), + weight_grad_ptr); } template @@ -450,8 +437,7 @@ void peft_bwd_kernel(ResidualRMSNormMeta const *m, continue; } - int M = m->batch_size; // TODO: replace with - // m->requestsInfo[i].num_tokens_in_batch; + int M = bc->requestsInfo[i].num_tokens_in_batch; int N = m->in_dim; T const *residual_output_rms_input_ptr = @@ -468,14 +454,16 @@ void peft_bwd_kernel(ResidualRMSNormMeta const *m, RMSNormBackwardCUDAKernel <<>>( - m->in_dim, + N, output_grad_ptr, residual_output_rms_input_ptr, weight_ptr, static_cast(m->rms_ptr), static_cast(m->norm_ptr), residual_input0_grad_ptr, - residual_input1_grad_ptr); + residual_input1_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1]); } } diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index ae6a5d590d..d0702d651e 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -24,16 +24,12 @@ namespace FlexFlow { using Legion::coord_t; #define C10_WARP_SIZE 32 -constexpr int kCUDABlockReduceNumThreads = 512; -constexpr int kCUDANumThreads = 256; RMSNormMeta::RMSNormMeta(FFHandler handler, RMSNorm const *rms, MemoryAllocator &gpu_mem_allocator) : OpMeta(handler, rms) { eps = rms->eps; - alpha = 1.0f; - beta = 0.0f; in_dim = 
rms->data_dim; batch_size = rms->effective_batch_size; @@ -41,15 +37,11 @@ RMSNormMeta::RMSNormMeta(FFHandler handler, DataType data_type = rms->weights[0]->data_type; size_t rms_ptr_size = batch_size; - size_t c2_ptr_size = rms_ptr_size; size_t norm_ptr_size = num_elements; - size_t totalSize = - (rms_ptr_size + c2_ptr_size + norm_ptr_size) * data_type_size(data_type); + size_t totalSize = (rms_ptr_size + norm_ptr_size) * data_type_size(data_type); gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); rms_ptr = gpu_mem_allocator.allocate_instance_untyped( rms_ptr_size * data_type_size(data_type)); - c2_ptr = gpu_mem_allocator.allocate_instance_untyped( - c2_ptr_size * data_type_size(data_type)); norm_ptr = gpu_mem_allocator.allocate_instance_untyped( norm_ptr_size * data_type_size(data_type)); } @@ -100,25 +92,6 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { return val; } -template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { - int const lid = threadIdx.x % C10_WARP_SIZE; - int const wid = threadIdx.x / C10_WARP_SIZE; - val = WarpReduceSum(val); - __syncthreads(); - if (lid == 0) { - shared[wid] = val; - } - __syncthreads(); - val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) - ? shared[lid] - : T(0); - if (wid == 0) { - val = WarpReduceSum(val); - } - return val; -} - template __global__ void RMSNormFusedForwardKernel(int64_t N, float eps, @@ -130,16 +103,11 @@ __global__ void RMSNormFusedForwardKernel(int64_t N, __shared__ float v_shared[C10_WARP_SIZE]; int64_t const i = blockIdx.x; float sum = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { int64_t const index = i * N + j; sum += (static_cast(X[index]) * static_cast(X[index])); } - sum = BlockReduceSum( - sum, - v_shared, - min(blockDim.x, - kCUDABlockReduceNumThreads)); // use BlockReduceSum() to sum X_ij^2 + sum = BlockReduceSum(sum, v_shared); if (threadIdx.x == 0) { rms[i] = static_cast(rsqrt((sum / static_cast(N)) + eps)); @@ -147,10 +115,9 @@ __global__ void RMSNormFusedForwardKernel(int64_t N, __syncthreads(); - using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; - Y[index] = static_cast(X[index]) * static_cast(rms[i]); + Y[index] = static_cast(X[index]) * static_cast(rms[i]); output[index] = Y[index] * weights[index % N]; } } @@ -162,24 +129,15 @@ void forward_kernel(RMSNormMeta const *m, T *output_ptr, cudaStream_t stream) { - std::pair kernel1_parallelism = - std::make_pair(m->batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); - RMSNormFusedForwardKernel - <<>>(m->in_dim, - m->eps, - input_ptr, - static_cast(m->rms_ptr), - static_cast(m->norm_ptr), - weight_ptr, - output_ptr); + <<batch_size, std::min(CUDA_NUM_THREADS, m->in_dim), 0, stream>>>( + m->in_dim, + m->eps, + input_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + weight_ptr, + output_ptr); } void forward_kernel_wrapper(RMSNormMeta const *m, @@ -326,14 +284,20 @@ __global__ void ComputeInternalGradientsCUDAKernel( int64_t N, T const *dY, T const *X, T const *gamma, T const *rrms, T 
*c2) { __shared__ T ds_storage[C10_WARP_SIZE]; const int64_t i = blockIdx.x; - T ds = 0; + float ds = 0; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { int const index = i * N + j; - ds += dY[index] * X[index] * gamma[j]; + ds += static_cast(dY[index]) * static_cast(X[index]) * + static_cast(gamma[j]); } ds = BlockReduceSum(ds, ds_storage); if (threadIdx.x == 0) { - c2[i] = -ds * (rrms[i] * rrms[i] * rrms[i]) / static_cast((int)N); + float const c2_val = + -ds * + (static_cast(rrms[i]) * static_cast(rrms[i]) * + static_cast(rrms[i])) / + static_cast((int)N); + c2[i] = static_cast(c2_val); } } @@ -344,11 +308,20 @@ __global__ void RMSNormBackwardCUDAKernel(int64_t N, T const *gamma, T const *c1, T const *c2, - T *dX) { + T *dX, + bool reset_input_grad) { const int64_t i = blockIdx.x; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; - dX[index] = c1[i] * dY[index] * gamma[j] + c2[i] * X[index]; + float const dX_val = + static_cast(c1[i]) * static_cast(dY[index]) * + static_cast(gamma[j]) + + static_cast(c2[i]) * static_cast(X[index]); + if (reset_input_grad) { + dX[index] = dX_val; + } else { + dX[index] += dX_val; + } } } @@ -376,33 +349,33 @@ void backward_kernel(RMSNormMeta const *m, T const *weight_ptr, T *weight_grad_ptr, cudaStream_t stream) { - const int64_t M = m->batch_size; - const int64_t N = m->num_elements; + int M = m->batch_size; + int N = m->in_dim; ComputeInternalGradientsCUDAKernel - <<>>( + <<>>( N, output_grad_ptr, input_ptr, weight_ptr, static_cast(m->rms_ptr), - static_cast(m->c2_ptr)); - - RMSNormBackwardCUDAKernel - <<>>(N, - output_grad_ptr, - input_ptr, - weight_ptr, - static_cast(m->rms_ptr), - static_cast(m->c2_ptr), - input_grad_ptr); - const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; - GammaBackwardCUDAKernel - <<>>(M, - N, - output_grad_ptr, - input_ptr, - static_cast(m->rms_ptr), - weight_grad_ptr); + static_cast(m->norm_ptr)); + + RMSNormBackwardCUDAKernel<<>>( + m->in_dim, + output_grad_ptr, + input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + input_grad_ptr, + m->reset_input_grads[0]); + GammaBackwardCUDAKernel<<>>( + M, + N, + output_grad_ptr, + input_ptr, + static_cast(m->rms_ptr), + weight_grad_ptr); } void backward_kernel_wrapper(RMSNormMeta const *m, @@ -475,24 +448,26 @@ void peft_bwd_kernel(RMSNormMeta const *m, continue; } - const int64_t M = bc->requestsInfo[i].num_tokens_in_batch; - const int64_t N = m->num_elements; + int M = bc->requestsInfo[i].num_tokens_in_batch; + int N = m->num_elements; ComputeInternalGradientsCUDAKernel - <<>>( + <<>>( N, output_grad_ptr, static_cast(m->input_activation), weight_ptr, static_cast(m->rms_ptr), - static_cast(m->c2_ptr)); - RMSNormBackwardCUDAKernel<<>>( - N, - output_grad_ptr, - static_cast(m->input_activation), - weight_ptr, - static_cast(m->rms_ptr), - static_cast(m->c2_ptr), - input_grad_ptr); + static_cast(m->norm_ptr)); + RMSNormBackwardCUDAKernel + <<>>( + m->in_dim, + output_grad_ptr, + static_cast(m->input_activation), + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + input_grad_ptr, + m->reset_input_grads[0]); } } From 703081444ed26c3132bd20fb375a07973019198d Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 26 Nov 2023 11:02:08 -0500 Subject: [PATCH 106/198] fix --- src/ops/kernels/residual_rms_norm_kernels.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index 
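For reference, the c1/c2 form used by ComputeInternalGradientsCUDAKernel and RMSNormBackwardCUDAKernel is the gradient of y_j = gamma_j * x_j / sqrt(mean(x^2) + eps): dX_j = r * dY_j * gamma_j + c2 * x_j with r = rrms and c2 = -(sum_i dY_i * x_i * gamma_i) * r^3 / N. A per-row CPU sketch (the reset/accumulate branches and the gamma gradient are omitted):

#include <cmath>
#include <vector>
void rms_norm_backward_row(std::vector<float> const &dY,
                           std::vector<float> const &X,
                           std::vector<float> const &gamma,
                           float eps,
                           std::vector<float> &dX) {
  int const N = (int)X.size();
  float ms = 0.0f, ds = 0.0f;
  for (int j = 0; j < N; j++) {
    ms += X[j] * X[j]; // sum of squares
  }
  float const r = 1.0f / std::sqrt(ms / N + eps); // rrms, i.e. c1
  for (int j = 0; j < N; j++) {
    ds += dY[j] * X[j] * gamma[j];
  }
  float const c2 = -ds * r * r * r / N;
  for (int j = 0; j < N; j++) {
    dX[j] = r * dY[j] * gamma[j] + c2 * X[j]; // same form as the CUDA kernel
  }
}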
9ffbf1b3ba..b12d105c1b 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -349,12 +349,12 @@ __global__ void RMSNormBackwardCUDAKernel(int64_t N, if (reset_input_grad1) { dX1[index] = static_cast(dX_val); } else { - dX1[index] += dX_val; + dX1[index] += static_cast(dX_val); } if (reset_input_grad2) { - dX2[index] = static_cast(dX_val); + dX2[index] = static_cast(dX1[index]); } else { - dX2[index] += dX_val; + dX2[index] += static_cast(dX1[index]); } } } From eb3b6abd500931fe7027e62e3d9c618f907a4f25 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 26 Nov 2023 11:03:12 -0500 Subject: [PATCH 107/198] in-place residual rms --- include/flexflow/ops/residual_rms_norm.h | 1 + src/ops/residual_rms_norm.cc | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/include/flexflow/ops/residual_rms_norm.h b/include/flexflow/ops/residual_rms_norm.h index de6e6ea506..2acc06841c 100644 --- a/include/flexflow/ops/residual_rms_norm.h +++ b/include/flexflow/ops/residual_rms_norm.h @@ -32,6 +32,7 @@ class ResidualRMSNorm : public Op { ResidualRMSNorm const &other, Input const &inputs, bool allocate_weights); + void map_output_tensors(FFModel &ff) override; void init(FFModel const &) override; void forward(FFModel const &) override; void backward(FFModel const &) override; diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index a57b9248c7..953dd60242 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -234,6 +234,18 @@ ResidualRMSNorm::ResidualRMSNorm(FFModel &model, } } +void ResidualRMSNorm::map_output_tensors(FFModel &ff) { + assert(numOutputs == 2); + assert(outputs[0]->get_volume() == inputs[0]->get_volume()); + outputs[0]->parallel_is = inputs[0]->parallel_is; + outputs[0]->region = inputs[0]->region; + outputs[0]->part = inputs[0]->part; + outputs[0]->region_grad = inputs[0]->region_grad; + outputs[0]->part_grad = inputs[0]->part_grad; + // map output 1 to new region + ff.map_tensor(outputs[1], this); +} + void ResidualRMSNorm::init(FFModel const &ff) { assert(check_output_input_weight_same_parallel_is()); parallel_is = outputs[0]->parallel_is; From a122e306351ed585b3e585e44f05a85419372269 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 27 Nov 2023 23:32:09 -0500 Subject: [PATCH 108/198] bug fix and linting --- include/flexflow/batch_config.h | 2 +- src/ops/inc_multihead_self_attention.cu | 2 +- src/ops/spec_inc_multihead_self_attention.cu | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index ed0104e05d..cc32afca84 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -65,7 +65,7 @@ class BatchConfig { int num_tokens = 0, num_peft_tokens = 0, num_peft_label_tokens = 0; // number of tokens in prompt phase, start offset of tokens in inc_decoding // phase. 
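// Note on the map_output_tensors override above: outputs[0] (the residual sum)
// reuses inputs[0]'s region, partition, and gradient region, so that output is
// produced in place; only outputs[1], the normalized result, gets a freshly
// mapped region via ff.map_tensor.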
num_tokens - num_prompt_tokens = num_generation_tokens; - int num_generation_tokens=0; + int num_generation_tokens = 0; struct PerRequestInfo { PerRequestInfo() { diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index cd784c1a3c..ece7d47b58 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1285,7 +1285,7 @@ __global__ void store_query_cache(DT const *devQKVProjArray, } template -void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, +void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, DT const *bias_ptr, diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 3ff0f5c80e..336fcb5c99 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -493,7 +493,7 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // compute output production and bias together for all tokens int num_tokens = bc->num_active_tokens() * BeamSearchBatchConfig::MAX_BEAM_WIDTH; - + compute_o_prod_bias( m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); } From 53e737b912c4c8368ae2aa645b4c9b19930159c3 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 30 Nov 2023 16:19:18 -0500 Subject: [PATCH 109/198] align backward of o_proj, attn_heads, qk_prods_softmax, and v_proj with huggingface --- src/ops/inc_multihead_self_attention.cu | 83 +++++++++++++++++-------- 1 file changed, 58 insertions(+), 25 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index ece7d47b58..f5288964e9 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -894,6 +894,26 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; // } // #endif + std::string op_name_without_uid = std::string(m->op_name); + size_t last_underscore = op_name_without_uid.length() - 1; + for (int i = op_name_without_uid.length() - 1; i > 0; i--) { + if (!(std::isdigit(m->op_name[i]) || m->op_name[i] == '_')) { + break; + } else if (m->op_name[i] == '_') { + last_underscore = i; + } + } + op_name_without_uid.erase(last_underscore); + + std::string base_filepath = + "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + + "_bwd-step_" + std::to_string(m->bwd_step) + + "_layer-num_" + std::to_string(m->layer_guid.transformer_layer_id) + + "_layer-name_" + op_name_without_uid + "_shard-id_" + + std::to_string(shard_id); + + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; @@ -913,30 +933,31 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, int vt_block_size = m->vProjSize; int vt_req_block_size = vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + assert(m->qProjSize == m->kProjSize && m->kProjSize == m->vProjSize); // Step 1: compute gradients before final projection { int m_ = m->vProjSize * m->num_q_heads; int n_ = num_tokens; int k_ = m->oProjSize; - int lda = k_; + int lda = m_; int ldb = k_; int ldc = m_; float alpha = 1.0f, beta = 0.0f; // matrix A: output projection weight - // matrix A's layout: [num_heads, vProjSize, oProjSize] + // matrix A's layout: [vProjSize * num_heads, oProjSize] DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + m->kProjSize * m->num_q_heads + 
m->vProjSize * m->num_q_heads); // matrix B: output gradients - // matrix B's layout: [num_new_tokens, oProjSize] + // matrix B's layout: [oProjSize, num_new_tokens] DT const *B = output_grad_ptr + bc->requestsInfo[i].first_token_offset_in_batch * m->oProjSize; // matrix C: attn_heads gradients - // matrix C's layout: [num_new_tokens, num_heads, vProjSize] + // matrix C's layout: [vProjSize * num_heads, num_new_tokens] DT *C = static_cast
(m->handle.workSpace); checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_T, + CUBLAS_OP_N, CUBLAS_OP_N, m_, n_, @@ -954,33 +975,38 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // save result to file for checking + std::string filename = base_filepath + "_o_proj_in_grad"; + std::cout << "FILENAME: " << filename << std::endl; + save_tensor(C, m_*n_, filename.c_str()); } // Step 2: compute gradients w.r.t. value { float alpha = 1.0f, beta = 0.0f; - // matrix A: attn_heads gradients - // matrix A's layout: [num_tokens, num_heads, vProjSize] - DT const *A = static_cast
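All of the layout comments in these backward steps follow the cuBLAS column-major convention: element (r, c) of a matrix with leading dimension ld lives at r + c * ld, and CUBLAS_OP_T transposes the operand. A compact CPU model of cublasGemmEx that the m_/n_/k_/ld choices above can be sanity-checked against (a sketch, not the library call itself):

#include <vector>
enum class Op { N, T };
// op(X)(r, c) where X is stored column-major with leading dimension ld.
static float at(std::vector<float> const &X, int ld, int r, int c, Op op) {
  return (op == Op::N) ? X[r + c * ld] : X[c + r * ld];
}
// C (m x n) = alpha * op(A) (m x k) * op(B) (k x n) + beta * C, all column-major.
void gemm_col_major(Op opA, Op opB, int m, int n, int k, float alpha,
                    std::vector<float> const &A, int lda,
                    std::vector<float> const &B, int ldb, float beta,
                    std::vector<float> &C, int ldc) {
  for (int j = 0; j < n; j++) {
    for (int i = 0; i < m; i++) {
      float acc = 0.0f;
      for (int p = 0; p < k; p++) {
        acc += at(A, lda, i, p, opA) * at(B, ldb, p, j, opB);
      }
      C[i + j * ldc] = alpha * acc + beta * C[i + j * ldc];
    }
  }
}
int main() {
  // 2x2 sanity check: C = A^T * I should equal A^T.
  std::vector<float> A = {1.0f, 3.0f, 2.0f, 4.0f}; // columns (1,3) and (2,4)
  std::vector<float> B = {1.0f, 0.0f, 0.0f, 1.0f}; // identity
  std::vector<float> C(4, 0.0f);
  gemm_col_major(Op::T, Op::N, 2, 2, 2, 1.0f, A, 2, B, 2, 0.0f, C, 2);
  std::vector<float> expected = {1.0f, 2.0f, 3.0f, 4.0f}; // A^T in column-major order
  return C == expected ? 0 : 1;
}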
(m->handle.workSpace); - // matrix B: qk_prods_softmax - // matrix B's layout: [num_heads, num_tokens, num_tokens] - DT const *B = static_cast
(m->qk_prods_softmax); + // matrix A: qk_prods_softmax + // matrix A's layout: [num_new_tokens, total_tokens, num_heads] + DT const *A = static_cast
(m->qk_prods_softmax); + // matrix B: attn_heads gradients + // matrix B's layout: [vProjSize * num_heads, num_new_tokens] + DT const *B = static_cast
(m->handle.workSpace); // matrix C: gradients for value (saved as part of m->devQKVProjArray) - // matrix C's layout: [num_tokens, num_heads, qProjsize + kProjSize + - // vProjSize] - DT *C = - static_cast
(m->devQKVProjArray) + m->qProjSize + m->kProjSize; - int m_ = m->vProjSize; - int n_ = num_tokens; - int k_ = num_tokens; - int lda = m->vProjSize * m->num_q_heads; - int ldb = num_tokens; - int ldc = m->num_q_heads * (m->qProjSize + m->kProjSize + m->vProjSize); - int strideA = m->vProjSize; - int strideB = num_tokens * num_tokens; - int strideC = m->qProjSize + m->kProjSize + m->vProjSize; + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = static_cast
(m->devQKVProjArray) + 2*(m->qProjSize * m->num_q_heads); // skip over regions reserved for Q and K gradients + // after transpositions + int m_ = num_tokens; // total_tokens + int n_ = m->vProjSize; // num_new_tokens + int k_ = num_tokens; // num_new_tokens + // before transpositions + int lda = num_tokens; // num_new_tokens + int ldb = m->vProjSize * m->num_q_heads; + int ldc = num_tokens; // total_tokens + // N.B. strides are applied before transpose operations + int strideA = num_tokens * num_tokens; // num_new_tokens * total_tokens + int strideB = m->vProjSize; + int strideC = num_tokens * m->vProjSize; checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_T, - CUBLAS_OP_N, + CUBLAS_OP_T, m_, n_, k_, @@ -1001,6 +1027,13 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // save result to file for checking + std::string filename = base_filepath + "_v_proj_in_grad"; + std::cout << "FILENAME: " << filename << std::endl; + save_tensor(C, m_*n_*m->num_q_heads, filename.c_str()); + std::string filename2 = base_filepath + "_qk_prods_softmax"; + std::cout << "FILENAME: " << filename2 << std::endl; + save_tensor(A, m_*k_*m->num_q_heads, filename2.c_str()); } // Step 3: compute gradients w.r.t. the qk_prods_softmax tensor { From edc02af728380c4849d99fee0277e21c97c4358e Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 30 Nov 2023 16:26:03 -0500 Subject: [PATCH 110/198] cleanup --- src/ops/inc_multihead_self_attention.cu | 31 ------------------------- 1 file changed, 31 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index f5288964e9..f54cd58408 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -894,26 +894,6 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; // } // #endif - std::string op_name_without_uid = std::string(m->op_name); - size_t last_underscore = op_name_without_uid.length() - 1; - for (int i = op_name_without_uid.length() - 1; i > 0; i--) { - if (!(std::isdigit(m->op_name[i]) || m->op_name[i] == '_')) { - break; - } else if (m->op_name[i] == '_') { - last_underscore = i; - } - } - op_name_without_uid.erase(last_underscore); - - std::string base_filepath = - "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + - "_bwd-step_" + std::to_string(m->bwd_step) + - "_layer-num_" + std::to_string(m->layer_guid.transformer_layer_id) + - "_layer-name_" + op_name_without_uid + "_shard-id_" + - std::to_string(shard_id); - - - for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; @@ -975,10 +955,6 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // save result to file for checking - std::string filename = base_filepath + "_o_proj_in_grad"; - std::cout << "FILENAME: " << filename << std::endl; - save_tensor(C, m_*n_, filename.c_str()); } // Step 2: compute gradients w.r.t. 
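// Note on the strided-batched calls in these attention steps:
// cublasGemmStridedBatchedEx runs one independent GEMM per attention head,
// with head h reading/writing A + h * strideA, B + h * strideB, C + h * strideC
// under the same m_/n_/k_ and leading dimensions (batch count = m->num_q_heads).
// That is why strideA above is num_new_tokens * total_tokens (one whole per-head
// score matrix), while strideB is vProjSize, stepping across the head dimension
// of the packed gradient buffer whose leading dimension is vProjSize * num_q_heads.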
value { @@ -1027,13 +1003,6 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // save result to file for checking - std::string filename = base_filepath + "_v_proj_in_grad"; - std::cout << "FILENAME: " << filename << std::endl; - save_tensor(C, m_*n_*m->num_q_heads, filename.c_str()); - std::string filename2 = base_filepath + "_qk_prods_softmax"; - std::cout << "FILENAME: " << filename2 << std::endl; - save_tensor(A, m_*k_*m->num_q_heads, filename2.c_str()); } // Step 3: compute gradients w.r.t. the qk_prods_softmax tensor { From f00c7e0b90ce582e260596b6048577cb993bcae3 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 30 Nov 2023 17:07:57 -0500 Subject: [PATCH 111/198] finished all alignment fixes in attention backward kernel --- src/ops/inc_multihead_self_attention.cu | 122 ++++++++++++------------ 1 file changed, 63 insertions(+), 59 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index f54cd58408..b5ed032137 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1007,24 +1007,27 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, // Step 3: compute gradients w.r.t. the qk_prods_softmax tensor { float alpha = 1.0f, beta = 0.0f; - int m_ = num_tokens; + // matrix A: attn_heads gradients + // matrix A's layout: [vProjSize * num_heads, num_new_tokens] + DT const *A = static_cast
<DT const *>(m->handle.workSpace); + // matrix B: value cache + // matrix B's layout: [vProjSize * num_heads, max_num_tokens, num_req] + DT const *B = static_cast<DT const *>
(m->valueCache) + i * vt_req_block_size; + // matrix C: qk_prods_softmax gradients + // matrix C's layout: [num_new_tokens, total_tokens, num_heads] + DT *C = static_cast<DT *>
(m->qk_prods_softmax); + // after transposition & striding + int m_ = num_tokens; // num_new_tokens int n_ = num_tokens; int k_ = m->vProjSize; + // before transposition and striding int lda = m->vProjSize * m->num_q_heads; int ldb = m->vProjSize * m->num_q_heads; - int ldc = num_tokens; + int ldc = num_tokens; // num_new_tokens int strideA = m->vProjSize; int strideB = m->vProjSize; - int strideC = num_tokens * num_tokens; - // matrix A: value cache - // matrix A's layout: [num_req, max_num_tokens, num_heads, vProjSize] - DT const *A = static_cast
<DT const *>(m->valueCache) + i * vt_req_block_size; - // matrix B: attn_heads gradients - // matrix B's layout: [num_new_tokens, num_heads, vProjSize] - DT const *B = static_cast<DT const *>
(m->handle.workSpace); - // matrix C: qk_prods_softmax gradients - // matrix C's layout: [num_heads, num_total_tokens, num_new_tokens] - DT *C = static_cast<DT *>
(m->qk_prods_softmax); + int strideC = num_tokens * num_tokens; // num_new_tokens * total_tokens + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, @@ -1096,27 +1099,28 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, if (*m->qk_prod_scaling) { alpha = 1.0f / sqrt(m->kProjSize); } - // matrix A: query activation (in query_activation_buffer) - // matrix A's layout: [num_tokens, num_heads, m->qProjSize] - DT const *A = static_cast
<DT const *>(m->query_activation_buffer); - // matrix B: gradients w.r.t. qk_prods - // matrix B's layout: [num_heads, num_tokens, num_tokens] - DT const *B = static_cast<DT const *>
(m->qk_prods); - // matrix C: gradients w.r.t. key (saved as part of m->devQKVProjArray) - // matrix C's layout: [num_tokens, num_heads, qProjsize + kProjSize + - // vProjSize] - DT *C = static_cast<DT *>
(m->devQKVProjArray) + m->qProjSize; - int m_ = m->kProjSize; - int n_ = num_tokens; - int k_ = num_tokens; - int lda = m->num_q_heads * m->qProjSize; - int ldb = num_tokens; - int ldc = m->num_q_heads * (m->qProjSize + m->kProjSize + m->vProjSize); - int strideA = m->qProjSize; - int strideB = num_tokens * num_tokens; - int strideC = m->qProjSize + m->kProjSize + m->vProjSize; + // matrix A: gradients w.r.t. qk_prods + // matrix A's layout: [num_new_tokens, num_tokens, num_heads] + DT const *A = static_cast
<DT const *>(m->qk_prods); + // matrix B: query activation (in query_activation_buffer) + // matrix B's layout: [m->qProjSize * num_heads, num_new_tokens] + DT const *B = static_cast<DT const *>
(m->query_activation_buffer); + // matrix C: gradients for key (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = static_cast<DT *>
(m->devQKVProjArray) + (m->qProjSize * m->num_q_heads); // skip over regions reserved for Q gradients + // after transposition & striding + int m_ = num_tokens; + int n_ = m->kProjSize; + int k_ = num_tokens; // num_new_tokens + // before transposition and striding + int lda = num_tokens; // num_new_tokens + int ldb = m->kProjSize * m->num_q_heads; + int ldc = num_tokens; + int strideA = num_tokens * num_tokens; + int strideB = m->kProjSize; + int strideC = num_tokens * m->kProjSize; checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_N, + CUBLAS_OP_T, CUBLAS_OP_T, m_, n_, @@ -1145,27 +1149,29 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, if (*m->qk_prod_scaling) { alpha = 1.0f / sqrt(m->kProjSize); } - // matrix A: key cache - // matrix A's layout: [num_tokens, num_heads, m->kProjSize] - DT const *A = static_cast
<DT const *>(m->keyCache) + i * kt_req_block_size; - // matrix B: gradients w.r.t. qk_prods - // matrix B's layout: [num_heads, num_tokens, num_tokens] - DT const *B = static_cast<DT const *>
(m->qk_prods); - // matrix C: gradients w.r.t. query (saved as part of m->devQKVProjArray) - // matrix C's layout: - // [num_tokens, num_heads, qProjsize + kProjSize + vProjSize] - DT *C = static_cast<DT *>
(m->devQKVProjArray); - int m_ = m->qProjSize; - int n_ = num_tokens; - int k_ = num_tokens; - int lda = m->kProjSize * m->num_q_heads; - int ldb = num_tokens; - int ldc = m->num_q_heads * (m->qProjSize + m->kProjSize + m->vProjSize); - int strideA = m->kProjSize; - int strideB = num_tokens * num_tokens; - int strideC = m->qProjSize + m->kProjSize + m->vProjSize; + // matrix A: gradients w.r.t. qk_prods + // matrix A's layout: [num_new_tokens, num_tokens, num_heads] + DT const *A = static_cast
<DT const *>(m->qk_prods); + // matrix B: key cache + // matrix B's layout: [vProjSize * num_heads, max_num_tokens, num_req] + DT const *B = static_cast<DT const *>
(m->keyCache) + i * kt_req_block_size; + // matrix C: gradients for query (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = static_cast<DT *>
(m->devQKVProjArray) + // after transposition & striding + // after transposition & striding + int m_ = num_tokens; + int n_ = m->qProjSize; + int k_ = num_tokens; // num_new_tokens + // before transposition and striding + int lda = num_tokens; // num_new_tokens + int ldb = m->qProjSize * m->num_q_heads; + int ldc = num_tokens; + int strideA = num_tokens * num_tokens; + int strideB = m->qProjSize; + int strideC = num_tokens * m->qProjSize; checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_N, + CUBLAS_OP_T, CUBLAS_OP_T, m_, n_, @@ -1195,26 +1201,24 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, beta = 1.0f; } // matrix A: QKV projection weights - // matrix A's layout: - // [(qProjSize + kProjSize + vProjSize) * num_q_heads, qSize] + // matrix A's layout: [qSize, qProjSize * num_q_heads, 3] DT const *A = weight_ptr; // matrix B: gradients w.r.t. QKV (concatenated in devQKVArray) - // matrix B's layout: - // [num_tokens, num_heads, qProjsize + kProjSize + vProjSize] + // matrix B's layout: [num_tokens, qProjsize * num_heads, 3] DT const *B = static_cast
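In all of the backward hunks above, m_, n_ and k_ describe the operand shapes after the requested transpositions, while lda/ldb/ldc and the strides describe how A, B and C actually sit in memory before them (cuBLAS is column-major). A host-side reference of what each cublasGemmStridedBatchedEx call computes makes that bookkeeping easier to check; the routine below is an illustrative sketch only, not part of FlexFlow:

  template <typename DT>
  void reference_gemm_strided_batched(bool transA, bool transB, int m, int n,
                                      int k, float alpha, DT const *A, int lda,
                                      size_t strideA, DT const *B, int ldb,
                                      size_t strideB, float beta, DT *C, int ldc,
                                      size_t strideC, int batch) {
    // Column-major: C_b = alpha * op(A_b) * op(B_b) + beta * C_b for each batch
    // b, where A_b = A + b * strideA (same for B and C); op(A_b) is m x k,
    // op(B_b) is k x n, and lda/ldb/ldc are the leading dimensions of the
    // untransposed buffers, exactly as passed to cuBLAS.
    for (int b = 0; b < batch; b++) {
      DT const *Ab = A + b * strideA;
      DT const *Bb = B + b * strideB;
      DT *Cb = C + b * strideC;
      for (int col = 0; col < n; col++) {
        for (int row = 0; row < m; row++) {
          float acc = 0.0f;
          for (int t = 0; t < k; t++) {
            float a = transA ? static_cast<float>(Ab[row * lda + t])   // A^T
                             : static_cast<float>(Ab[t * lda + row]);  // A
            float bv = transB ? static_cast<float>(Bb[t * ldb + col])  // B^T
                              : static_cast<float>(Bb[col * ldb + t]); // B
            acc += a * bv;
          }
          Cb[col * ldc + row] = static_cast<DT>(
              alpha * acc + beta * static_cast<float>(Cb[col * ldc + row]));
        }
      }
    }
  }

For the input-gradient GEMM that follows, dQKV is laid out as [num_tokens, qProjsize * num_heads, 3] with the token index fastest, i.e. a column-major n_ x k_ matrix with leading dimension n_ = num_tokens; recovering dX of shape [m->qSize, num_tokens] therefore needs op(B) = B^T, which is consistent with the switch to CUBLAS_OP_T and ldb = n_ below. In terms of the sketch above, that call is the single-batch case:

  reference_gemm_strided_batched(/*transA=*/false, /*transB=*/true,
                                 m_, n_, k_, alpha, A, /*lda=*/m_, 0,
                                 B, /*ldb=*/n_, 0, beta, C, /*ldc=*/m_, 0, 1);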
(m->devQKVProjArray); // matrix C: gradients w.r.t. input - // matrix C's layout: [num_tokens, m->qSize] + // matrix C's layout: [m->qSize, num_tokens] DT *C = input_grad_ptr + bc->requestsInfo[i].first_token_offset_in_batch * m->qSize; int m_ = m->qSize; int n_ = num_tokens; int k_ = m->num_q_heads * (m->qProjSize + m->kProjSize + m->vProjSize); int lda = m_; - int ldb = k_; + int ldb = n_; int ldc = m_; checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, - CUBLAS_OP_N, + CUBLAS_OP_T, m_, n_, k_, From 3955b0bebdfdc636c0947b1373fe66213d61691f Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 30 Nov 2023 17:12:47 -0500 Subject: [PATCH 112/198] fix --- src/ops/inc_multihead_self_attention.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index b5ed032137..ea60a48e75 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1157,7 +1157,7 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, DT const *B = static_cast
<DT const *>(m->keyCache) + i * kt_req_block_size; // matrix C: gradients for query (saved as part of m->devQKVProjArray) // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] - DT *C = static_cast
(m->devQKVProjArray) + DT *C = static_cast
(m->devQKVProjArray); // after transposition & striding // after transposition & striding int m_ = num_tokens; From c5346381bfb0489379eb6f429d066855adb62c1b Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 3 Dec 2023 11:51:06 -0500 Subject: [PATCH 113/198] Update inc_multihead_self_attention.cu --- src/ops/inc_multihead_self_attention.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index ea60a48e75..89f0c1f3e7 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -967,7 +967,7 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, DT const *B = static_cast
<DT const *>(m->handle.workSpace); // matrix C: gradients for value (saved as part of m->devQKVProjArray) // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] - DT *C = static_cast
(m->devQKVProjArray) + 2*(m->qProjSize * m->num_q_heads); // skip over regions reserved for Q and K gradients + DT *C = static_cast
(m->devQKVProjArray) + 2 * num_tokens * (m->qProjSize * m->num_q_heads); // skip over regions reserved for Q and K gradients // after transpositions int m_ = num_tokens; // total_tokens int n_ = m->vProjSize; // num_new_tokens @@ -1107,7 +1107,7 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, DT const *B = static_cast
<DT const *>(m->query_activation_buffer); // matrix C: gradients for key (saved as part of m->devQKVProjArray) // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] - DT *C = static_cast
(m->devQKVProjArray) + (m->qProjSize * m->num_q_heads); // skip over regions reserved for Q gradients + DT *C = static_cast
(m->devQKVProjArray) + num_tokens * (m->qProjSize * m->num_q_heads); // skip over regions reserved for Q gradients // after transposition & striding int m_ = num_tokens; int n_ = m->kProjSize; From fd956c95a8d9c719342c3a659ca4b258cc117012 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 4 Dec 2023 01:43:18 -0500 Subject: [PATCH 114/198] Update inc_multihead_self_attention.cu --- src/ops/inc_multihead_self_attention.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 89f0c1f3e7..e273e1bb6c 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1160,9 +1160,9 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, DT *C = static_cast
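The two offset fixes above follow directly from the [num_tokens, qProjsize * num_heads, 3] layout of m->devQKVProjArray: each of the Q, K and V gradient blocks spans num_tokens * qProjSize * num_q_heads elements, so the K block starts one full block in and the V block two. A small sketch of that arithmetic; the helper is hypothetical and assumes qProjSize == kProjSize == vProjSize, which is what the single-size layout implies:

  // Start of the Q (which = 0), K (1) or V (2) gradient block inside
  // devQKVProjArray for the current request's tokens.
  template <typename DT>
  DT *qkv_grad_block(DT *devQKVProjArray, int which, int num_tokens,
                     int qProjSize, int num_q_heads) {
    size_t block = static_cast<size_t>(num_tokens) * qProjSize * num_q_heads;
    return devQKVProjArray + which * block;
  }

Under this convention the value-gradient pointer is qkv_grad_block(devQKVProjArray, 2, ...), i.e. devQKVProjArray + 2 * num_tokens * qProjSize * num_q_heads, and the key-gradient pointer is block 1, matching the corrected offsets in both hunks.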
(m->devQKVProjArray); // after transposition & striding // after transposition & striding - int m_ = num_tokens; + int m_ = num_tokens; // num_new_tokens int n_ = m->qProjSize; - int k_ = num_tokens; // num_new_tokens + int k_ = num_tokens; // before transposition and striding int lda = num_tokens; // num_new_tokens int ldb = m->qProjSize * m->num_q_heads; @@ -1171,7 +1171,7 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, int strideB = m->qProjSize; int strideC = num_tokens * m->qProjSize; checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, + CUBLAS_OP_N, CUBLAS_OP_T, m_, n_, From 3a34c88f901e5b3271f06682f19d08e1a052baff Mon Sep 17 00:00:00 2001 From: Xinhao Cheng <99570243+xinhaoc@users.noreply.github.com> Date: Tue, 5 Dec 2023 23:40:18 -0500 Subject: [PATCH 115/198] use grad to store peft in/output (#1241) * use grad to store peft in/output * format * . --- src/ops/add_bias_residual_layer_norm.cc | 48 +++++++++-------------- src/ops/fused.cc | 42 ++++++++------------ src/ops/inc_multihead_self_attention.cc | 44 +++++++-------------- src/ops/layer_norm.cc | 35 ++++++----------- src/ops/linear.cc | 45 ++++++++-------------- src/ops/lora_linear.cc | 27 +++++-------- src/ops/residual_layer_norm.cc | 51 +++++++++---------------- src/ops/residual_rms_norm.cc | 43 ++++++++------------- src/ops/rms_norm.cc | 35 ++++++----------- src/ops/sigmoid_silu_multi.cc | 14 +++---- src/ops/softmax.cc | 27 +++++-------- src/parallel_ops/allreduce.cc | 24 ++++-------- src/runtime/inference_manager.cc | 6 +++ 13 files changed, 157 insertions(+), 284 deletions(-) diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index 82c71f517f..1f03d566ac 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -910,50 +910,36 @@ Legion::FutureMap AddBiasResidualLayerNorm::peft_bwd( set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); IndexLauncher launcher(ADD_BIAS_RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, - parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, + parallel_is, TaskArgument(NULL, 0), argmap, + Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); int field_id = 0; // output_grad - launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_outputs[1]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[1]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_outputs[1]->region_grad)); launcher.add_field(field_id++, FID_DATA); // input grad - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(field_id++, FID_DATA); // residual grad - launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[1]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[1]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_inputs[1]->region_grad)); launcher.add_field(field_id++, FID_DATA); // attn bias grad - 
launcher.add_region_requirement(RegionRequirement(batch_inputs[2]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[2]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[2]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_inputs[2]->region_grad)); launcher.add_field(field_id++, FID_DATA); if (elementwise_affine) { // gamma - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); + launcher.add_region_requirement( + RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, + EXCLUSIVE, weights[0]->region)); launcher.add_field(field_id++, FID_DATA); } return runtime->execute_index_space(ctx, launcher); diff --git a/src/ops/fused.cc b/src/ops/fused.cc index bbd99c5986..b7dbcaccb1 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -487,45 +487,33 @@ FutureMap FusedOp::inference(FFModel const &ff, // so we transfer the maximum of them // size_t batch_config_size = // std::max(sizeof(TreeVerifyBatchConfig), sizeof(BeamSearchBatchConfig)); - IndexLauncher launcher(FUSEDOP_INF_TASK_ID, - parallel_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); + IndexLauncher launcher(FUSEDOP_INF_TASK_ID, parallel_is, + TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, + false /*must*/, 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); int offset = 0; for (int i = 0; i < numInputs; i++) { assert(inputs[i]->part != LogicalPartition::NO_PART); assert(inputs[i]->region != LogicalRegion::NO_REGION); - launcher.add_region_requirement(RegionRequirement(batch_inputs[i]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[i]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[i]->part, 0 /*projection id*/, READ_ONLY, + EXCLUSIVE, batch_inputs[i]->region)); launcher.add_field(offset + i, FID_DATA); } offset += numInputs; for (int i = 0; i < numWeights; i++) { assert(weights[i]->region != LogicalRegion::NO_REGION); - launcher.add_region_requirement(RegionRequirement(weights[i]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[i]->region)); + launcher.add_region_requirement( + RegionRequirement(weights[i]->part, 0 /*projection id*/, READ_ONLY, + EXCLUSIVE, weights[i]->region)); launcher.add_field(offset + i, FID_DATA); } offset += numWeights; for (int i = 0; i < numOutputs; i++) { assert(outputs[i]->region != LogicalRegion::NO_REGION); launcher.add_region_requirement( - RegionRequirement(batch_outputs[i]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[i]->region)); + RegionRequirement(batch_outputs[i]->part, 0 /*projection id*/, + WRITE_ONLY, EXCLUSIVE, batch_outputs[i]->region)); launcher.add_field(offset + i, FID_DATA); } return runtime->execute_index_space(ctx, launcher); @@ -561,11 +549,11 @@ FutureMap FusedOp::peft_bwd(FFModel const &ff, for (int i = 0; i < numInputs; i++) { assert(inputs[i]->part != LogicalPartition::NO_PART); assert(inputs[i]->region != LogicalRegion::NO_REGION); - launcher.add_region_requirement(RegionRequirement(batch_inputs[i]->part, + launcher.add_region_requirement(RegionRequirement(batch_inputs[i]->part_grad, 0 /*projection id*/, READ_WRITE, EXCLUSIVE, - batch_inputs[i]->region)); + batch_inputs[i]->region_grad)); launcher.add_field(offset + i, FID_DATA); } offset += numInputs; @@ -582,11 +570,11 @@ FutureMap FusedOp::peft_bwd(FFModel const &ff, for (int 
i = 0; i < numOutputs; i++) { assert(outputs[i]->region != LogicalRegion::NO_REGION); launcher.add_region_requirement( - RegionRequirement(batch_outputs[i]->part, + RegionRequirement(batch_outputs[i]->part_grad, 0 /*projection id*/, READ_WRITE, EXCLUSIVE, - batch_outputs[i]->region)); + batch_outputs[i]->region_grad)); launcher.add_field(offset + i, FID_DATA); } return runtime->execute_index_space(ctx, launcher); diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index b66d524303..66197b174e 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -891,42 +891,26 @@ FutureMap IncMultiHeadSelfAttention::peft_bwd( size_t machine_view_hash = view->hash(); int idx = 0; IndexLauncher launcher(INC_MULTIHEAD_SELF_ATTENTION_PEFT_BWD_TASK_ID, - parallel_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, + parallel_is, TaskArgument(nullptr, 0), argmap, + Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(idx++, FID_DATA); launcher.add_region_requirement( - RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region, - ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); + RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(idx++, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); + launcher.add_region_requirement(RegionRequirement( + weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, + weights[0]->region, ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); + launcher.add_field(idx++, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_outputs[0]->region_grad)); launcher.add_field(idx++, FID_DATA); if (qkv_bias || final_bias) { - launcher.add_region_requirement( - RegionRequirement(weights[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[1]->region, - ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); + launcher.add_region_requirement(RegionRequirement( + weights[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, + weights[1]->region, ff.cpu_offload ? 
MAP_TO_ZC_MEMORY : 0)); launcher.add_field(idx++, FID_DATA); } return runtime->execute_index_space(ctx, launcher); diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index ba2d43022f..915bd0d1a7 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -661,36 +661,25 @@ Legion::FutureMap size_t machine_view_hash = view->hash(); /* std::cout << "LayerNorm op machine_view: " << *(MachineView const *)mv << std::endl; */ - IndexLauncher launcher(LAYERNORM_PEFT_BWD_TASK_ID, - parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); + IndexLauncher launcher(LAYERNORM_PEFT_BWD_TASK_ID, parallel_is, + TaskArgument(NULL, 0), argmap, Predicate::TRUE_PRED, + false /*must*/, 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); // regions[0](I): output_grad - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_outputs[0]->region_grad)); launcher.add_field(0, FID_DATA); // regions[1](I/O): input_grad - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(2, FID_DATA); if (elementwise_affine) { // regions[2](I): gamma - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); + launcher.add_region_requirement( + RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, + EXCLUSIVE, weights[0]->region)); launcher.add_field(3, FID_DATA); } return runtime->execute_index_space(ctx, launcher); diff --git a/src/ops/linear.cc b/src/ops/linear.cc index fa74e22fc6..13f2ae0a7a 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -688,41 +688,26 @@ FutureMap Linear::peft_bwd(FFModel const &ff, size_t machine_view_hash = view->hash(); /* std::cout << "Linear op machine_view: " << *(MachineView const *)mv << std::endl; */ - IndexLauncher launcher(LINEAR_PEFT_BWD_TASK_ID, - parallel_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); + IndexLauncher launcher(LINEAR_PEFT_BWD_TASK_ID, parallel_is, + TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, + false /*must*/, 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(1, FID_DATA); launcher.add_region_requirement( - RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region, - ff.cpu_offload ? 
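From this commit onward every peft_bwd launcher binds the PEFT gradient buffers through part_grad / region_grad instead of the forward part / region pair, so backward tasks operate on a dedicated gradient region and leave the forward activations untouched. A minimal sketch of the matching task-side mapping, assuming FlexFlow's usual accessor helpers and the region ordering used in Linear::peft_bwd above (input gradient in regions[0], output gradient in regions[1]); illustrative only, not taken from the patch:

  GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW(
      m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime);
  GenericTensorAccessorW output_grad = helperGetGenericTensorAccessorRW(
      m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime);
  // Both accessors now resolve to the region_grad instances wired by the
  // launcher, not to the forward `region` that holds activations.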
MAP_TO_ZC_MEMORY : 0)); + RegionRequirement(batch_outputs[0]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_outputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement(RegionRequirement( + weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, + weights[0]->region, ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); launcher.add_field(2, FID_DATA); if (use_bias) { - launcher.add_region_requirement(RegionRequirement(weights[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[1]->region)); + launcher.add_region_requirement( + RegionRequirement(weights[1]->part, 0 /*projection id*/, READ_ONLY, + EXCLUSIVE, weights[1]->region)); launcher.add_field(3, FID_DATA); } return runtime->execute_index_space(ctx, launcher); diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 05edeab833..050349ccb7 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -577,26 +577,17 @@ FutureMap LoraLinear::peft_bwd(FFModel const &ff, MachineView const *view = mv ? mv : &output_tensor->machine_view; set_argumentmap_for_inference(ff, argmap, output_tensor); size_t machine_view_hash = view->hash(); - IndexLauncher launcher(LORA_LINEAR_PEFT_BWD_TASK_ID, - parallel_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); + IndexLauncher launcher(LORA_LINEAR_PEFT_BWD_TASK_ID, parallel_is, + TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, + false /*must*/, 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[1]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[1]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_inputs[1]->region_grad)); launcher.add_field(1, FID_DATA); return runtime->execute_index_space(ctx, launcher); } diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index 4bee47de6c..fe8f0094cb 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -701,53 +701,38 @@ Legion::FutureMap ResidualLayerNorm::peft_bwd( MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); - IndexLauncher launcher(RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, - parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); + IndexLauncher launcher(RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, parallel_is, + TaskArgument(NULL, 0), argmap, Predicate::TRUE_PRED, + false /*must*/, 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); int field_id = 0; // output_grad - launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_outputs[1]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[1]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_outputs[1]->region_grad)); launcher.add_field(field_id++, FID_DATA); // input grad - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(field_id++, FID_DATA); // residual grad 1 - launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[1]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[1]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_inputs[1]->region_grad)); launcher.add_field(field_id++, FID_DATA); if (use_two_residuals) { // residual grad 2 - launcher.add_region_requirement(RegionRequirement(batch_inputs[2]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[2]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[2]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_inputs[2]->region_grad)); launcher.add_field(field_id++, FID_DATA); } if (elementwise_affine) { // gamma - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); + launcher.add_region_requirement( + RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, + EXCLUSIVE, weights[0]->region)); launcher.add_field(field_id++, FID_DATA); } return runtime->execute_index_space(ctx, launcher); diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index 953dd60242..09e6327de7 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -630,42 +630,29 @@ Legion::FutureMap MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); - IndexLauncher launcher(RESIDUAL_RMSNORM_PEFT_BWD_TASK_ID, - parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); + IndexLauncher launcher(RESIDUAL_RMSNORM_PEFT_BWD_TASK_ID, parallel_is, + TaskArgument(NULL, 0), argmap, Predicate::TRUE_PRED, + false /*must*/, 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); // regions[0](I): RMS output_grad - launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_outputs[1]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[1]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_outputs[1]->region_grad)); launcher.add_field(0, FID_DATA); // regions[2](I/O): residual input grad 0 - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(1, FID_DATA); // regions[3](I/O): residual input grad 1 - launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[1]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[1]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_inputs[1]->region_grad)); launcher.add_field(2, FID_DATA); // regions[4](I): gamma - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); + launcher.add_region_requirement( + RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, + EXCLUSIVE, weights[0]->region)); launcher.add_field(3, FID_DATA); return runtime->execute_index_space(ctx, launcher); } diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index 5a8cfe8eff..b2d3d4521b 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -527,35 +527,24 @@ Legion::FutureMap MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); - IndexLauncher launcher(RMSNORM_PEFT_BWD_TASK_ID, - parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); + IndexLauncher launcher(RMSNORM_PEFT_BWD_TASK_ID, parallel_is, + TaskArgument(NULL, 0), argmap, Predicate::TRUE_PRED, + false /*must*/, 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); // regions[0](I): output_grad - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_outputs[0]->region_grad)); launcher.add_field(0, FID_DATA); // regions[1](I/O): input_grad - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(1, FID_DATA); // regions[2](I): weight - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); + launcher.add_region_requirement( + RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, + EXCLUSIVE, weights[0]->region)); launcher.add_field(2, FID_DATA); return runtime->execute_index_space(ctx, launcher); diff --git a/src/ops/sigmoid_silu_multi.cc b/src/ops/sigmoid_silu_multi.cc index d064bd0a1c..acca39ab33 100644 --- a/src/ops/sigmoid_silu_multi.cc +++ b/src/ops/sigmoid_silu_multi.cc @@ -373,25 +373,25 @@ FutureMap machine_view_hash); launcher.add_future(bc); // output grad - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part_grad, 0 /*projection id*/, - READ_ONLY, + READ_WRITE, EXCLUSIVE, - batch_outputs[0]->region)); + batch_outputs[0]->region_grad)); launcher.add_field(0, FID_DATA); // input 1 grad - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, READ_WRITE, EXCLUSIVE, - batch_inputs[0]->region)); + batch_inputs[0]->region_grad)); launcher.add_field(1, FID_DATA); // input 2 grad - launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part_grad, 0 /*projection id*/, READ_WRITE, EXCLUSIVE, - batch_inputs[1]->region)); + batch_inputs[1]->region_grad)); launcher.add_field(2, FID_DATA); return runtime->execute_index_space(ctx, launcher); } diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 88ffec3642..d852e09b46 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -399,26 +399,17 @@ FutureMap Softmax::peft_bwd(FFModel const &ff, size_t machine_view_hash = view->hash(); /* std::cout << "Softmax op machine_view: " << *(MachineView const *)mv << std::endl; */ - IndexLauncher launcher(SOFTMAX_PEFT_BWD_TASK_ID, - parallel_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); + IndexLauncher launcher(SOFTMAX_PEFT_BWD_TASK_ID, parallel_is, + TaskArgument(nullptr, 
0), argmap, Predicate::TRUE_PRED, + false /*must*/, 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_outputs[0]->region_grad)); launcher.add_field(1, FID_DATA); return runtime->execute_index_space(ctx, launcher); } diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc index 7f147dad6f..78ce807aa6 100644 --- a/src/parallel_ops/allreduce.cc +++ b/src/parallel_ops/allreduce.cc @@ -355,24 +355,16 @@ FutureMap AllReduce::peft_bwd(FFModel const &ff, set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); IndexLauncher launcher(ALLREDUCE_PEFT_BWD_TASK_ID, batch_outputs[0]->parallel_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); + TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, + false /*must*/, 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, 0 /*projection id*/, + READ_WRITE, EXCLUSIVE, batch_outputs[0]->region_grad)); launcher.add_field(1, FID_DATA); return runtime->execute_index_space(ctx, launcher); } diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 81a72a5c12..39d3ecdf81 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -229,6 +229,12 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { pt_base->region.get_field_space()); pt->part = runtime->get_logical_partition( ctx, pt->region, pt_base->part.get_index_partition()); + + pt->region_grad = runtime->create_logical_region( + ctx, pt_base->region.get_index_space(), + pt_base->region.get_field_space()); + pt->part_grad = runtime->get_logical_partition( + ctx, pt->region_grad, pt_base->part.get_index_partition()); pt->machine_view = machine_views[j]; // std::cout << "output mv: " << pt->machine_view << std::endl; Domain part_domain = From 94230d92c54c574a96a44995bc9b52e64e1a1341 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Tue, 5 Dec 2023 23:41:12 -0500 Subject: [PATCH 116/198] format --- src/ops/add_bias_residual_layer_norm.cc | 44 ++++++++++++++++------- src/ops/fused.cc | 45 ++++++++++++++--------- src/ops/inc_multihead_self_attention.cc | 42 +++++++++++++++------- src/ops/inc_multihead_self_attention.cu | 19 ++++++---- src/ops/layer_norm.cc | 33 +++++++++++------ src/ops/linear.cc | 43 
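The inference_manager.cc hunk above gives every PEFT tensor a gradient twin: region_grad is created from the same index space and field space as the forward region, and part_grad reuses the forward index partition, so gradients are tiled across devices exactly like the activations they correspond to. A stand-alone sketch of that pairing, hypothetical but using only the Legion calls already present in the hunk:

  Legion::LogicalRegion make_grad_twin(Legion::Context ctx,
                                       Legion::Runtime *runtime,
                                       Legion::LogicalRegion base_region,
                                       Legion::LogicalPartition base_part,
                                       Legion::LogicalPartition &part_grad) {
    Legion::LogicalRegion region_grad = runtime->create_logical_region(
        ctx, base_region.get_index_space(), base_region.get_field_space());
    part_grad = runtime->get_logical_partition(
        ctx, region_grad, base_part.get_index_partition());
    return region_grad;
  }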
+++++++++++++++------- src/ops/lora_linear.cc | 25 +++++++++---- src/ops/residual_layer_norm.cc | 47 +++++++++++++++++-------- src/ops/residual_rms_norm.cc | 40 ++++++++++++++------- src/ops/rms_norm.cc | 33 +++++++++++------ src/ops/sigmoid_silu_multi.cc | 33 +++++++++-------- src/ops/softmax.cc | 25 +++++++++---- src/parallel_ops/allreduce.cc | 22 ++++++++---- src/runtime/inference_manager.cc | 7 ++-- 14 files changed, 314 insertions(+), 144 deletions(-) diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index 1f03d566ac..be7b357f23 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -910,36 +910,54 @@ Legion::FutureMap AddBiasResidualLayerNorm::peft_bwd( set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); IndexLauncher launcher(ADD_BIAS_RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, - parallel_is, TaskArgument(NULL, 0), argmap, - Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); int field_id = 0; // output_grad launcher.add_region_requirement( - RegionRequirement(batch_outputs[1]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_outputs[1]->region_grad)); + RegionRequirement(batch_outputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[1]->region_grad)); launcher.add_field(field_id++, FID_DATA); // input grad launcher.add_region_requirement( - RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); launcher.add_field(field_id++, FID_DATA); // residual grad launcher.add_region_requirement( - RegionRequirement(batch_inputs[1]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_inputs[1]->region_grad)); + RegionRequirement(batch_inputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region_grad)); launcher.add_field(field_id++, FID_DATA); // attn bias grad launcher.add_region_requirement( - RegionRequirement(batch_inputs[2]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_inputs[2]->region_grad)); + RegionRequirement(batch_inputs[2]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[2]->region_grad)); launcher.add_field(field_id++, FID_DATA); if (elementwise_affine) { // gamma - launcher.add_region_requirement( - RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, - EXCLUSIVE, weights[0]->region)); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); launcher.add_field(field_id++, FID_DATA); } return runtime->execute_index_space(ctx, launcher); diff --git a/src/ops/fused.cc b/src/ops/fused.cc index b7dbcaccb1..ea1c970cc5 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -487,33 +487,45 @@ FutureMap FusedOp::inference(FFModel const &ff, // so we transfer the maximum of them // size_t batch_config_size = // std::max(sizeof(TreeVerifyBatchConfig), sizeof(BeamSearchBatchConfig)); - IndexLauncher launcher(FUSEDOP_INF_TASK_ID, parallel_is, - TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, - false /*must*/, 0 /*mapper_id*/, machine_view_hash); + IndexLauncher 
launcher(FUSEDOP_INF_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); launcher.add_future(bc); int offset = 0; for (int i = 0; i < numInputs; i++) { assert(inputs[i]->part != LogicalPartition::NO_PART); assert(inputs[i]->region != LogicalRegion::NO_REGION); - launcher.add_region_requirement( - RegionRequirement(batch_inputs[i]->part, 0 /*projection id*/, READ_ONLY, - EXCLUSIVE, batch_inputs[i]->region)); + launcher.add_region_requirement(RegionRequirement(batch_inputs[i]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[i]->region)); launcher.add_field(offset + i, FID_DATA); } offset += numInputs; for (int i = 0; i < numWeights; i++) { assert(weights[i]->region != LogicalRegion::NO_REGION); - launcher.add_region_requirement( - RegionRequirement(weights[i]->part, 0 /*projection id*/, READ_ONLY, - EXCLUSIVE, weights[i]->region)); + launcher.add_region_requirement(RegionRequirement(weights[i]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[i]->region)); launcher.add_field(offset + i, FID_DATA); } offset += numWeights; for (int i = 0; i < numOutputs; i++) { assert(outputs[i]->region != LogicalRegion::NO_REGION); launcher.add_region_requirement( - RegionRequirement(batch_outputs[i]->part, 0 /*projection id*/, - WRITE_ONLY, EXCLUSIVE, batch_outputs[i]->region)); + RegionRequirement(batch_outputs[i]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[i]->region)); launcher.add_field(offset + i, FID_DATA); } return runtime->execute_index_space(ctx, launcher); @@ -549,11 +561,12 @@ FutureMap FusedOp::peft_bwd(FFModel const &ff, for (int i = 0; i < numInputs; i++) { assert(inputs[i]->part != LogicalPartition::NO_PART); assert(inputs[i]->region != LogicalRegion::NO_REGION); - launcher.add_region_requirement(RegionRequirement(batch_inputs[i]->part_grad, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[i]->region_grad)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[i]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[i]->region_grad)); launcher.add_field(offset + i, FID_DATA); } offset += numInputs; diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 66197b174e..ca6eb7c095 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -891,26 +891,44 @@ FutureMap IncMultiHeadSelfAttention::peft_bwd( size_t machine_view_hash = view->hash(); int idx = 0; IndexLauncher launcher(INC_MULTIHEAD_SELF_ATTENTION_PEFT_BWD_TASK_ID, - parallel_is, TaskArgument(nullptr, 0), argmap, - Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); launcher.add_region_requirement( - RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); launcher.add_field(idx++, FID_DATA); - launcher.add_region_requirement(RegionRequirement( - weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - weights[0]->region, ff.cpu_offload ? 
MAP_TO_ZC_MEMORY : 0)); + launcher.add_region_requirement( + RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region, + ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); launcher.add_field(idx++, FID_DATA); launcher.add_region_requirement( - RegionRequirement(batch_outputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_outputs[0]->region_grad)); + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); launcher.add_field(idx++, FID_DATA); if (qkv_bias || final_bias) { - launcher.add_region_requirement(RegionRequirement( - weights[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - weights[1]->region, ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); + launcher.add_region_requirement( + RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region, + ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); launcher.add_field(idx++, FID_DATA); } return runtime->execute_index_space(ctx, launcher); diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index cd7cecaf91..baa24b7c00 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -967,11 +967,14 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, DT const *B = static_cast
<DT const *>(m->handle.workSpace); // matrix C: gradients for value (saved as part of m->devQKVProjArray) // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] - DT *C = static_cast<DT *>
(m->devQKVProjArray) + 2 * num_tokens * (m->qProjSize * m->num_q_heads); // skip over regions reserved for Q and K gradients + DT *C = static_cast<DT *>
(m->devQKVProjArray) + + 2 * num_tokens * + (m->qProjSize * m->num_q_heads); // skip over regions reserved + // for Q and K gradients // after transpositions - int m_ = num_tokens; // total_tokens + int m_ = num_tokens; // total_tokens int n_ = m->vProjSize; // num_new_tokens - int k_ = num_tokens; // num_new_tokens + int k_ = num_tokens; // num_new_tokens // before transpositions int lda = num_tokens; // num_new_tokens int ldb = m->vProjSize * m->num_q_heads; @@ -1027,7 +1030,7 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, int strideA = m->vProjSize; int strideB = m->vProjSize; int strideC = num_tokens * num_tokens; // num_new_tokens * total_tokens - + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, @@ -1107,7 +1110,11 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, DT const *B = static_cast
<DT const *>(m->query_activation_buffer); // matrix C: gradients for key (saved as part of m->devQKVProjArray) // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] - DT *C = static_cast<DT *>
(m->devQKVProjArray) + num_tokens * (m->qProjSize * m->num_q_heads); // skip over regions reserved for Q gradients + DT *C = + static_cast<DT *>
(m->devQKVProjArray) + + num_tokens * + (m->qProjSize * + m->num_q_heads); // skip over regions reserved for Q gradients // after transposition & striding int m_ = num_tokens; int n_ = m->kProjSize; @@ -1162,7 +1169,7 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, // after transposition & striding int m_ = num_tokens; // num_new_tokens int n_ = m->qProjSize; - int k_ = num_tokens; + int k_ = num_tokens; // before transposition and striding int lda = num_tokens; // num_new_tokens int ldb = m->qProjSize * m->num_q_heads; diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 915bd0d1a7..d4b5d6a543 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -661,25 +661,38 @@ Legion::FutureMap size_t machine_view_hash = view->hash(); /* std::cout << "LayerNorm op machine_view: " << *(MachineView const *)mv << std::endl; */ - IndexLauncher launcher(LAYERNORM_PEFT_BWD_TASK_ID, parallel_is, - TaskArgument(NULL, 0), argmap, Predicate::TRUE_PRED, - false /*must*/, 0 /*mapper_id*/, machine_view_hash); + IndexLauncher launcher(LAYERNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); launcher.add_future(bc); // regions[0](I): output_grad launcher.add_region_requirement( - RegionRequirement(batch_outputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_outputs[0]->region_grad)); + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); launcher.add_field(0, FID_DATA); // regions[1](I/O): input_grad launcher.add_region_requirement( - RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); launcher.add_field(2, FID_DATA); if (elementwise_affine) { // regions[2](I): gamma - launcher.add_region_requirement( - RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, - EXCLUSIVE, weights[0]->region)); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); launcher.add_field(3, FID_DATA); } return runtime->execute_index_space(ctx, launcher); diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 13f2ae0a7a..e71be3bbf4 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -688,26 +688,43 @@ FutureMap Linear::peft_bwd(FFModel const &ff, size_t machine_view_hash = view->hash(); /* std::cout << "Linear op machine_view: " << *(MachineView const *)mv << std::endl; */ - IndexLauncher launcher(LINEAR_PEFT_BWD_TASK_ID, parallel_is, - TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, - false /*must*/, 0 /*mapper_id*/, machine_view_hash); + IndexLauncher launcher(LINEAR_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); launcher.add_future(bc); launcher.add_region_requirement( - RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); launcher.add_field(0, FID_DATA); launcher.add_region_requirement( - RegionRequirement(batch_outputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, 
EXCLUSIVE, batch_outputs[0]->region_grad)); + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement( - weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - weights[0]->region, ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); + launcher.add_region_requirement( + RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region, + ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); launcher.add_field(2, FID_DATA); if (use_bias) { - launcher.add_region_requirement( - RegionRequirement(weights[1]->part, 0 /*projection id*/, READ_ONLY, - EXCLUSIVE, weights[1]->region)); + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); launcher.add_field(3, FID_DATA); } return runtime->execute_index_space(ctx, launcher); diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 050349ccb7..9ed411397d 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -577,17 +577,28 @@ FutureMap LoraLinear::peft_bwd(FFModel const &ff, MachineView const *view = mv ? mv : &output_tensor->machine_view; set_argumentmap_for_inference(ff, argmap, output_tensor); size_t machine_view_hash = view->hash(); - IndexLauncher launcher(LORA_LINEAR_PEFT_BWD_TASK_ID, parallel_is, - TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, - false /*must*/, 0 /*mapper_id*/, machine_view_hash); + IndexLauncher launcher(LORA_LINEAR_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); launcher.add_future(bc); launcher.add_region_requirement( - RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); launcher.add_field(0, FID_DATA); launcher.add_region_requirement( - RegionRequirement(batch_inputs[1]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_inputs[1]->region_grad)); + RegionRequirement(batch_inputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region_grad)); launcher.add_field(1, FID_DATA); return runtime->execute_index_space(ctx, launcher); } diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index fe8f0094cb..c142e47e62 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -701,38 +701,57 @@ Legion::FutureMap ResidualLayerNorm::peft_bwd( MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); - IndexLauncher launcher(RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, parallel_is, - TaskArgument(NULL, 0), argmap, Predicate::TRUE_PRED, - false /*must*/, 0 /*mapper_id*/, machine_view_hash); + IndexLauncher launcher(RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); launcher.add_future(bc); int field_id = 0; // output_grad launcher.add_region_requirement( - RegionRequirement(batch_outputs[1]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_outputs[1]->region_grad)); + RegionRequirement(batch_outputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[1]->region_grad)); launcher.add_field(field_id++, FID_DATA); // input grad launcher.add_region_requirement( - RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); launcher.add_field(field_id++, FID_DATA); // residual grad 1 launcher.add_region_requirement( - RegionRequirement(batch_inputs[1]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_inputs[1]->region_grad)); + RegionRequirement(batch_inputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region_grad)); launcher.add_field(field_id++, FID_DATA); if (use_two_residuals) { // residual grad 2 launcher.add_region_requirement( - RegionRequirement(batch_inputs[2]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_inputs[2]->region_grad)); + RegionRequirement(batch_inputs[2]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[2]->region_grad)); launcher.add_field(field_id++, FID_DATA); } if (elementwise_affine) { // gamma - launcher.add_region_requirement( - RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, - EXCLUSIVE, weights[0]->region)); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); launcher.add_field(field_id++, FID_DATA); } return runtime->execute_index_space(ctx, launcher); diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index 09e6327de7..28dd7e2745 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -630,29 +630,45 @@ Legion::FutureMap MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); - IndexLauncher launcher(RESIDUAL_RMSNORM_PEFT_BWD_TASK_ID, parallel_is, - TaskArgument(NULL, 0), argmap, Predicate::TRUE_PRED, - false /*must*/, 0 /*mapper_id*/, machine_view_hash); + IndexLauncher launcher(RESIDUAL_RMSNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); launcher.add_future(bc); // regions[0](I): RMS output_grad launcher.add_region_requirement( - RegionRequirement(batch_outputs[1]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_outputs[1]->region_grad)); + RegionRequirement(batch_outputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[1]->region_grad)); launcher.add_field(0, FID_DATA); // regions[2](I/O): residual input grad 0 launcher.add_region_requirement( - RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); launcher.add_field(1, FID_DATA); // regions[3](I/O): residual input grad 1 launcher.add_region_requirement( - RegionRequirement(batch_inputs[1]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_inputs[1]->region_grad)); + RegionRequirement(batch_inputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region_grad)); launcher.add_field(2, FID_DATA); // regions[4](I): gamma - launcher.add_region_requirement( - RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, - EXCLUSIVE, weights[0]->region)); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); launcher.add_field(3, FID_DATA); return runtime->execute_index_space(ctx, launcher); } diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index b2d3d4521b..a1749d66af 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -527,24 +527,37 @@ Legion::FutureMap MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); - IndexLauncher launcher(RMSNORM_PEFT_BWD_TASK_ID, parallel_is, - TaskArgument(NULL, 0), argmap, Predicate::TRUE_PRED, - false /*must*/, 0 /*mapper_id*/, machine_view_hash); + IndexLauncher launcher(RMSNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); launcher.add_future(bc); // regions[0](I): output_grad launcher.add_region_requirement( - RegionRequirement(batch_outputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_outputs[0]->region_grad)); + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); launcher.add_field(0, FID_DATA); // regions[1](I/O): input_grad launcher.add_region_requirement( - RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); launcher.add_field(1, FID_DATA); // regions[2](I): weight - launcher.add_region_requirement( - RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, - EXCLUSIVE, weights[0]->region)); + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); launcher.add_field(2, FID_DATA); return runtime->execute_index_space(ctx, launcher); diff --git a/src/ops/sigmoid_silu_multi.cc b/src/ops/sigmoid_silu_multi.cc index acca39ab33..c01f47aa21 100644 --- a/src/ops/sigmoid_silu_multi.cc +++ b/src/ops/sigmoid_silu_multi.cc @@ -373,25 +373,28 @@ FutureMap machine_view_hash); launcher.add_future(bc); // output grad - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part_grad, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_outputs[0]->region_grad)); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); launcher.add_field(0, FID_DATA); // input 1 grad - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part_grad, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region_grad)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); launcher.add_field(1, FID_DATA); // input 2 grad - launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part_grad, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[1]->region_grad)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region_grad)); launcher.add_field(2, FID_DATA); return runtime->execute_index_space(ctx, launcher); } diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index d852e09b46..23f2eb9edf 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -399,17 +399,28 @@ FutureMap Softmax::peft_bwd(FFModel const &ff, size_t machine_view_hash = view->hash(); /* std::cout << "Softmax op machine_view: " << *(MachineView const *)mv << std::endl; */ - IndexLauncher launcher(SOFTMAX_PEFT_BWD_TASK_ID, parallel_is, - TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, - false 
/*must*/, 0 /*mapper_id*/, machine_view_hash); + IndexLauncher launcher(SOFTMAX_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); launcher.add_future(bc); launcher.add_region_requirement( - RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); launcher.add_field(0, FID_DATA); launcher.add_region_requirement( - RegionRequirement(batch_outputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_outputs[0]->region_grad)); + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); launcher.add_field(1, FID_DATA); return runtime->execute_index_space(ctx, launcher); } diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc index 78ce807aa6..4478a2aedc 100644 --- a/src/parallel_ops/allreduce.cc +++ b/src/parallel_ops/allreduce.cc @@ -355,16 +355,26 @@ FutureMap AllReduce::peft_bwd(FFModel const &ff, set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); IndexLauncher launcher(ALLREDUCE_PEFT_BWD_TASK_ID, batch_outputs[0]->parallel_is, - TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, - false /*must*/, 0 /*mapper_id*/, machine_view_hash); + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); launcher.add_future(bc); launcher.add_region_requirement( - RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); launcher.add_field(0, FID_DATA); launcher.add_region_requirement( - RegionRequirement(batch_outputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, EXCLUSIVE, batch_outputs[0]->region_grad)); + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); launcher.add_field(1, FID_DATA); return runtime->execute_index_space(ctx, launcher); } diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 39d3ecdf81..4f7d0c9632 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -230,9 +230,10 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { pt->part = runtime->get_logical_partition( ctx, pt->region, pt_base->part.get_index_partition()); - pt->region_grad = runtime->create_logical_region( - ctx, pt_base->region.get_index_space(), - pt_base->region.get_field_space()); + pt->region_grad = + runtime->create_logical_region(ctx, + pt_base->region.get_index_space(), + pt_base->region.get_field_space()); pt->part_grad = runtime->get_logical_partition( ctx, pt->region_grad, pt_base->part.get_index_partition()); pt->machine_view = machine_views[j]; From b985cc9ecf8c91ef09f5f2fe27da6274c7866af7 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 6 Dec 2023 00:30:07 -0500 Subject: [PATCH 117/198] enable peft request --- inference/incr_decoding/incr_decoding.cc | 14 +++++++------- src/ops/inc_multihead_self_attention.cu | 20 ++++++++++++++++++++ 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/inference/incr_decoding/incr_decoding.cc 
b/inference/incr_decoding/incr_decoding.cc index f1a51aa670..dcd1b5a5ab 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -292,13 +292,13 @@ void FlexFlow::top_level_task(Task const *task, requests.push_back(inference_req); total_num_requests++; // Add fine-tuning request - // Request fine_tuning_req; - // fine_tuning_req.req_type = Request::RequestType::REQ_FINETUNING; - // fine_tuning_req.max_sequence_length = 128; - // fine_tuning_req.peft_model_id = peft_model_id; - // fine_tuning_req.dataset_text.push_back(std::make_pair(text, "")); - // requests.push_back(fine_tuning_req); - // total_num_requests++; + Request fine_tuning_req; + fine_tuning_req.req_type = Request::RequestType::REQ_FINETUNING; + fine_tuning_req.max_sequence_length = 128; + fine_tuning_req.peft_model_id = peft_model_id; + fine_tuning_req.dataset_text.push_back(std::make_pair(text, "")); + requests.push_back(fine_tuning_req); + total_num_requests++; } GenerationResult result = model.generate(requests); } diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index baa24b7c00..dec116addd 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1961,4 +1961,24 @@ template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( GenericTensorAccessorR const weight, DataType data_type, cudaStream_t stream); + +template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + float *output_ptr, + float const *weight_ptr, + float const *bias_ptr, + int num_tokens, + cudaStream_t stream); + +template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + half *output_ptr, + half const *weight_ptr, + half const *bias_ptr, + int num_tokens, + cudaStream_t stream); }; // namespace FlexFlow From b9c392631b596db788ead74fe76d08d80a487b7c Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 6 Dec 2023 09:31:37 -0500 Subject: [PATCH 118/198] several hacks for performance measurement; some of the changes should be reverted --- inference/incr_decoding/incr_decoding.cc | 32 ++++++++++++++++-------- src/ops/argmax.cc | 5 ++++ src/runtime/request_manager.cc | 10 ++++++-- 3 files changed, 35 insertions(+), 12 deletions(-) diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index dcd1b5a5ab..94ccb1cabf 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -138,9 +138,9 @@ void FlexFlow::top_level_task(Task const *task, bool do_sample = false; float temperature = 0.0f; float topp = 0.0f; - int max_requests_per_batch = 8; - int max_tokens_per_batch = 128; - int max_sequence_length = 256; + int max_requests_per_batch = 2; + int max_tokens_per_batch = 300; + int max_sequence_length = 300; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -272,6 +272,7 @@ void FlexFlow::top_level_task(Task const *task, int total_num_requests = 0; { +#ifdef DEADCODE using json = nlohmann::json; std::ifstream file_handle(file_paths.prompt_file_path); assert(file_handle.good() && "Prompt file does not exist."); @@ -291,15 +292,26 @@ void FlexFlow::top_level_task(Task const *task, inference_req.peft_model_id = peft_model_id; requests.push_back(inference_req); total_num_requests++; - // Add fine-tuning request 
- Request fine_tuning_req; - fine_tuning_req.req_type = Request::RequestType::REQ_FINETUNING; - fine_tuning_req.max_sequence_length = 128; - fine_tuning_req.peft_model_id = peft_model_id; - fine_tuning_req.dataset_text.push_back(std::make_pair(text, "")); - requests.push_back(fine_tuning_req); + } +#endif + std::vector requests; + for (int i = 0; i < (max_requests_per_batch - 1) * 4; i++) { + Request inference_req; + inference_req.prompt = "b"; + inference_req.max_sequence_length = 40; + requests.push_back(inference_req); total_num_requests++; } + // Add a fine-tuning request + Request fine_tuning_req; + fine_tuning_req.req_type = Request::RequestType::REQ_FINETUNING; + fine_tuning_req.max_sequence_length = 256; + fine_tuning_req.max_training_steps = 256; + fine_tuning_req.peft_model_id = peft_model_id; + fine_tuning_req.dataset_text.push_back(std::make_pair("b", "")); + requests.push_back(fine_tuning_req); + total_num_requests++; + GenerationResult result = model.generate(requests); } diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index cabb8b204f..dd0e2bb822 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -392,6 +392,11 @@ InferenceResult GenericTensorAccessorW parent; int batch_size = bc->num_active_infr_tokens(); ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size); + // Note that we free activation allocator here since argmax is the + // last operator in forward + if (m->handle.peft_activation_allocator != nullptr) { + m->handle.peft_activation_allocator->free_all(); + } InferenceResult ir; if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index cbb21e03e0..1d4a9ee47c 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -246,13 +246,17 @@ RequestManager::RequestGuid request.peft_model_id = request_.peft_model_id; request.req_type = Request::REQ_FINETUNING; request.completed_training_steps = 0; - request.max_training_steps = 1; // TODO: let user set this + request.max_training_steps = request_.max_training_steps; for (auto const &sample : request_.dataset_text) { std::vector input_tokens; input_tokens = this->tokenizer_->Encode(sample.first); if (bos_token_id >= 0 && model_type != ModelType::FALCON) { input_tokens.insert(input_tokens.begin(), bos_token_id); } + // FIXME: this is a hack, must undo + while (input_tokens.size() < 256) { + input_tokens.push_back(293); + } std::vector output_tokens = this->tokenizer_->Encode(sample.second); if (input_tokens.size() + output_tokens.size() > @@ -355,6 +359,7 @@ BatchConfig RequestManager::prepare_next_batch_task( BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, InferenceResult const &result) { + log_req_mgr.print("[Old BC] Num tokens: %d", old_bc.num_tokens); const std::lock_guard lock(request_queue_mutex); // Step 1: append result from previous iteration to request's tokens for (int i = 0; i < old_bc.num_tokens; i++) { @@ -539,7 +544,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.num_generation_tokens = num_generation_tokens; // Step 3: add new requests to the next batch if there is space - for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { + // FIXME: we reserve one slot for PEFT req now + for (int i = 0; i < BatchConfig::max_requests_per_batch() - 1; i++) { if (new_bc.request_completed[i]) { if (!pending_infr_request_queue.empty() && new_bc.num_tokens < get_max_tokens_per_batch()) { From 
4d5c3e0797b4755cb8a572f2cc5985ffa33a6c57 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 16 Dec 2023 10:37:27 -0500 Subject: [PATCH 119/198] Update sigmoid_silu_multi.cu --- src/ops/sigmoid_silu_multi.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu index ec88042a1d..e3b6f7a69a 100644 --- a/src/ops/sigmoid_silu_multi.cu +++ b/src/ops/sigmoid_silu_multi.cu @@ -233,7 +233,7 @@ void SigmoidSiluMulti::backward_kernel_wrapper( input1.get_float_ptr(), input2.get_float_ptr(), input1_grad.get_float_ptr(), - input1_grad.get_float_ptr(), + input2_grad.get_float_ptr(), m->reset_input_grads[0], m->reset_input_grads[1]); } else if (m->input_type[0] == DT_HALF) { From 7bf863a15fc583c66f328dbe5f520b611860c212 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 18 Dec 2023 17:48:33 -0500 Subject: [PATCH 120/198] RoPE backward --- src/ops/inc_multihead_self_attention.cu | 62 ++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 2 deletions(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index dec116addd..452a8c09f6 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -492,6 +492,46 @@ __global__ void } } +template +__global__ void + apply_rotary_embedding_bwd(DT *input_ptr, + cuFloatComplex *complex_input, + BatchConfig::PerTokenInfo const *tokenInfos, + int proj_size, + int num_tokens, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + // compute indexes to visit first half proj_size of each of q/k tensor. + // devQKVProj has shape [num_tokens, qProjSize, num_heads, 3] in peft_bwd + bool q_tensor = i < (num_tokens * hidden_size / 2); + int real_i = q_tensor ? i : i - num_tokens * hidden_size / 2; + assert(hidden_size % proj_size == 0); + int num_heads = hidden_size / proj_size; + + int token_idx = real_i % num_tokens; + int idx = (real_i / num_tokens) % (proj_size / 2); + int head_idx = real_i / (num_tokens * proj_size / 2); + assert(head_idx < num_heads); + + int complex_part_index = (q_tensor ? 0 : 1) * num_tokens * hidden_size + + head_idx * num_tokens * proj_size + + idx * num_tokens + token_idx; + int real_part_index = complex_part_index + (proj_size / 2) * num_tokens; + + complex_input[i] = {input_ptr[real_part_index], + input_ptr[complex_part_index]}; + + size_t pos = tokenInfos[token_idx].abs_depth_in_request; + + float freq = pos * (1.0 / pow(10000.0, (float)2 * idx / proj_size)); + cuFloatComplex complex_pos = {cos(freq), sin(freq)}; + + complex_input[i] = cuCmulf(complex_input[i], complex_pos); + input_ptr[real_part_index] = complex_input[i].x; + input_ptr[complex_part_index] = complex_input[i].y; + } +} + template __global__ void fill_entries_above_diagonal(DT *matrix, size_t num_rows, @@ -1166,7 +1206,6 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] DT *C = static_cast
(m->devQKVProjArray); // after transposition & striding - // after transposition & striding int m_ = num_tokens; // num_new_tokens int n_ = m->qProjSize; int k_ = num_tokens; @@ -1201,7 +1240,26 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } - // Step 7: compute gradients w.r.t. input + // Step 7: perform rotary position embeddings (RoPE) bwd + { + if (*m->apply_rotary_embedding) { + assert(m->hidden_size == m->qProjSize * m->num_q_heads); + assert(m->qProjSize == m->kProjSize); + /*q&k*/ + int parallelism = num_tokens * m->hidden_size; + DT *A = static_cast
(m->devQKVProjArray); + apply_rotary_embedding_bwd<<>>(A, + m->complex_input, + m->token_infos, + m->qProjSize, + num_tokens, + m->hidden_size); + } + } + // Step 8: compute gradients w.r.t. input { float alpha = 1.0f, beta = 0.0f; if (!m->reset_input_grads[0]) { From 960654ed783fef09243eae666153947eaa1be404 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 9 Jan 2024 22:40:26 -0500 Subject: [PATCH 121/198] PEFT bug fixes and alignment (#1269) * Revert "several hacks for performance measurement; some of the changes should be reverted" This reverts commit b9c392631b596db788ead74fe76d08d80a487b7c. * backup * backup * updates * update * backup * backup * backup * fix * cleanup * linting --- .../ops/add_bias_residual_layer_norm.h | 2 - .../flexflow/ops/kernels/softmax_kernels.h | 3 +- include/flexflow/ops/residual_layer_norm.h | 1 + inference/incr_decoding/incr_decoding.cc | 44 +- inference/models/opt.cc | 10 +- src/ops/add_bias_residual_layer_norm.cc | 27 +- src/ops/add_bias_residual_layer_norm.cu | 72 +- src/ops/fused.cc | 15 + src/ops/fused.cu | 78 +- src/ops/inc_multihead_self_attention.cc | 4 +- src/ops/kernels/softmax.cu | 13 +- src/ops/layer_norm.cu | 18 - src/ops/linear.cc | 2 +- src/ops/lora_linear.cc | 4 +- src/ops/residual_layer_norm.cc | 57 +- src/ops/residual_layer_norm.cu | 76 +- src/ops/residual_rms_norm.cc | 7 +- src/ops/softmax.cc | 18 +- src/runtime/request_manager.cc | 10 +- tests/peft/alignment_tests.ipynb | 1427 +++++++++++++++++ tests/peft/qk_prods_alignment.ipynb | 24 + 21 files changed, 1681 insertions(+), 231 deletions(-) create mode 100644 tests/peft/alignment_tests.ipynb create mode 100644 tests/peft/qk_prods_alignment.ipynb diff --git a/include/flexflow/ops/add_bias_residual_layer_norm.h b/include/flexflow/ops/add_bias_residual_layer_norm.h index 5c4a49f998..38bb825a4d 100644 --- a/include/flexflow/ops/add_bias_residual_layer_norm.h +++ b/include/flexflow/ops/add_bias_residual_layer_norm.h @@ -124,7 +124,6 @@ class AddBiasResidualLayerNorm : public Op { T const *output_grad_ptr, T *input_grad_ptr, T *residual_grad_ptr, - T *attn_bias_grad_ptr, T const *gamma_ptr, ffStream_t stream); static void @@ -132,7 +131,6 @@ class AddBiasResidualLayerNorm : public Op { GenericTensorAccessorR const &output_grad, GenericTensorAccessorW &input_grad, GenericTensorAccessorW const &residual_grad, - GenericTensorAccessorW const &attn_bias_grad, GenericTensorAccessorR const &gamma); public: diff --git a/include/flexflow/ops/kernels/softmax_kernels.h b/include/flexflow/ops/kernels/softmax_kernels.h index db5e9799e9..b3dfe4f430 100644 --- a/include/flexflow/ops/kernels/softmax_kernels.h +++ b/include/flexflow/ops/kernels/softmax_kernels.h @@ -39,7 +39,8 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, void inference_kernel_wrapper(SoftmaxMeta const *m, BatchConfig const *bc, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output); + GenericTensorAccessorW const &output, + GenericTensorAccessorW const &output_grad); void peft_bwd_kernel_wrapper(SoftmaxMeta const *m, BatchConfig const *bc, diff --git a/include/flexflow/ops/residual_layer_norm.h b/include/flexflow/ops/residual_layer_norm.h index 35ddb171d4..d924132452 100644 --- a/include/flexflow/ops/residual_layer_norm.h +++ b/include/flexflow/ops/residual_layer_norm.h @@ -28,6 +28,7 @@ class ResidualLayerNorm : public Op { float _eps, bool allocate_weights, char const *name); + void map_output_tensors(FFModel &ff) override; void init(FFModel const &) override; void init_inference(FFModel const &, 
std::vector const &, diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 94ccb1cabf..009cd1af45 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -138,9 +138,9 @@ void FlexFlow::top_level_task(Task const *task, bool do_sample = false; float temperature = 0.0f; float topp = 0.0f; - int max_requests_per_batch = 2; - int max_tokens_per_batch = 300; - int max_sequence_length = 300; + int max_requests_per_batch = 8; + int max_tokens_per_batch = 128; + int max_sequence_length = 256; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -272,7 +272,6 @@ void FlexFlow::top_level_task(Task const *task, int total_num_requests = 0; { -#ifdef DEADCODE using json = nlohmann::json; std::ifstream file_handle(file_paths.prompt_file_path); assert(file_handle.good() && "Prompt file does not exist."); @@ -286,32 +285,21 @@ void FlexFlow::top_level_task(Task const *task, std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); // Add inference request - Request inference_req; - inference_req.prompt = text; - inference_req.max_sequence_length = 128; - inference_req.peft_model_id = peft_model_id; - requests.push_back(inference_req); + // Request inference_req; + // inference_req.prompt = text; + // inference_req.max_sequence_length = 128; + // inference_req.peft_model_id = peft_model_id; + // requests.push_back(inference_req); + // total_num_requests++; + // Add fine-tuning request + Request fine_tuning_req; + fine_tuning_req.req_type = Request::RequestType::REQ_FINETUNING; + fine_tuning_req.max_sequence_length = 128; + fine_tuning_req.peft_model_id = peft_model_id; + fine_tuning_req.dataset_text.push_back(std::make_pair(text, "")); + requests.push_back(fine_tuning_req); total_num_requests++; } -#endif - std::vector requests; - for (int i = 0; i < (max_requests_per_batch - 1) * 4; i++) { - Request inference_req; - inference_req.prompt = "b"; - inference_req.max_sequence_length = 40; - requests.push_back(inference_req); - total_num_requests++; - } - // Add a fine-tuning request - Request fine_tuning_req; - fine_tuning_req.req_type = Request::RequestType::REQ_FINETUNING; - fine_tuning_req.max_sequence_length = 256; - fine_tuning_req.max_training_steps = 256; - fine_tuning_req.peft_model_id = peft_model_id; - fine_tuning_req.dataset_text.push_back(std::make_pair("b", "")); - requests.push_back(fine_tuning_req); - total_num_requests++; - GenerationResult result = model.generate(requests); } diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 9069aef9e1..e0e940b186 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -193,7 +193,7 @@ void OPT::create_opt_model(FFModel &ff, Tensor fc1 = ff.dense(final_norm, opt_config.ffn_dim, - AC_MODE_NONE, + AC_MODE_RELU, true, DT_NONE, nullptr, @@ -202,8 +202,7 @@ void OPT::create_opt_model(FFModel &ff, REG_MODE_NONE, 0.0f, std::string("layers_" + std::to_string(i) + "_fc1").c_str()); - Tensor activation = ff.relu(fc1, false); - fc2 = ff.dense(activation, + fc2 = ff.dense(fc1, opt_config.hidden_size, AC_MODE_NONE, true, @@ -216,7 +215,7 @@ void OPT::create_opt_model(FFModel &ff, std::string("layers_" + std::to_string(i) + "_fc2").c_str()); // Low-Rank Adapter (LoRA) for the second linear layer ff.lora_linear( - activation, + fc1, fc2, OP_LORA_MLP_SECOND, std::string("layers_" + std::to_string(i) + "_fc2_lora").c_str()); @@ -255,7 +254,8 @@ void 
OPT::create_opt_model(FFModel &ff, output = ff.argmax(softmax, /*beam_Search*/ true); } else { // output = ff.arg_top_k(lm_head, /*k=*/1, false); - output = ff.argmax(lm_head, /*beam_Search*/ false); + Tensor softmax = ff.softmax(lm_head, -1); + output = ff.argmax(softmax, /*beam_Search*/ false); } //------------------- compile the model -------------------------------- diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index be7b357f23..88a34b7eb5 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -931,7 +931,7 @@ Legion::FutureMap AddBiasResidualLayerNorm::peft_bwd( launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(field_id++, FID_DATA); @@ -939,25 +939,17 @@ Legion::FutureMap AddBiasResidualLayerNorm::peft_bwd( launcher.add_region_requirement( RegionRequirement(batch_inputs[1]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[1] ? WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[1]->region_grad)); launcher.add_field(field_id++, FID_DATA); - // attn bias grad - launcher.add_region_requirement( - RegionRequirement(batch_inputs[2]->part_grad, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[2]->region_grad)); - launcher.add_field(field_id++, FID_DATA); if (elementwise_affine) { // gamma - launcher.add_region_requirement(RegionRequirement(weights[0]->part, + launcher.add_region_requirement(RegionRequirement(weights[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - weights[0]->region)); + weights[1]->region)); launcher.add_field(field_id++, FID_DATA); } return runtime->execute_index_space(ctx, launcher); @@ -1001,14 +993,6 @@ void AddBiasResidualLayerNorm::peft_bwd_task( ctx, runtime); - GenericTensorAccessorW attn_bias_grad = - helperGetGenericTensorAccessorRW(m->weight_type[0], - regions[region_idx++], - task->regions[task_region_idx++], - FID_DATA, - ctx, - runtime); - GenericTensorAccessorR gamma; if (m->elementwise_affine) { assert(m->use_bias == (regions.size() == 6)); @@ -1020,13 +1004,12 @@ void AddBiasResidualLayerNorm::peft_bwd_task( runtime); } AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( - m, output_grad, input_grad, residual_grad, attn_bias_grad, gamma); + m, output_grad, input_grad, residual_grad, gamma); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; std::vector weights_accessors; - weights_accessors.push_back(attn_bias_grad); if (m->elementwise_affine) { weights_accessors.push_back(gamma); } diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu index 097ace3676..ab017ed46c 100644 --- a/src/ops/add_bias_residual_layer_norm.cu +++ b/src/ops/add_bias_residual_layer_norm.cu @@ -101,9 +101,9 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { shared[wid] = val; } __syncthreads(); - val = (threadIdx.x < min(blockDim.x, max_num_threads) / C10_WARP_SIZE) + val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) ? 
shared[lid] - : 0; + : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -536,8 +536,9 @@ __device__ __inline__ void compute_gI(T const *__restrict__ dY, T const *__restrict__ rstd, T const *__restrict__ gamma, T *dX, - T *dX_residual1, - T *dX_residual2, + T *dX_residual, + bool reset_input_grad, + bool reset_residual_grad, int const N, T *buf) { auto const i1 = blockIdx.x; @@ -549,9 +550,7 @@ __device__ __inline__ void compute_gI(T const *__restrict__ dY, T const *X_i = X + i1 * N; T const *dY_i = dY + i1 * N; T *dX_i = dX + i1 * N; - T *dX_residual1_i = dX_residual1 + i1 * N; - T *dX_residual2_i = - (dX_residual2 != nullptr) ? dX_residual2 + i1 * N : nullptr; + T *dX_residual_i = dX_residual + i1 * N; // vectorized reads don't improve perf, so use regular unrolling for (; l + unroll - 1 < N; l += blockDim.x * unroll) { @@ -592,10 +591,15 @@ __device__ __inline__ void compute_gI(T const *__restrict__ dY, f_grad_input -= (x - mean_val) * rstd_val * stats_x2; f_grad_input -= stats_x1; f_grad_input *= term1; - dX_i[l] += f_grad_input; - dX_residual1_i[l] += f_grad_input; - if (dX_residual2 != nullptr) { - dX_residual2_i[l] += f_grad_input; + if (reset_input_grad) { + dX_i[l] = f_grad_input; + } else { + dX_i[l] += f_grad_input; + } + if (reset_residual_grad) { + dX_residual_i[l] = f_grad_input; + } else { + dX_residual_i[l] += f_grad_input; } } } @@ -607,13 +611,24 @@ __global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, T const *__restrict__ rstd, T const *__restrict__ gamma, T *dX, - T *dX_residual1, - T *dX_residual2, + T *dX_residual, + bool reset_input_grad, + bool reset_residual_grad, int const N) { alignas(sizeof(double)) extern __shared__ char s_data1[]; T *buf = reinterpret_cast(&s_data1); - compute_gI(dY, X, mean, rstd, gamma, dX, dX_residual1, dX_residual2, N, buf); + compute_gI(dY, + X, + mean, + rstd, + gamma, + dX, + dX_residual, + reset_input_grad, + reset_residual_grad, + N, + buf); } /*static*/ @@ -661,7 +676,8 @@ void AddBiasResidualLayerNorm::backward_kernel( gamma_ptr, input_grad_ptr, residual_grad_ptr, - attn_bias_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], N); if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { @@ -764,29 +780,11 @@ void AddBiasResidualLayerNorm::peft_bwd_kernel( T const *output_grad_ptr, T *input_grad_ptr, T *residual_grad_ptr, - T *attn_bias_grad_ptr, T const *gamma_ptr, cudaStream_t stream) { const int64_t M = m->effective_batch_size; const int64_t N = m->effective_num_elements; - ComputeInternalGradientsCUDAKernel - <<>>( - N, - output_grad_ptr, - static_cast(m->input_activation), - gamma_ptr, - static_cast(m->ds_ptr), - static_cast(m->db_ptr)); - const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; - ComputeGradientFusedParamsCUDAKernel - <<>>(M, - N, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - static_cast(m->ds_ptr), - static_cast(m->db_ptr), - static_cast(m->scale_ptr), - static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; int const num_threads = 128; const dim3 blocks(M); @@ -799,7 +797,8 @@ void AddBiasResidualLayerNorm::peft_bwd_kernel( gamma_ptr, input_grad_ptr, residual_grad_ptr, - attn_bias_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], N); } @@ -809,7 +808,6 @@ void AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( GenericTensorAccessorR const &output_grad, GenericTensorAccessorW &input_grad, GenericTensorAccessorW const &residual_grad, - GenericTensorAccessorW const &attn_bias_grad, GenericTensorAccessorR const &gamma) { cudaStream_t 
stream; checkCUDA(get_legion_stream(&stream)); @@ -825,7 +823,6 @@ void AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( output_grad.get_float_ptr(), input_grad.get_float_ptr(), residual_grad.get_float_ptr(), - attn_bias_grad.get_float_ptr(), m->elementwise_affine ? gamma.get_float_ptr() : nullptr, stream); } else if (m->output_type[0] == DT_HALF) { @@ -833,7 +830,6 @@ void AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( output_grad.get_half_ptr(), input_grad.get_half_ptr(), residual_grad.get_half_ptr(), - attn_bias_grad.get_half_ptr(), m->elementwise_affine ? gamma.get_half_ptr() : nullptr, stream); } else { diff --git a/src/ops/fused.cc b/src/ops/fused.cc index ea1c970cc5..8afd61aece 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -528,6 +528,21 @@ FutureMap FusedOp::inference(FFModel const &ff, batch_outputs[i]->region)); launcher.add_field(offset + i, FID_DATA); } + offset += numOutputs; + // add softmax output grad + if (operators[numOperators - 1]->op_type == OP_SOFTMAX) { + printf("operator %i is last SOFTMAX! adding output %i\n", + numOperators - 1, + numOutputs - 1); + assert(outputs[numOutputs - 1]->region != LogicalRegion::NO_REGION); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[numOutputs - 1]->part_grad, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[numOutputs - 1]->region_grad)); + launcher.add_field(offset, FID_DATA); + } return runtime->execute_index_space(ctx, launcher); } diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 9954a8b43a..f6bed71f6a 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -95,8 +95,11 @@ __host__ void assert(metas->numOperators == fused->numOperators); assert(regions.size() == task->regions.size()); - assert((int)regions.size() == - fused->numInputs + fused->numWeights + fused->numOutputs); + bool softmax_grad_additional_region = + (fused->op_op_type[fused->numOperators - 1] == OP_SOFTMAX); + assert((int)regions.size() == fused->numInputs + fused->numWeights + + fused->numOutputs + + softmax_grad_additional_region); // Domain input_domain[MAX_NUM_INPUTS]; // Domain weight_domain[MAX_NUM_WEIGHTS]; // Domain output_domain[MAX_NUM_OUTPUTS]; @@ -141,6 +144,7 @@ __host__ void ctx, runtime); } + roff += fused->numOutputs; // Assert that all meta share the same dnn/blas handler int start = 0; for (start = 0; start < fused->numOperators; start++) { @@ -625,9 +629,22 @@ __host__ void assert(fused->op_num_outputs[op] == 1); assert(my_input_accessor[0].domain.get_volume() == my_output_accessor[0].domain.get_volume()); + if (op == fused->numOperators - 1) { // if this is the final operator + output_accessor[fused->numOutputs] = helperGetGenericTensorAccessorWO( + fused->output_data_types[fused->numOutputs - 1], + regions[roff], + task->regions[roff], + FID_DATA, + ctx, + runtime); + } SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; Kernels::Softmax::inference_kernel_wrapper( - m, bc, my_input_accessor[0], my_output_accessor[0]); + m, + bc, + my_input_accessor[0], + my_output_accessor[0], + output_accessor[fused->numOutputs]); break; } case OP_ALLREDUCE: { @@ -1008,7 +1025,7 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, Kernels::ResidualRMSNorm::peft_bwd_kernel_wrapper( m, bc, - my_output_grad_accessor[0], + my_output_grad_accessor[1], my_input_grad_accessor[0], my_input_grad_accessor[1], my_weight_accessor[0]); @@ -1078,27 +1095,20 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, assert(fused->op_num_weights[op] == 2); // weight + bias } } - GenericTensorAccessorR residual2; 
+ GenericTensorAccessorW residual2; if (m->use_two_residuals) { residual2 = my_input_grad_accessor[2]; } - GenericTensorAccessorR gamma, beta; + GenericTensorAccessorR gamma; if (m->elementwise_affine) { gamma = my_weight_accessor[0]; - if (m->use_bias) { - beta = my_weight_accessor[1]; - } } - // TODO: implment me - assert(false); - // ResidualLayerNorm::inference_kernel_wrapper(m, - // my_input_accessor[0], - // my_input_accessor[1], - // residual2, - // my_output_accessor[0], - // my_output_accessor[1], - // gamma, - // beta); + ResidualLayerNorm::peft_bwd_kernel_wrapper(m, + my_output_grad_accessor[1], + my_input_grad_accessor[0], + my_input_grad_accessor[1], + residual2, + gamma); break; } case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { @@ -1115,31 +1125,17 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias } } - GenericTensorAccessorR gamma, beta; + GenericTensorAccessorR gamma; if (m->elementwise_affine) { gamma = my_weight_accessor[1]; - if (m->use_bias) { - beta = my_weight_accessor[2]; - } } - Domain attn_bias_domain = my_weight_accessor[0].domain; - Domain residual_domain = my_input_grad_accessor[1].domain; - int attn_bias_dim = - attn_bias_domain.hi()[0] - attn_bias_domain.lo()[0] + 1; - int residual_volume = residual_domain.get_volume(); - // TODO: implement me - assert(false); - // AddBiasResidualLayerNorm::inference_kernel_wrapper( - // m, - // attn_bias_dim, - // residual_volume, - // my_input_accessor[0], - // my_output_accessor[0], - // my_output_accessor[1], - // my_input_accessor[1], - // my_weight_accessor[0], - // gamma, - // beta); + + AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( + m, + my_output_grad_accessor[1], + my_input_grad_accessor[0], + my_input_grad_accessor[1], + gamma); break; } case OP_SIGMOID_SILU_MULTI: { diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index ca6eb7c095..5d52034575 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -902,7 +902,7 @@ FutureMap IncMultiHeadSelfAttention::peft_bwd( launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[0] ? 
WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(idx++, FID_DATA); @@ -964,7 +964,7 @@ void IncMultiHeadSelfAttention::peft_bwd_task( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + GenericTensorAccessorW output_grad = helperGetGenericTensorAccessorRW( m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); GenericTensorAccessorR biases; if (*m->qkv_bias || *m->final_bias) { diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index 0fc827319d..271a291b09 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -121,7 +121,8 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, void inference_kernel_wrapper(SoftmaxMeta const *m, BatchConfig const *bc, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { + GenericTensorAccessorW const &output, + GenericTensorAccessorW const &output_grad) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); cudaEvent_t t_start, t_end; @@ -138,6 +139,11 @@ void inference_kernel_wrapper(SoftmaxMeta const *m, output.get_float_ptr(), num_classes, stream); + checkCUDA(cudaMemcpyAsync(output_grad.get_float_ptr(), + output.get_float_ptr(), + output.domain.get_volume() * sizeof(float), + cudaMemcpyDeviceToDevice, + stream)); } else if (m->output_type[0] == DT_HALF) { Internal::inference_kernel(m, bc, @@ -145,6 +151,11 @@ void inference_kernel_wrapper(SoftmaxMeta const *m, output.get_half_ptr(), num_classes, stream); + checkCUDA(cudaMemcpyAsync(output_grad.get_half_ptr(), + output.get_half_ptr(), + output.domain.get_volume() * sizeof(half), + cudaMemcpyDeviceToDevice, + stream)); } else { assert(false && "Unsupported data type"); } diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index 6e12c53230..1d4e94d7d5 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -664,24 +664,6 @@ void LayerNorm::peft_bwd_kernel(LayerNormMeta const *m, cudaStream_t stream) { const int64_t M = m->effective_batch_size; const int64_t N = m->effective_num_elements; - ComputeInternalGradientsCUDAKernel - <<>>( - N, - output_grad_ptr, - static_cast(m->input_activation), - gamma_ptr, - static_cast(m->ds_ptr), - static_cast(m->db_ptr)); - const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; - ComputeGradientFusedParamsCUDAKernel - <<>>(M, - N, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - static_cast(m->ds_ptr), - static_cast(m->db_ptr), - static_cast(m->scale_ptr), - static_cast(m->bias_ptr)); int const warp_size = C10_WARP_SIZE; int const num_threads = 128; const dim3 blocks(M); diff --git a/src/ops/linear.cc b/src/ops/linear.cc index e71be3bbf4..15789ae2e9 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -700,7 +700,7 @@ FutureMap Linear::peft_bwd(FFModel const &ff, launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[0] ? 
WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(0, FID_DATA); diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 9ed411397d..e39b444af4 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -589,14 +589,14 @@ FutureMap LoraLinear::peft_bwd(FFModel const &ff, launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(0, FID_DATA); launcher.add_region_requirement( RegionRequirement(batch_inputs[1]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[1] ? WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[1]->region_grad)); launcher.add_field(1, FID_DATA); diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index c142e47e62..8563c299ab 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -117,7 +117,6 @@ void FFModel::residual_layer_norm(const Tensor input, } int num_weights = elementwise_affine ? (use_bias ? 2 : 1) : 0; - Layer *ln = nullptr; Tensor casted_input = (data_type != input->data_type) ? cast(input, data_type, "type cast for residual_layer_norm") @@ -133,20 +132,20 @@ void FFModel::residual_layer_norm(const Tensor input, ? cast(residual2, data_type, "type cast for residual2_layer_norm") : residual2; } - ln = new Layer(this, - OP_RESIDUAL_LAYERNORM, - data_type, - name, - 2 + use_two_residuals /*inputs*/, - num_weights, - 2 /*outputs*/, - casted_input, - casted_residual1, - casted_residual2); + Layer *ln = new Layer(this, + OP_RESIDUAL_LAYERNORM, + data_type, + name, + 2 + use_two_residuals /*inputs*/, + num_weights, + 2 /*outputs*/, + casted_input, + casted_residual1, + casted_residual2); ln->outputs[0] = create_tensor_legion_ordering( - input->num_dims, input->dims, data_type, ln, 0, false /*create_grad*/); + input->num_dims, input->dims, data_type, ln, 0, true /*create_grad*/); ln->outputs[1] = create_tensor_legion_ordering( - input->num_dims, input->dims, data_type, ln, 1, false /*create_grad*/); + input->num_dims, input->dims, data_type, ln, 1, true /*create_grad*/); { int numdims = axes.size(); int dims[numdims]; @@ -326,6 +325,18 @@ ResidualLayerNorm::ResidualLayerNorm(FFModel &model, } } +void ResidualLayerNorm::map_output_tensors(FFModel &ff) { + assert(numOutputs == 2); + assert(outputs[0]->get_volume() == inputs[0]->get_volume()); + outputs[0]->parallel_is = inputs[0]->parallel_is; + outputs[0]->region = inputs[0]->region; + outputs[0]->part = inputs[0]->part; + outputs[0]->region_grad = inputs[0]->region_grad; + outputs[0]->part_grad = inputs[0]->part_grad; + // map output 1 to new region + ff.map_tensor(outputs[1], this); +} + void ResidualLayerNorm::init_inference( FFModel const &ff, std::vector const &batch_inputs, @@ -439,11 +450,11 @@ void ResidualLayerNorm::init(FFModel const &ff) { launcher.add_field(field_id++, FID_DATA); // residual2 if (use_two_residuals) { - launcher.add_region_requirement(RegionRequirement(inputs[1]->part, + launcher.add_region_requirement(RegionRequirement(inputs[2]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - inputs[1]->region)); + inputs[2]->region)); launcher.add_field(field_id++, FID_DATA); } // added: input + residual(s) @@ -723,7 +734,7 @@ Legion::FutureMap ResidualLayerNorm::peft_bwd( launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[0] ? 
WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(field_id++, FID_DATA); @@ -731,7 +742,7 @@ Legion::FutureMap ResidualLayerNorm::peft_bwd( launcher.add_region_requirement( RegionRequirement(batch_inputs[1]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[1] ? WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[1]->region_grad)); launcher.add_field(field_id++, FID_DATA); @@ -740,7 +751,7 @@ Legion::FutureMap ResidualLayerNorm::peft_bwd( launcher.add_region_requirement( RegionRequirement(batch_inputs[2]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[2] ? WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[2]->region_grad)); launcher.add_field(field_id++, FID_DATA); @@ -768,9 +779,7 @@ void ResidualLayerNorm::peft_bwd_task( } assert(task->regions.size() == regions.size()); ResidualLayerNormMeta *m = *((ResidualLayerNormMeta **)task->local_args); - assert(regions.size() == - 4 + m->use_two_residuals + - (m->elementwise_affine ? (m->use_bias ? 3 : 2) : 0)); + assert(regions.size() == 3 + m->use_two_residuals + m->elementwise_affine); int region_idx = 0, task_region_idx = 0; @@ -807,8 +816,7 @@ void ResidualLayerNorm::peft_bwd_task( } GenericTensorAccessorR gamma; if (m->elementwise_affine) { - assert(m->use_bias == (regions.size() == 6)); - gamma = helperGetGenericTensorAccessorRO(m->output_type[0], + gamma = helperGetGenericTensorAccessorRO(m->weight_type[0], regions[region_idx++], task->regions[task_region_idx++], FID_DATA, @@ -942,12 +950,11 @@ void ResidualLayerNorm::inference_task( assert(task->regions.size() == regions.size()); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + ResidualLayerNormMeta *m = *((ResidualLayerNormMeta **)task->local_args); if (bc->num_tokens == 0) { return; } - ResidualLayerNormMeta *m = *((ResidualLayerNormMeta **)task->local_args); - assert(regions.size() == 4 + m->use_two_residuals + (m->elementwise_affine ? (m->use_bias ? 
2 : 1) : 0)); diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index 4bfac1887f..1f87949234 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -239,20 +239,17 @@ void ResidualLayerNorm::inference_kernel_wrapper( } assert(num_peft_requests <= 1); - int tokens_previous_requests = 0; for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } // Skip non-PEFT requests if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { - // FIXME: use the new approach to computing token offset - tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int in_dim = - added_output.domain.hi()[0] - added_output.domain.lo()[0] + 1; + int first_token_offset = bc->requestsInfo[i].num_tokens_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { MemoryAllocator *allocator = m->handle.peft_activation_allocator; m->input_activation = allocator->allocate_instance_untyped( @@ -261,14 +258,14 @@ void ResidualLayerNorm::inference_kernel_wrapper( if (m->input_type[0] == DT_FLOAT) { checkCUDA(cudaMemcpyAsync( m->input_activation, - added_output.get_float_ptr() + tokens_previous_requests * in_dim, + added_output.get_float_ptr() + first_token_offset * in_dim, data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, cudaMemcpyDeviceToDevice, stream)); } else if (m->input_type[0] == DT_HALF) { checkCUDA(cudaMemcpyAsync( m->input_activation, - added_output.get_half_ptr() + tokens_previous_requests * in_dim, + added_output.get_half_ptr() + first_token_offset * in_dim, data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, cudaMemcpyDeviceToDevice, stream)); @@ -481,6 +478,9 @@ __device__ __inline__ void compute_gI(T const *__restrict__ dY, T *dX, T *dX_residual1, T *dX_residual2, + bool reset_input_grad, + bool reset_residual_grad1, + bool reset_residual_grad2, int const N, T *buf) { auto const i1 = blockIdx.x; @@ -535,10 +535,22 @@ __device__ __inline__ void compute_gI(T const *__restrict__ dY, f_grad_input -= (x - mean_val) * rstd_val * stats_x2; f_grad_input -= stats_x1; f_grad_input *= term1; - dX_i[l] += f_grad_input; - dX_residual1_i[l] += f_grad_input; + if (reset_input_grad) { + dX_i[l] = f_grad_input; + } else { + dX_i[l] += f_grad_input; + } + if (reset_residual_grad1) { + dX_residual1_i[l] = f_grad_input; + } else { + dX_residual1_i[l] += f_grad_input; + } if (dX_residual2 != nullptr) { - dX_residual2_i[l] += f_grad_input; + if (reset_residual_grad2) { + dX_residual2_i[l] = f_grad_input; + } else { + dX_residual2_i[l] += f_grad_input; + } } } } @@ -552,11 +564,25 @@ __global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, T *dX, T *dX_residual1, T *dX_residual2, + bool reset_input_grad, + bool reset_residual_grad1, + bool reset_residual_grad2, int const N) { alignas(sizeof(double)) extern __shared__ char s_data1[]; T *buf = reinterpret_cast(&s_data1); - - compute_gI(dY, X, mean, rstd, gamma, dX, dX_residual1, dX_residual2, N, buf); + compute_gI(dY, + X, + mean, + rstd, + gamma, + dX, + dX_residual1, + dX_residual2, + reset_input_grad, + reset_residual_grad1, + reset_residual_grad2, + N, + buf); } /*static*/ @@ -604,6 +630,9 @@ void backward_kernel(ResidualLayerNormMeta const *m, input_grad_ptr, residual1_grad_ptr, residual2_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + m->reset_input_grads[2], N); if (gamma_grad_ptr != NULL 
|| beta_grad_ptr != NULL) { @@ -710,28 +739,12 @@ void peft_bwd_kernel(ResidualLayerNormMeta const *m, cudaStream_t stream) { const int64_t M = m->effective_batch_size; const int64_t N = m->effective_num_elements; - ComputeInternalGradientsCUDAKernel - <<>>( - N, - output_grad_ptr, - static_cast(m->input_activation), - gamma_ptr, - static_cast(m->ds_ptr), - static_cast(m->db_ptr)); - const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; - ComputeGradientFusedParamsCUDAKernel - <<>>(M, - N, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - static_cast(m->ds_ptr), - static_cast(m->db_ptr), - static_cast(m->scale_ptr), - static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; int const num_threads = 128; const dim3 blocks(M); int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( output_grad_ptr, static_cast(m->input_activation), @@ -741,6 +754,9 @@ void peft_bwd_kernel(ResidualLayerNormMeta const *m, input_grad_ptr, residual1_grad_ptr, residual2_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + m->reset_input_grads[2], N); } diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index 28dd7e2745..c2fbe11544 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -90,9 +90,9 @@ void FFModel::residual_rms_norm(const Tensor input1, casted_input2); rm->outputs[0] = create_tensor_legion_ordering( - input1->num_dims, input1->dims, data_type, rm, 0, false /*create_grad*/); + input1->num_dims, input1->dims, data_type, rm, 0, true /*create_grad*/); rm->outputs[1] = create_tensor_legion_ordering( - input1->num_dims, input1->dims, data_type, rm, 1, false /*create_grad*/); + input1->num_dims, input1->dims, data_type, rm, 1, true /*create_grad*/); // weights int weight_dims[1] = {dim}; @@ -100,7 +100,7 @@ void FFModel::residual_rms_norm(const Tensor input1, weight_dims, data_type, rm, - true /*create_grad*/, + false /*create_grad*/, nullptr, CHOSEN_SYNC_TYPE); @@ -710,6 +710,7 @@ void ResidualRMSNorm::peft_bwd_task(Task const *task, m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); peft_bwd_kernel_wrapper( m, bc, output_grad, residual_input0_grad, residual_input1_grad, weight); + if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 23f2eb9edf..1d062b552b 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -355,6 +355,14 @@ FutureMap Softmax::inference(FFModel const &ff, EXCLUSIVE, batch_outputs[0]->region)); launcher.add_field(1, FID_DATA); + // we add the region below in order to copy the output to the grad tensor + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(2, FID_DATA); return runtime->execute_index_space(ctx, launcher); } @@ -363,8 +371,8 @@ void Softmax::inference_task(Task const *task, Context ctx, Runtime *runtime) { assert(task->regions.size() == regions.size()); - assert(regions.size() == 2); - assert(task->regions.size() == 2); + assert(regions.size() == 3); + assert(task->regions.size() == 3); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_tokens == 0) { return; @@ -376,7 +384,9 @@ void Softmax::inference_task(Task const *task, m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW output = 
helperGetGenericTensorAccessorWO( m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - inference_kernel_wrapper(m, bc, input, output); + GenericTensorAccessorW output_grad = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + inference_kernel_wrapper(m, bc, input, output, output_grad); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; @@ -411,7 +421,7 @@ FutureMap Softmax::peft_bwd(FFModel const &ff, launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(0, FID_DATA); diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 1d4a9ee47c..cbb21e03e0 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -246,17 +246,13 @@ RequestManager::RequestGuid request.peft_model_id = request_.peft_model_id; request.req_type = Request::REQ_FINETUNING; request.completed_training_steps = 0; - request.max_training_steps = request_.max_training_steps; + request.max_training_steps = 1; // TODO: let user set this for (auto const &sample : request_.dataset_text) { std::vector input_tokens; input_tokens = this->tokenizer_->Encode(sample.first); if (bos_token_id >= 0 && model_type != ModelType::FALCON) { input_tokens.insert(input_tokens.begin(), bos_token_id); } - // FIXME: this is a hack, must undo - while (input_tokens.size() < 256) { - input_tokens.push_back(293); - } std::vector output_tokens = this->tokenizer_->Encode(sample.second); if (input_tokens.size() + output_tokens.size() > @@ -359,7 +355,6 @@ BatchConfig RequestManager::prepare_next_batch_task( BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, InferenceResult const &result) { - log_req_mgr.print("[Old BC] Num tokens: %d", old_bc.num_tokens); const std::lock_guard lock(request_queue_mutex); // Step 1: append result from previous iteration to request's tokens for (int i = 0; i < old_bc.num_tokens; i++) { @@ -544,8 +539,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.num_generation_tokens = num_generation_tokens; // Step 3: add new requests to the next batch if there is space - // FIXME: we reserve one slot for PEFT req now - for (int i = 0; i < BatchConfig::max_requests_per_batch() - 1; i++) { + for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { if (new_bc.request_completed[i]) { if (!pending_infr_request_queue.empty() && new_bc.num_tokens < get_max_tokens_per_batch()) { diff --git a/tests/peft/alignment_tests.ipynb b/tests/peft/alignment_tests.ipynb new file mode 100644 index 0000000000..e2a8978ea3 --- /dev/null +++ b/tests/peft/alignment_tests.ipynb @@ -0,0 +1,1427 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import os, torch" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "hf_weight_base_path = \"/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors\"\n", + "ff_weight_base_path = \"/usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors\"\n", + "def compare_tensors(hf_tensor_filepath, ff_tensor_filepath, tolerance=1e-2):\n", + " assert(os.path.exists(hf_tensor_filepath) and os.path.exists(ff_tensor_filepath))\n", + " 
hf_tensor = torch.load(hf_tensor_filepath)\n", + " if type(hf_tensor) == tuple or type(hf_tensor) == list:\n", + " assert(len(hf_tensor) == 1)\n", + " hf_tensor = hf_tensor[0]\n", + " hf_tensor = torch.nan_to_num(hf_tensor)\n", + " hf_tensor = hf_tensor.flatten().detach().cpu().numpy()\n", + " ff_tensor = np.loadtxt(ff_tensor_filepath, delimiter=',')\n", + "\n", + " len_hf_tensor = hf_tensor.shape[0]\n", + " ff_tensor = ff_tensor[:len_hf_tensor]\n", + " \n", + " mismatches = []\n", + " if not np.allclose(ff_tensor, hf_tensor, atol=tolerance):\n", + " print(f\"mismatch between {hf_tensor_filepath} and {ff_tensor_filepath}\")\n", + " print(f\"HF: {hf_tensor}\\nFF:{ff_tensor}\")\n", + " print(np.isclose(ff_tensor, hf_tensor, atol=tolerance))\n", + " mismatches = np.where(~np.isclose(ff_tensor, hf_tensor, atol=tolerance))[0]\n", + " print(mismatches)\n", + " #print(np.nonzero(hf_tensor)[0])\n", + " # print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0])\n", + " # print(ff_tensor[36], hf_tensor[36])\n", + " #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\n", + " assert(len(mismatches) <= .05*len_hf_tensor)\n", + " print(\"Ok!\")\n", + "def compare_tensors_difference(hf_tensor_filepath, ff_tensor1_filepath, ff_tensor2_filepath, tolerance=1e-2):\n", + " assert(os.path.exists(hf_tensor_filepath))\n", + " assert(os.path.exists(ff_tensor1_filepath))\n", + " assert(os.path.exists(ff_tensor2_filepath))\n", + " hf_tensor = torch.load(hf_tensor_filepath)\n", + " if type(hf_tensor) == tuple or type(hf_tensor) == list:\n", + " assert(len(hf_tensor) == 1)\n", + " hf_tensor = hf_tensor[0]\n", + " hf_tensor = torch.nan_to_num(hf_tensor)\n", + " hf_tensor = hf_tensor.flatten().detach().cpu().numpy()\n", + " ff_tensor1 = np.loadtxt(ff_tensor1_filepath, delimiter=',')\n", + " ff_tensor2 = np.loadtxt(ff_tensor2_filepath, delimiter=',')\n", + "\n", + " len_hf_tensor = hf_tensor.shape[0]\n", + " ff_tensor1 = ff_tensor1[:len_hf_tensor]\n", + " ff_tensor2 = ff_tensor2[:len_hf_tensor]\n", + " ff_tensor = ff_tensor1 - ff_tensor2\n", + " \n", + " mismatches = []\n", + " if not np.allclose(ff_tensor, hf_tensor, atol=tolerance):\n", + " print(f\"mismatch between {hf_tensor_filepath} and {ff_tensor1_filepath} - {ff_tensor2_filepath}\")\n", + " print(f\"HF: {hf_tensor}\\nFF:{ff_tensor}\")\n", + " print(np.isclose(ff_tensor, hf_tensor, atol=tolerance))\n", + " mismatches = np.where(~np.isclose(ff_tensor, hf_tensor, atol=tolerance))[0]\n", + " print(mismatches)\n", + " #print(np.nonzero(hf_tensor)[0])\n", + " # print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0])\n", + " # print(ff_tensor[36], hf_tensor[36])\n", + " #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\n", + " assert(len(mismatches) <= .05*len_hf_tensor)\n", + " print(\"Ok!\")\n", + "def compare_hf_tensors(tensor1_fp, tensor2_fp):\n", + " assert(os.path.exists(tensor1_fp) and os.path.exists(tensor2_fp))\n", + " hf_tensor1 = torch.load(tensor1_fp)\n", + " hf_tensor2 = torch.load(tensor2_fp)\n", + " if type(hf_tensor1) == tuple or type(hf_tensor1) == list:\n", + " assert(len(hf_tensor1) == 1)\n", + " hf_tensor1 = hf_tensor1[0]\n", + " if type(hf_tensor2) == tuple or type(hf_tensor2) == list:\n", + " assert(len(hf_tensor2) == 1)\n", + " hf_tensor2 = hf_tensor2[0]\n", + " assert(torch.squeeze(hf_tensor1).shape == torch.squeeze(hf_tensor2).shape)\n", + " hf_tensor1 = torch.nan_to_num(hf_tensor1)\n", + " hf_tensor2 = torch.nan_to_num(hf_tensor2)\n", + " if not (np.allclose(hf_tensor1.detach().cpu().numpy(), 
hf_tensor2.detach().cpu().numpy())):\n", + " print(f\"mismatch between {tensor1_fp} and {tensor2_fp}\")\n", + " print(hf_tensor1)\n", + " print(hf_tensor2)\n", + " print(np.isclose(hf_tensor1.detach().cpu().numpy(), hf_tensor2.detach().cpu().numpy()))\n", + " mismatches = np.where(~np.isclose(hf_tensor1.detach().cpu().numpy(), hf_tensor2.detach().cpu().numpy()))[0]\n", + " print(mismatches)\n", + " assert(False)\n", + " print(\"Ok!\")\n", + "\n", + "def check_hf_sum_tensors(tensor_sum_fp, tensor1_fp, tensor2_fp):\n", + " assert(os.path.exists(tensor_sum_fp) and os.path.exists(tensor1_fp) and os.path.exists(tensor2_fp))\n", + " hf_tensor_sum = torch.load(tensor_sum_fp)\n", + " hf_tensor1 = torch.load(tensor1_fp)\n", + " hf_tensor2 = torch.load(tensor2_fp)\n", + " if type(hf_tensor_sum) == tuple or type(hf_tensor_sum) == list:\n", + " assert(len(hf_tensor_sum) == 1)\n", + " hf_tensor_sum = hf_tensor_sum[0]\n", + " if type(hf_tensor1) == tuple or type(hf_tensor1) == list:\n", + " assert(len(hf_tensor1) == 1)\n", + " hf_tensor1 = hf_tensor1[0]\n", + " if type(hf_tensor2) == tuple or type(hf_tensor2) == list:\n", + " assert(len(hf_tensor2) == 1)\n", + " hf_tensor2 = hf_tensor2[0]\n", + " assert(torch.squeeze(hf_tensor_sum).shape == torch.squeeze(hf_tensor1).shape)\n", + " assert(torch.squeeze(hf_tensor1).shape == torch.squeeze(hf_tensor2).shape)\n", + " hf_tensor1 = torch.nan_to_num(hf_tensor1)\n", + " hf_tensor2 = torch.nan_to_num(hf_tensor2)\n", + " hf_tensor_sum = torch.nan_to_num(hf_tensor_sum)\n", + " sum_check_tensor = hf_tensor1 + hf_tensor2\n", + " if not (np.allclose(sum_check_tensor.detach().cpu().numpy(), hf_tensor_sum.detach().cpu().numpy())):\n", + " print(f\"mismatch between {sum_check_tensor} and {tensor1_fp} + {tensor2_fp}\")\n", + " print(tensor_sum_fp)\n", + " print(sum_check_tensor)\n", + " print(hf_tensor1)\n", + " print(hf_tensor2)\n", + " print(np.isclose(sum_check_tensor.detach().cpu().numpy(), hf_tensor_sum.detach().cpu().numpy()))\n", + " mismatches = np.where(~np.isclose(sum_check_tensor.detach().cpu().numpy(), hf_tensor_sum.detach().cpu().numpy()))[0]\n", + " print(mismatches)\n", + " assert(False)\n", + " print(\"Ok!\")\n", + "def check_hf_zero_tensor(hf_tensor_fp):\n", + " assert(os.path.exists(hf_tensor_fp))\n", + " hf_tensor1 = torch.load(hf_tensor_fp)\n", + " if type(hf_tensor1) == tuple or type(hf_tensor1) == list:\n", + " assert(len(hf_tensor1) == 1)\n", + " hf_tensor1 = hf_tensor1[0]\n", + " assert(torch.count_nonzero(torch.nan_to_num(hf_tensor1)).sum() == 0)\n", + "def print_tensors(hf_tensor_filepath, ff_tensor_filepath, txt=\"\"):\n", + " assert(os.path.exists(hf_tensor_filepath) and os.path.exists(ff_tensor_filepath))\n", + " hf_tensor = torch.load(hf_tensor_filepath)\n", + " if type(hf_tensor) == tuple or type(hf_tensor) == list:\n", + " assert(len(hf_tensor) == 1)\n", + " hf_tensor = hf_tensor[0]\n", + " hf_tensor = torch.nan_to_num(hf_tensor)\n", + " hf_tensor = hf_tensor.flatten().detach().cpu().numpy()\n", + " ff_tensor = np.loadtxt(ff_tensor_filepath, delimiter=',')\n", + "\n", + " len_hf_tensor = hf_tensor.shape[0]\n", + " ff_tensor = ff_tensor[:len_hf_tensor]\n", + "\n", + " print(f\"{txt} - HF tensor:\")\n", + " print(hf_tensor)\n", + " print(f\"{txt} - FF tensor: \")\n", + " print(ff_tensor)\n", + "def compare_flexflow_tensors(ff_tensor1_fp, ff_tensor2_fp, tolerance=1e-5, max_len=-1):\n", + " assert(os.path.exists(ff_tensor1_fp) and os.path.exists(ff_tensor2_fp))\n", + " ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=',')\n", + " ff_tensor2 = 
np.loadtxt(ff_tensor2_fp, delimiter=',')\n", + "\n", + " if (ff_tensor1.shape != ff_tensor2.shape):\n", + " print(ff_tensor1.shape, ff_tensor2.shape)\n", + " assert(ff_tensor1.shape == ff_tensor2.shape)\n", + "\n", + " if max_len > -1:\n", + " ff_tensor1 = ff_tensor1[:max_len]\n", + " ff_tensor2 = ff_tensor2[:max_len]\n", + " \n", + " mismatches = []\n", + " if not np.allclose(ff_tensor1, ff_tensor2, atol=tolerance):\n", + " print(f\"mismatch between {ff_tensor1_fp} and {ff_tensor2_fp}\")\n", + " print(f\"Tensor1: {ff_tensor1}\\nTensor2:{ff_tensor2}\")\n", + " print(np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))\n", + " mismatches = np.where(~np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))[0]\n", + " print(mismatches)\n", + " #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\n", + " assert(len(mismatches) <= .05*len(ff_tensor1))\n", + " print(\"Ok!\")\n", + "def compare_flexflow_tensors_shortest(ff_tensor1_fp, ff_tensor2_fp, tolerance=1e-5):\n", + " assert(os.path.exists(ff_tensor1_fp) and os.path.exists(ff_tensor2_fp))\n", + " ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=',')\n", + " ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=',')\n", + " minlen = min(ff_tensor1.shape[0], ff_tensor2.shape[0])\n", + " ff_tensor1 = ff_tensor1[:minlen]\n", + " ff_tensor2 = ff_tensor2[:minlen]\n", + " mismatches = []\n", + " if not np.allclose(ff_tensor1, ff_tensor2, atol=tolerance):\n", + " print(f\"mismatch between {ff_tensor1_fp} and {ff_tensor2_fp}\")\n", + " print(f\"Tensor1: {ff_tensor1}\\nTensor2:{ff_tensor2}\")\n", + " print(np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))\n", + " mismatches = np.where(~np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))[0]\n", + " print(mismatches)\n", + " #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\n", + " assert(len(mismatches) <= .05*len(ff_tensor1))\n", + " print(\"Ok!\")\n", + "def check_flexflow_tensors_sum(ff_tensor_sum_fp, ff_tensor1_fp, ff_tensor2_fp, tolerance=1e-5):\n", + " assert(os.path.exists(ff_tensor1_fp) and os.path.exists(ff_tensor2_fp))\n", + " ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=',')\n", + " ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=',')\n", + " ff_tensor_sum = np.loadtxt(ff_tensor_sum_fp, delimiter=',')\n", + " \n", + " ff_sum = ff_tensor1 + ff_tensor2\n", + " assert(ff_tensor1.shape == ff_tensor2.shape)\n", + " \n", + " mismatches = []\n", + " if not np.allclose(ff_tensor_sum, ff_sum, atol=tolerance):\n", + " print(f\"mismatch between {ff_tensor_sum_fp} and sum of {ff_tensor1_fp} + {ff_tensor2_fp}\")\n", + " print(f\"Tensor1: {ff_tensor1}\\nTensor2:{ff_tensor2}\")\n", + " print(f\"Sum Tensor: {ff_tensor_sum}\\nActual sum:{ff_sum}\")\n", + " print(np.isclose(ff_tensor_sum, ff_sum, atol=tolerance))\n", + " mismatches = np.where(~np.isclose(ff_tensor_sum, ff_sum, atol=tolerance))[0]\n", + " print(mismatches)\n", + " #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\n", + " assert(len(mismatches) <= .05*len(ff_tensor1))\n", + " print(\"Ok!\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + 
"Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "for layer_num in range(tot_num_layers):\n", + " hf_input_ln_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.input_layernorm.output_0\"\n", + " ff_input_ln_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_RMSNorm_shard-id_0_output_0\"\n", + " if layer_num > 0:\n", + " ff_input_ln_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_norm_shard-id_0_output_1\"\n", + " compare_tensors(hf_input_ln_out, ff_input_ln_out)\n", + " hf_attn_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.self_attn.o_proj.output_0\"\n", + " ff_attn_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_output_0\"\n", + " compare_tensors(hf_attn_out, ff_attn_out)\n", + " hf_ffn_norm_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.post_attention_layernorm.output_0\"\n", + " ff_ffn_norm_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_ffn_norm_shard-id_0_output_1\"\n", + " compare_tensors(hf_ffn_norm_out, ff_ffn_norm_out)\n", + " # w1\n", + " hf_gate_proj_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.gate_proj.output_0\"\n", + " ff_gate_proj_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_output_0\"\n", + " compare_tensors(hf_gate_proj_out, ff_gate_proj_out)\n", + " # w3\n", + " hf_up_proj_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.up_proj.output_0\" \n", + " ff_up_proj_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w3_shard-id_0_output_0\"\n", + " compare_tensors(hf_up_proj_out, ff_up_proj_out)\n", + " # w2\n", + " hf_down_proj_in = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.input_0\"\n", + " hf_down_proj_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.output_0\"\n", + " ff_down_proj_in = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_input_0\"\n", + " ff_down_proj_out = 
f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_output_0\"\n", + " compare_tensors(hf_down_proj_in, ff_down_proj_in)\n", + " # compare_tensors(hf_down_proj_out, ff_down_proj_out)\n", + " # LORA input\n", + " hf_lora_A_in = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_A.default.input_0\"\n", + " ff_lora_A_in = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_input_0\"\n", + " compare_hf_tensors(hf_down_proj_in, hf_lora_A_in)\n", + " compare_tensors(hf_lora_A_in, ff_lora_A_in)\n", + " # LORA weights\n", + " hf_lora_A_weight_fp = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_A.default.weight\"\n", + " ff_lora_A_weight_fp = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_A\"\n", + " compare_tensors(hf_lora_A_weight_fp, ff_lora_A_weight_fp)\n", + " hf_lora_B_weight_fp = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_B.default.weight\"\n", + " ff_lora_B_weight_fp = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_B\"\n", + " compare_tensors(hf_lora_B_weight_fp, ff_lora_B_weight_fp)\n", + " # LORA intermediate hf\n", + " hf_lora_A_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_A.default.output_0\"\n", + " hf_lora_B_in = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_B.default.input_0\"\n", + " compare_hf_tensors(hf_lora_A_out, hf_lora_B_in)\n", + " # LORA output\n", + " hf_lora_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_B.default.output_0\"\n", + " ff_lora_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_output_0\"\n", + " # compare_tensors(hf_lora_out, ff_lora_out)\n", + " # compare_flexflow_tensors(ff_down_proj_out, ff_lora_out)\n", + " # compare_tensors(hf_down_proj_out, ff_lora_out)\n", + " compare_tensors_difference(hf_lora_out, ff_lora_out, ff_down_proj_out)\n", + " \n", + "\n", + "# After last layer only\n", + "hf_norm_out = f\"{hf_weight_base_path}/fwd_step_0_norm.output_0\"\n", + "ff_norm_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{tot_num_layers-1}_layer-name_norm_shard-id_0_output_1\"\n", + "compare_tensors(hf_norm_out, ff_norm_out)\n", + "hf_lm_head_out = f\"{hf_weight_base_path}/fwd_step_0_base_model.model.lm_head.output_0\"\n", + "ff_lm_head_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{tot_num_layers-1}_layer-name_output_shard-id_0_output_0\"\n", + "compare_tensors(hf_lm_head_out, ff_lm_head_out)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "\n", + "ff_BWD_softmax_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_100_layer-name_Softmax_shard-id_0_input_0\"\n", + "\n", + "hf_BWD_lm_head_out = f\"{hf_weight_base_path}/bwd_step_0_base_model.model.lm_head.go_0\"\n", + "ff_BWD_lm_head_out = 
f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_output_shard-id_0_output_0\"\n", + "compare_tensors(hf_BWD_lm_head_out, ff_BWD_lm_head_out, tolerance=1e-5)\n", + "# compare weights\n", + "hf_lm_head_weight = f\"{hf_weight_base_path}/base_model.model.lm_head.weight\"\n", + "ff_lm_head_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{tot_num_layers-1}_layer-name_output_shard-id_0_weight_0\"\n", + "compare_tensors(hf_lm_head_weight, ff_lm_head_weight, tolerance=1e-5)\n", + "hf_BWD_lm_head_in = f\"{hf_weight_base_path}/bwd_step_0_base_model.model.lm_head.gi_0\"\n", + "ff_BWD_lm_head_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_output_shard-id_0_input_0\"\n", + "compare_tensors(hf_BWD_lm_head_in, ff_BWD_lm_head_in, tolerance=1e-5)\n", + "# # Manually check the matmul\n", + "# ff_tensor_out = np.loadtxt(ff_BWD_lm_head_out, delimiter=',')\n", + "# ff_weight = np.loadtxt(ff_lm_head_weight, delimiter=',').reshape((4096,32000), order='F')\n", + "# ff_tensor_out = ff_tensor_out[:32000*24].reshape((32000,24), order='F')\n", + "# print(ff_tensor_out.shape)\n", + "# print(ff_weight.shape)\n", + "# print(np.matmul(ff_weight, ff_tensor_out))\n", + "# compare_tensors(hf_BWD_lm_head_in, ff_BWD_lm_head_in)\n", + "# ff_tensor = np.loadtxt(ff_tensor_filepath, delimiter=',')\n", + "\n", + "hf_BWD_norm_out = f\"{hf_weight_base_path}/bwd_step_0_norm.go_0\"\n", + "ff_BWD_norm_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_norm_shard-id_0_output_0\"\n", + "compare_hf_tensors(hf_BWD_lm_head_in, hf_BWD_norm_out)\n", + "compare_tensors(hf_BWD_norm_out, ff_BWD_norm_out)\n", + "ff_BWD_norm_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{tot_num_layers-1}_layer-name_norm_shard-id_0_weight_0\"\n", + "hf_FWD_norm_weight = f\"{hf_weight_base_path}/base_model.model.model.norm.weight\"\n", + "compare_tensors(hf_FWD_norm_weight, ff_BWD_norm_weight, tolerance=1e-5)\n", + "hf_BWD_norm_in = f\"{hf_weight_base_path}/bwd_step_0_norm.gi_0\"\n", + "ff_BWD_norm_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_norm_shard-id_0_input_1\"\n", + "compare_tensors(hf_BWD_norm_in, ff_BWD_norm_in, tolerance=1e-5)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from torch import nn\n", + "class LlamaRotaryEmbedding(nn.Module):\n", + " def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):\n", + " super().__init__()\n", + "\n", + " self.dim = dim\n", + " self.max_position_embeddings = max_position_embeddings\n", + " self.base = base\n", + " inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))\n", + " self.register_buffer(\"inv_freq\", inv_freq, persistent=False)\n", + "\n", + " # Build here to make `torch.jit.trace` work.\n", + " self._set_cos_sin_cache(\n", + " seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()\n", + " )\n", + "\n", + " def _set_cos_sin_cache(self, seq_len, device, dtype):\n", + " self.max_seq_len_cached = seq_len\n", + " t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)\n", + "\n", + " freqs = torch.einsum(\"i,j->ij\", t, self.inv_freq)\n", + " # Different from paper, but it uses a different permutation in order to obtain the same calculation\n", + " emb = torch.cat((freqs, freqs), dim=-1)\n", + " 
self.register_buffer(\"cos_cached\", emb.cos().to(dtype), persistent=False)\n", + " self.register_buffer(\"sin_cached\", emb.sin().to(dtype), persistent=False)\n", + "\n", + " def forward(self, x, seq_len=None):\n", + " # x: [bs, num_attention_heads, seq_len, head_size]\n", + " if seq_len > self.max_seq_len_cached:\n", + " self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)\n", + "\n", + " return (\n", + " self.cos_cached[:seq_len].to(dtype=x.dtype),\n", + " self.sin_cached[:seq_len].to(dtype=x.dtype),\n", + " )\n", + "def rotate_half(x):\n", + " \"\"\"Rotates half the hidden dims of the input.\"\"\"\n", + " x1 = x[..., : x.shape[-1] // 2] # first half\n", + " x2 = x[..., x.shape[-1] // 2 :] # second half\n", + " return torch.cat((x2, -x1), dim=-1)\n", + "def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):\n", + " \"\"\"Applies Rotary Position Embedding to the query and key tensors.\n", + "\n", + " Args:\n", + " q (`torch.Tensor`): The query tensor.\n", + " k (`torch.Tensor`): The key tensor.\n", + " cos (`torch.Tensor`): The cosine part of the rotary embedding.\n", + " sin (`torch.Tensor`): The sine part of the rotary embedding.\n", + " position_ids (`torch.Tensor`):\n", + " The position indices of the tokens corresponding to the query and key tensors. For example, this can be\n", + " used to pass offsetted position ids when working with a KV-cache.\n", + " unsqueeze_dim (`int`, *optional*, defaults to 1):\n", + " The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and\n", + " sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note\n", + " that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and\n", + " k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes\n", + " cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. 
Similarly, if q and k have\n", + " the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.\n", + " Returns:\n", + " `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.\n", + " \"\"\"\n", + " cos = cos[position_ids].unsqueeze(unsqueeze_dim)\n", + " sin = sin[position_ids].unsqueeze(unsqueeze_dim)\n", + " q_embed = (q * cos) + (rotate_half(q) * sin)\n", + " k_embed = (k * cos) + (rotate_half(k) * sin)\n", + " return q_embed, k_embed\n", + "head_dim = 64\n", + "max_position_embeddings = 2048\n", + "rope_theta=10_000\n", + "kv_seq_len = 24\n", + "rotary_emb = LlamaRotaryEmbedding(\n", + " head_dim,\n", + " max_position_embeddings=max_position_embeddings,\n", + " base=rope_theta,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Huggingface checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "Huggingface-FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_bwd-step_0_layer-num_11_layer-name_SigmoidSiluMulti_shard-id_0_output_0\n", + "HF: [ 6.4350547e+03 -6.4898600e+05 1.1761116e+05 ... 2.1410337e+01\n", + " 1.2096541e+01 3.6424692e+00]\n", + "FF:[ 6.43506250e+03 -6.48986000e+05 1.17611156e+05 ... 2.14103374e+01\n", + " 1.20965424e+01 3.64246750e+00]\n", + "[ True True True ... True True True]\n", + "[2394]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_bwd-step_0_layer-num_11_layer-name_layers_11_feed_forward_w2_shard-id_0_input_0\n", + "HF: [ 6.4350547e+03 -6.4898600e+05 1.1761116e+05 ... 2.1410337e+01\n", + " 1.2096541e+01 3.6424692e+00]\n", + "FF:[ 6.43506250e+03 -6.48986000e+05 1.17611156e+05 ... 2.14103374e+01\n", + " 1.20965424e+01 3.64246750e+00]\n", + "[ True True True ... 
True True True]\n", + "[2394]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "4.383680555555555% mismatch in QK prods softmax out grad\n", + "3.9116753472222223% mismatch between HF and FF for kproj (before applying ROPE)\n", + "3.9008246527777777% mismatch between HF and FF for kproj (after applying ROPE)\n", + "4.817708333333334% mismatch in attention input grads\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[11], line 353\u001b[0m\n\u001b[1;32m 349\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpct_mismatch\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m100\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m% mismatch in attention input grads\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 350\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(pct_mismatch \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0.05\u001b[39m)\n\u001b[0;32m--> 353\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "for layer_num in range(tot_num_layers-1, -1, -1):\n", + " # HuggingFace filepaths\n", + " hf_BWD_norm_in = f\"{hf_weight_base_path}/bwd_step_0_norm.gi_0\"\n", + " hf_BWD_loraB_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.lora_B.default.go_0\"\n", + " hf_BWD_loraB_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.lora_B.default.gi_0\"\n", + " hf_BWD_loraA_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.lora_A.default.go_0\"\n", + " hf_BWD_loraA_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.lora_A.default.gi_0\"\n", + " hf_loraA_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_A.default.weight\"\n", + " hf_loraB_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_B.default.weight\"\n", + " hf_BWD_lora_dropout_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.lora_dropout.default.go_0\"\n", + " hf_BWD_lora_dropout_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.lora_dropout.default.gi_0\"\n", + " hf_BWD_w2_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.go_0\"\n", + " hf_BWD_w2_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.gi_0\"\n", + " hf_w2_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.weight\"\n", + " hf_BWD_w3_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.up_proj.go_0\"\n", + " hf_BWD_w3_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.up_proj.gi_0\"\n", + " hf_BWD_w1_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.gate_proj.go_0\"\n", + " hf_BWD_w1_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.gate_proj.gi_0\"\n", + " hf_BWD_act_fn_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.act_fn.gi_0\"\n", + " hf_BWD_act_fn_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.act_fn.go_0\"\n", + " 
hf_BWD_ffn_norm_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.post_attention_layernorm.go_0\"\n", + " hf_BWD_ffn_norm_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.post_attention_layernorm.gi_0\"\n", + " hf_BWD_attn_out_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.o_proj.go_0\"\n", + " hf_BWD_attn_q_in = f\"{hf_weight_base_path}/bwd_step_0_layers.11.self_attn.q_proj.gi_0\"\n", + " hf_FWD_w1_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.gate_proj.output_0\"\n", + " hf_FWD_w3_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.up_proj.output_0\"\n", + " hf_FWD_act_fn_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.act_fn.output_0\"\n", + " hf_BWD_attn_oproj_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.o_proj.gi_0\"\n", + " hf_attn_qproj_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.self_attn.q_proj.weight\"\n", + " hf_attn_kproj_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.self_attn.k_proj.weight\"\n", + " hf_attn_vproj_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.self_attn.v_proj.weight\"\n", + " hf_attn_oproj_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.self_attn.o_proj.weight\"\n", + " # hf_BWD_attn_vproj_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.v_proj.gi_0\"\n", + " # FlexFlow filepaths\n", + " ff_BWD_w2_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_output_0\"\n", + " ff_BWD_w2_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_input_0\"\n", + " ff_BWD_w2_in_pre = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_pre_input_0\"\n", + " ff_w2_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_weight_0\"\n", + " ff_BWD_ssm_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_SigmoidSiluMulti_shard-id_0_output_0\"\n", + " ff_BWD_ssm_in1 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_SigmoidSiluMulti_shard-id_0_input_0\"\n", + " ff_BWD_ssm_in2 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_SigmoidSiluMulti_shard-id_0_input_1\"\n", + " ff_BWD_w3_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w3_shard-id_0_output_0\"\n", + " ff_BWD_w3_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w3_shard-id_0_input_0\"\n", + " ff_BWD_lora_A_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_input_0\"\n", + " ff_BWD_lora_B_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_output_0\"\n", + " ff_lora_A_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_A\"\n", + " ff_lora_B_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_B\"\n", + " 
ff_BWD_w1_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_output_0\"\n", + " ff_BWD_w1_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_input_0\"\n", + " ff_BWD_w1_in_pre = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_pre_input_0\"\n", + " ff_w1_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_weight_0\"\n", + " ff_BWD_ffn_norm_in1 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_ffn_norm_shard-id_0_input_0\"\n", + " ff_BWD_ffn_norm_in2 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_ffn_norm_shard-id_0_input_1\"\n", + " ff_BWD_ffn_norm_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_ffn_norm_shard-id_0_output_0\"\n", + " ff_BWD_attn_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_output_0\"\n", + " ff_BWD_attn_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_input_0\"\n", + " ff_BWD_ssm_cached_w1_input = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_SigmoidSiluMulti_shard-id_0_cached_w1_output\"\n", + " ff_BWD_ssm_cached_w3_input = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_SigmoidSiluMulti_shard-id_0_cached_w3_output\"\n", + " ff_FWD_w1_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_output_0\"\n", + " ff_FWD_w3_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w3_shard-id_0_output_0\"\n", + " ff_FWD_act_fnc_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_SigmoidSiluMulti_shard-id_0_act_fn_output\"\n", + " ff_BWD_attn_o_proj_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_o_proj_in_grad\"\n", + " # ff_BWD_attn_v_proj_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_v_proj_in_grad\"\n", + " ff_attn_oproj_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_11_layer-name_layers_11_attention_shard-id_0_weight_0\"\n", + " # ff_attn_qk_prods_softmax = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax\"\n", + "\n", + " # xxx = torch.load(hf_BWD_attn_out_out)\n", + " # xxx.detach().cpu().numpy().tofile(f\"{hf_BWD_attn_out_out}.flexflow\")\n", + " # print(f\"{hf_BWD_attn_out_out}.flexflow\")\n", + " \n", + " # HuggingFace checks\n", + " print(\"\\nHuggingface checks:\")\n", + " if layer_num == tot_num_layers-1:\n", + " compare_hf_tensors(hf_BWD_norm_in, hf_BWD_loraB_out)\n", + " compare_hf_tensors(hf_BWD_norm_in, hf_BWD_w2_out)\n", + " compare_hf_tensors(hf_BWD_loraB_out, hf_BWD_w2_out)\n", + " compare_hf_tensors(hf_BWD_loraB_in, hf_BWD_loraA_out)\n", + " # compare_hf_tensors(hf_BWD_w3_out, hf_BWD_w2_out)\n", + " compare_hf_tensors(hf_BWD_act_fn_in, hf_BWD_w1_out)\n", + " 
check_hf_sum_tensors(hf_BWD_ffn_norm_out, hf_BWD_w1_in, hf_BWD_w3_in)\n", + " check_hf_sum_tensors(hf_BWD_attn_out_out, hf_BWD_ffn_norm_in, hf_BWD_norm_in)\n", + "\n", + " # FlexFlow checks\n", + " print(\"\\nFlexFlow checks:\")\n", + " compare_flexflow_tensors(ff_BWD_w2_out, ff_BWD_lora_B_out)\n", + " compare_flexflow_tensors(ff_BWD_w2_in_pre, ff_BWD_lora_A_in)\n", + " compare_flexflow_tensors(ff_BWD_w2_in, ff_BWD_ssm_out)\n", + " compare_flexflow_tensors(ff_BWD_ssm_in2, ff_BWD_w3_out)\n", + " compare_flexflow_tensors(ff_BWD_ssm_in1, ff_BWD_w1_out)\n", + " compare_flexflow_tensors(ff_BWD_w1_in, ff_BWD_ffn_norm_out)\n", + " compare_flexflow_tensors(ff_BWD_w1_in_pre, ff_BWD_w3_in)\n", + " compare_flexflow_tensors(ff_BWD_ffn_norm_in1, ff_BWD_ffn_norm_in2, max_len=24*768)\n", + " #compare_flexflow_tensors(ff_BWD_ffn_norm_in2, ff_BWD_attn_out, max_len=24*768) # should fail\n", + "\n", + " # HF-FlexFlow checks\n", + " print(\"\\nHuggingface-FlexFlow checks:\")\n", + " compare_tensors(hf_BWD_w2_out, ff_BWD_w2_out, tolerance=1e-5)\n", + " compare_tensors(hf_w2_weight, ff_w2_weight, tolerance=1e-5)\n", + " #print(torch.load(hf_w2_weight).shape)\n", + " compare_tensors(hf_loraA_weight, ff_lora_A_weight, tolerance=1e-5)\n", + " compare_tensors(hf_loraB_weight, ff_lora_B_weight, tolerance=1e-5)\n", + "\n", + " compare_tensors(hf_BWD_loraB_out, ff_BWD_lora_B_out)\n", + " compare_tensors(hf_BWD_loraA_in, ff_BWD_lora_A_in)\n", + "\n", + " compare_tensors(hf_BWD_w2_in, ff_BWD_ssm_out)\n", + " compare_tensors(hf_BWD_w2_in, ff_BWD_w2_in)\n", + " compare_tensors(hf_BWD_w1_out, ff_BWD_w1_out)\n", + " compare_tensors_difference(hf_BWD_w1_in, ff_BWD_w1_in, ff_BWD_w1_in_pre)\n", + "\n", + " compare_tensors(hf_FWD_w1_out, ff_FWD_w1_out)\n", + " compare_tensors(hf_FWD_w3_out, ff_FWD_w3_out)\n", + " compare_tensors(hf_BWD_w3_out, ff_BWD_w3_out)\n", + " compare_tensors(hf_BWD_w3_in, ff_BWD_w3_in)\n", + " compare_tensors(hf_BWD_w1_out, ff_BWD_w1_out)\n", + " # compare_tensors(hf_BWD_ffn_norm_out, ff_BWD_ffn_norm_out)\n", + " # compare_tensors(hf_BWD_ffn_norm_in, ff_BWD_ffn_norm_in2)\n", + " # compare_tensors(hf_BWD_attn_out_out, ff_BWD_ffn_norm_in2)\n", + " compare_tensors(hf_BWD_attn_out_out, ff_BWD_attn_out)\n", + "\n", + " # compare attn weight tensors\n", + " hidden_size = 768\n", + " qProjSize = 64\n", + " num_heads = 12\n", + " num_new_tokens = num_tokens = 24\n", + " ff_attn_weight_tensor = np.loadtxt(ff_attn_oproj_weight, delimiter=',')\n", + " ff_attn_qproj_weight_tensor = ff_attn_weight_tensor[:hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F')\n", + " ff_attn_kproj_weight_tensor = ff_attn_weight_tensor[hidden_size*qProjSize*num_heads:2*hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F')\n", + " ff_attn_vproj_weight_tensor = ff_attn_weight_tensor[2*hidden_size*qProjSize*num_heads:3*hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F')\n", + " ff_attn_oproj_weight_tensor = ff_attn_weight_tensor[3*hidden_size*qProjSize*num_heads:].reshape((qProjSize*num_heads,hidden_size), order='F')\n", + " \n", + " hf_attn_qproj_weight_tensor = torch.load(hf_attn_qproj_weight).T.detach().cpu().numpy()\n", + " hf_attn_kproj_weight_tensor = torch.load(hf_attn_kproj_weight).T.detach().cpu().numpy()\n", + " hf_attn_vproj_weight_tensor = torch.load(hf_attn_vproj_weight).T.detach().cpu().numpy()\n", + " hf_attn_oproj_weight_tensor = torch.load(hf_attn_oproj_weight).T.detach().cpu().numpy()\n", + " \n", + " 
assert(np.allclose(ff_attn_qproj_weight_tensor, hf_attn_qproj_weight_tensor, atol=1e-5))\n", + " assert(np.allclose(ff_attn_kproj_weight_tensor, hf_attn_kproj_weight_tensor, atol=1e-5))\n", + " assert(np.allclose(ff_attn_vproj_weight_tensor, hf_attn_vproj_weight_tensor, atol=1e-5))\n", + " assert(np.allclose(ff_attn_oproj_weight_tensor, hf_attn_oproj_weight_tensor, atol=1e-5))\n", + " \n", + " # Compare attn outproj grad in tensors\n", + " compare_tensors(hf_BWD_attn_oproj_in, ff_BWD_attn_o_proj_in)\n", + " \n", + " ########### Compare value projs grads ######################\n", + " # 1. compare qk prods softmax\n", + " hf_qk_prods_softmax = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.self_attn.qk_prods_softmax\"\n", + " ff_attn_qk_prods_softmax = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax\"\n", + " \n", + " hf_qk_prods_softmax = torch.load(hf_qk_prods_softmax)\n", + " ff_qk_prods_softmax = np.loadtxt(ff_attn_qk_prods_softmax, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + "\n", + " for head_idx in range(num_heads):\n", + " hf_qkps = hf_qk_prods_softmax.squeeze()[head_idx, :, :].detach().cpu().numpy()\n", + " ff_qkps = ff_qk_prods_softmax[:,:,head_idx]\n", + " assert(np.allclose(ff_qkps, hf_qkps, atol=1e-5))\n", + " \n", + " # 2. compare attn heads grads\n", + " hf_attn_heads_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.o_proj.gi_0\"\n", + " ff_attn_heads_grads = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_o_proj_in_grad\"\n", + "\n", + " hf_attn_heads_grads = torch.load(hf_attn_heads_grads).T.squeeze().detach().cpu().numpy()\n", + " ff_attn_heads_grads = np.loadtxt(ff_attn_heads_grads, delimiter=',').reshape((qProjSize*num_heads, num_new_tokens), order = 'F')\n", + " assert(np.allclose(ff_attn_heads_grads, hf_attn_heads_grads, atol=1e-2))\n", + "\n", + " # 3. 
vproj grads\n", + " hf_vproj_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.v_proj.go_0\"\n", + " ff_vproj_grads = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_v_proj_in_grad\"\n", + "\n", + " hf_vproj_grads = torch.load(hf_vproj_grads).squeeze().detach().cpu().numpy()\n", + " ff_vproj_grads = np.loadtxt(ff_vproj_grads, delimiter=',').reshape((num_tokens, qProjSize*num_heads), order='F')\n", + " assert(np.allclose(hf_vproj_grads, ff_vproj_grads, atol=1e-2))\n", + "\n", + " \n", + " \n", + " \n", + " ##############################\n", + " hf_value_states = f\"{hf_weight_base_path}/fwd_step_0_layers.11.self_attn.value_states\"\n", + " hf_value_states = torch.load(hf_value_states).squeeze().permute(2,0,1).detach().cpu().numpy()\n", + " # print(hf_value_states.shape)\n", + " ff_value_states = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_vcache\"\n", + " ff_value_states = np.loadtxt(ff_value_states, delimiter=',').reshape((qProjSize, num_heads, num_tokens), order='F')\n", + " # print(ff_value_states.shape)\n", + " assert(np.allclose(hf_value_states, ff_value_states, atol=1e-2))\n", + " \n", + " \n", + " \n", + " ########## Compare key and query projs grads ##################\n", + " ff_devQKVPRojArray = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_devQKVPRojArray\"\n", + " ff_devQKVPRojArray = np.loadtxt(ff_devQKVPRojArray, delimiter=',').reshape((num_tokens, qProjSize*num_heads, 3), order = 'F')\n", + " ff_qProjGrads = ff_devQKVPRojArray[:,:,0]\n", + " ff_kProjGrads = ff_devQKVPRojArray[:,:,1]\n", + " ff_vProjGrads = ff_devQKVPRojArray[:,:,2]\n", + " assert(np.allclose(ff_vProjGrads, ff_vproj_grads, atol=1e-5))\n", + "\n", + " # simulate qk_prods_softmax\n", + " ff_attn_heads_grads = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_o_proj_in_grad\"\n", + " ff_attn_heads_grads = np.loadtxt(ff_attn_heads_grads, delimiter=',').reshape((qProjSize,num_heads, num_new_tokens), order = 'F')\n", + " ff_attn_heads_grads = torch.from_numpy(ff_attn_heads_grads)\n", + " ff_attn_heads_grads = ff_attn_heads_grads.permute(1,2,0)\n", + " ff_value_states = torch.from_numpy(ff_value_states)\n", + " ff_value_states = ff_value_states.permute(1,0,2)\n", + " # print(ff_attn_heads_grads.shape)\n", + " # print(ff_value_states.shape)\n", + " simulated_qk_prods_softmax_grads = torch.matmul(ff_attn_heads_grads, ff_value_states)\n", + " #simulated_qk_prods_softmax_grads = simulated_qk_prods_softmax_grads\n", + " #print(\"Simulated QK prods grads:\")\n", + " #print(simulated_qk_prods_softmax_grads[0,:,:])\n", + "\n", + " # qk prods softmax right before softmax\n", + " hf_qk_prods_softmax2 = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.softmax_op.go_0\"\n", + " hf_qk_prods_softmax2 = torch.load(hf_qk_prods_softmax2)\n", + " ff_qk_prods_softmax2 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax_grad\"\n", + " ff_qk_prods_softmax2 = np.loadtxt(ff_qk_prods_softmax2, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.squeeze().permute(1,2,0)\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.detach().cpu().numpy()\n", + " # 
assert(np.allclose(ff_qk_prods_softmax2, hf_qk_prods_softmax2, atol=1e-2))\n", + " mismatches = np.where(~np.isclose(ff_qk_prods_softmax2, hf_qk_prods_softmax2))\n", + " mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (hf_qk_prods_softmax2.shape[0] * hf_qk_prods_softmax2.shape[1] * hf_qk_prods_softmax2.shape[2])\n", + " print(f\"{pct_mismatch*100}% mismatch in QK prods softmax out grad\")\n", + " assert(pct_mismatch <= 0.05)\n", + "\n", + " # qk prods softmax right after softmax\n", + " hf_qk_prods_softmax2 = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.softmax_op.gi_0\"\n", + " hf_qk_prods_softmax2 = torch.load(hf_qk_prods_softmax2)\n", + " ff_qk_prods_softmax2 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax_grad_in\"\n", + " ff_qk_prods_softmax2 = np.loadtxt(ff_qk_prods_softmax2, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.squeeze().permute(1,2,0)\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.detach().cpu().numpy()\n", + " assert(np.allclose(ff_qk_prods_softmax2, hf_qk_prods_softmax2, atol=1e-2))\n", + " \n", + " # qk prods softmax after mask\n", + " hf_qk_prods_softmax2 = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.matmul_op.go_0\"\n", + " hf_qk_prods_softmax2 = torch.load(hf_qk_prods_softmax2)\n", + " ff_qk_prods_softmax2 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax_grad_in_masked\"\n", + " ff_qk_prods_softmax2 = np.loadtxt(ff_qk_prods_softmax2, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.squeeze().permute(1,2,0)\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.detach().cpu().numpy()\n", + " assert(np.allclose(ff_qk_prods_softmax2, hf_qk_prods_softmax2, atol=1e-2))\n", + "\n", + " # Compare query activation\n", + " hf_query_activation = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.self_attn.query_activation\"\n", + " hf_query_activation = torch.load(hf_query_activation)\n", + " ff_query_activation = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_query_activation\"\n", + " ff_query_activation = np.loadtxt(ff_query_activation, delimiter=',').reshape((qProjSize, num_heads, num_new_tokens), order = 'F')\n", + " hf_query_activation = hf_query_activation.squeeze().permute(2,0,1).detach().cpu().numpy()\n", + " assert(np.allclose(ff_query_activation, hf_query_activation, atol=1e-2))\n", + " \n", + " ########################################## ROPE and Kproj ##########################################\n", + "\n", + " # Compare FF kproj with intermediate kproj data from HF\n", + " hf_kproj_grads_post_rotary = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.identity_kv_post_rotary.go_0\"\n", + " hf_kproj_grads_post_rotary = torch.load(hf_kproj_grads_post_rotary)\n", + " hf_kproj_grads_post_rotary_copy = hf_kproj_grads_post_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", + " # print(\"hf_kproj_grads_post_rotary: \", hf_kproj_grads_post_rotary_copy.shape)\n", + " # print(hf_kproj_grads_post_rotary_copy[:,:,0])\n", + " # Check hf ROPE \n", + " cos, sin = rotary_emb(hf_kproj_grads_post_rotary, seq_len=24)\n", 
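+ "    # the .cuda() calls below move the rotary cos/sin caches to the GPU, matching the device of the HF gradient tensors loaded above\n",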
+ " cos = cos.cuda()\n", + " sin = sin.cuda()\n", + " # query_states: torch.Size([1, 12, 24, 64])\n", + " # key_states: torch.Size([1, 12, 24, 64])\n", + " # position_ids: torch.Size([1, 24])\n", + " # tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", + " # 18, 19, 20, 21, 22, 23]], device='cuda:0')\n", + " query_states = torch.zeros([1, 12, 24, 64]).cuda()\n", + " position_ids = torch.arange(24).unsqueeze(0).cuda()\n", + " query_states, hf_kproj_grads_post_rotary = apply_rotary_pos_emb(query_states, hf_kproj_grads_post_rotary, cos, sin, position_ids)\n", + " hf_kproj_grads_post_rotary = hf_kproj_grads_post_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", + " # print(\"hf_kproj_grads_post_rotary: \", hf_kproj_grads_post_rotary.shape)\n", + " # print(hf_kproj_grads_post_rotary[:,:,0])\n", + " \n", + " hf_kproj_grads_before_rotary = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.identity_kv_before_rotary.go_0\"\n", + " hf_kproj_grads_before_rotary = torch.load(hf_kproj_grads_before_rotary)\n", + " hf_kproj_grads_before_rotary = hf_kproj_grads_before_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", + " # print(\"hf_kproj_grads_before_rotary: \", hf_kproj_grads_before_rotary.shape)\n", + " # print(hf_kproj_grads_before_rotary[:,:,0])\n", + " # Compare HF rope with manual ROPE\n", + " assert(np.allclose(hf_kproj_grads_post_rotary, hf_kproj_grads_before_rotary, atol=1e-5))\n", + " # Compare HF Kproj with FF Kproj (before ROPE) \n", + " ff_kproj_pre = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_devkproj_pre\"\n", + " ff_kproj_pre = np.loadtxt(ff_kproj_pre, delimiter=',').reshape((num_tokens, qProjSize, num_heads), order = 'F')\n", + " # print(\"ff_kproj_pre: \", ff_kproj_pre.shape)\n", + " #print(ff_kproj_pre[:,:,0])\n", + " mismatches = np.where(~np.isclose(ff_kproj_pre, hf_kproj_grads_post_rotary_copy, atol=1e-5))\n", + " mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (ff_kproj_pre.shape[0] * ff_kproj_pre.shape[1] * ff_kproj_pre.shape[2])\n", + " print(f\"{pct_mismatch*100}% mismatch between HF and FF for kproj (before applying ROPE)\")\n", + " assert(pct_mismatch <= 0.05)\n", + " #assert(np.allclose(ff_kproj_pre, hf_kproj_grads_post_rotary_copy, atol=1e-5))\n", + " \n", + " ff_kproj = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_devkproj\"\n", + " ff_kproj = np.loadtxt(ff_kproj, delimiter=',').reshape((num_tokens, qProjSize, num_heads), order = 'F')\n", + " # print(\"ff_kproj: \", ff_kproj.shape)\n", + " #print(ff_kproj[:,:,0])\n", + " mismatches = np.where(~np.isclose(ff_kproj, hf_kproj_grads_before_rotary, atol=1e-5))\n", + " mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (ff_kproj.shape[0] * ff_kproj.shape[1] * ff_kproj.shape[2])\n", + " print(f\"{pct_mismatch*100}% mismatch between HF and FF for kproj (after applying ROPE)\")\n", + " assert(pct_mismatch <= 0.05)\n", + " #assert(np.allclose(ff_kproj, hf_kproj_grads_before_rotary, atol=1e-5))\n", + " \n", + " \n", + " #assert(np.allclose(hf_kproj_grads_post_rotary, hf_kproj_grads_before_rotary, atol=1e-2))\n", + " hf_kproj_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.k_proj.go_0\"\n", + " hf_kproj_grads = 
torch.load(hf_kproj_grads).squeeze()\n", + " #print(\"hf_kproj_grads: \", hf_kproj_grads.shape)\n", + " #print(hf_kproj_grads[:,:64])\n", + " reshaped_tensor = hf_kproj_grads.view(24, 12, 64).transpose(1, 2).contiguous().detach().cpu().numpy()\n", + " #print(reshaped_tensor.shape)\n", + " assert(np.allclose(ff_kproj, reshaped_tensor, atol=1e-2))\n", + "\n", + " ########################################## Qproj (with ROPE) ##########################################\n", + "\n", + " # Compare QProj\n", + " hf_qproj_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.q_proj.go_0\"\n", + " hf_qproj_grads = torch.load(hf_qproj_grads).squeeze()\n", + " # print(\"HF Qproj:\")\n", + " # print(hf_qproj_grads.shape)\n", + " reshaped_tensor = hf_qproj_grads.view(24, 12, 64).transpose(1, 2).contiguous().detach().cpu().numpy()\n", + " # print(\"\\t reshaped: \", reshaped_tensor.shape)\n", + " # print(reshaped_tensor[:,:,0])\n", + " ff_qproj = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_devQKVPRojArray\"\n", + " ff_qproj = np.loadtxt(ff_qproj, delimiter=',').reshape((num_tokens, qProjSize, num_heads, 3), order = 'F')[:,:,:,0]\n", + " # print(\"FF Qproj:\")\n", + " # print(ff_qproj.shape)\n", + " # print(ff_qproj[:,:,0])\n", + " assert(np.allclose(ff_qproj, reshaped_tensor, atol=1e-2))\n", + "\n", + " hf_attn_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.input_layernorm.go_0\"\n", + " hf_attn_in = torch.load(hf_attn_in)\n", + " # print(\"hf_attn_in: \", hf_attn_in.shape)\n", + " hf_attn_in = hf_attn_in.squeeze().T\n", + " hf_attn_in = hf_attn_in.detach().cpu().numpy()\n", + " # print(\"hf_attn_in: \", hf_attn_in.shape)\n", + " # print(hf_attn_in)\n", + "\n", + " ff_attn_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_attn_final_grad_in\"\n", + " ff_attn_in = np.loadtxt(ff_attn_in, delimiter=',').reshape((768,num_tokens), order = 'F')\n", + " # print(\"ff_attn_in: \", ff_attn_in.shape)\n", + " # print(ff_attn_in)\n", + " #assert(np.allclose(ff_attn_in, hf_attn_in, atol=1e-2))\n", + "\n", + " mismatches = np.where(~np.isclose(ff_attn_in, hf_attn_in))\n", + " mismatches = [(mismatches[0][i], mismatches[1][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (hf_attn_in.shape[0] * hf_attn_in.shape[1])\n", + " print(f\"{pct_mismatch*100}% mismatch in attention input grads\")\n", + " assert(pct_mismatch <= 0.05)\n", + " \n", + "\n", + " assert False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([12, 24, 64])\n", + "tensor([[-1.5730e-02, -4.1161e-02, 3.0593e-02, ..., 3.8630e-01,\n", + " 3.2884e-01, 3.6067e-01],\n", + " [-2.8613e+01, -5.5872e+00, 2.9385e+01, ..., 3.8782e+01,\n", + " 9.6901e+01, 9.8470e+01],\n", + " [ 3.3027e+00, 1.8276e-01, -1.8497e+00, ..., -4.4052e+01,\n", + " -2.0010e+01, -2.9788e+01],\n", + " ...,\n", + " [-7.6471e-02, -1.8892e-01, 3.6430e-01, ..., -2.7493e-01,\n", + " 5.7017e-01, -1.5986e-01],\n", + " [ 2.5780e+00, -1.8153e+00, 2.5088e+00, ..., -1.0776e+01,\n", + " 6.2167e-01, 8.3755e-01],\n", + " [-6.8324e-02, 1.7568e-01, -3.2311e-01, ..., 3.1202e+00,\n", + " -2.6652e-01, -1.1917e+00]])\n", + "(24, 64, 12)\n", + "[[-1.5729919e-02 -4.1160699e-02 3.0592799e-02 ... 
3.8629669e-01\n", + " 3.2884139e-01 3.6066702e-01]\n", + " [-2.8613457e+01 -5.5871558e+00 2.9384506e+01 ... 3.8781765e+01\n", + " 9.6900581e+01 9.8469597e+01]\n", + " [ 3.3027239e+00 1.8275940e-01 -1.8496730e+00 ... -4.4052174e+01\n", + " -2.0009745e+01 -2.9787930e+01]\n", + " ...\n", + " [-7.6470733e-02 -1.8891659e-01 3.6430117e-01 ... -2.7492592e-01\n", + " 5.7017130e-01 -1.5985624e-01]\n", + " [ 2.5780225e+00 -1.8152566e+00 2.5087588e+00 ... -1.0776262e+01\n", + " 6.2166649e-01 8.3755457e-01]\n", + " [-6.8324409e-02 1.7568478e-01 -3.2310838e-01 ... 3.1202292e+00\n", + " -2.6652411e-01 -1.1917179e+00]]\n" + ] + } + ], + "source": [ + "# value states: torch.Size([1, 12, 24, 64])\n", + "value_states=torch.from_numpy(hf_kproj_grads_post_rotary).permute(2,0,1).unsqueeze(0)\n", + "key_states = value_states\n", + "cos, sin = rotary_emb(value_states, seq_len=kv_seq_len)\n", + "# query_states: torch.Size([1, 12, 24, 64])\n", + "# key_states: torch.Size([1, 12, 24, 64])\n", + "# position_ids: torch.Size([1, 24])\n", + "# tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", + "# 18, 19, 20, 21, 22, 23]], device='cuda:0')\n", + "query_states = torch.zeros([1, 12, 24, 64])\n", + "position_ids = torch.arange(24).unsqueeze(0)\n", + "query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)\n", + "key_states = key_states.squeeze()\n", + "print(key_states.shape)\n", + "print(key_states[0,:,:])\n", + "print(hf_kproj_grads_before_rotary.shape)\n", + "print(hf_kproj_grads_before_rotary[:,:,0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", + " 18, 19, 20, 21, 22, 23]], device='cuda:0')" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "torch.arange(24).unsqueeze(0).cuda()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([1, 12, 24, 24])\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/alignment_tests.ipynb Cell 6\u001b[0m line \u001b[0;36m1\n\u001b[1;32m 17\u001b[0m ff_qkps \u001b[39m=\u001b[39m ff_qk_prods_softmax[:,:,head_idx]\n\u001b[1;32m 18\u001b[0m \u001b[39massert\u001b[39;00m(np\u001b[39m.\u001b[39mallclose(ff_qkps, hf_qkps, atol\u001b[39m=\u001b[39m\u001b[39m1e-5\u001b[39m))\n\u001b[0;32m---> 19\u001b[0m \u001b[39massert\u001b[39;00m(\u001b[39mFalse\u001b[39;00m)\n\u001b[1;32m 21\u001b[0m hf_value_states \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39mload(hf_value_states)\u001b[39m#.squeeze().T.detach().cpu().numpy()\u001b[39;00m\n\u001b[1;32m 22\u001b[0m \u001b[39mprint\u001b[39m(hf_value_states\u001b[39m.\u001b[39mshape)\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "layer_num = 11\n", + "hf_qk_prods_softmax = f\"{hf_weight_base_path}/fwd_step_0_layers.11.self_attn.qk_prods_softmax\"\n", + "ff_qk_prods_softmax = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax\"\n", + "\n", + 
"hf_value_states = f\"{hf_weight_base_path}/fwd_step_0_layers.11.self_attn.value_states\"\n", + "\n", + "hf_qk_prods_softmax = torch.load(hf_qk_prods_softmax)#.squeeze().T.detach().cpu().numpy()\n", + "ff_qk_prods_softmax = np.loadtxt(ff_qk_prods_softmax, delimiter=',').reshape((24, 24, 12), order = 'F')\n", + "print(hf_qk_prods_softmax.shape)\n", + "#print(ff_qk_prods_softmax.shape)\n", + "#print(hf_qk_prods_softmax[:,:,0])\n", + "#print()\n", + "#print(ff_qk_prods_softmax[:,:,0])\n", + "\n", + "for head_idx in range(12):\n", + " hf_qkps = hf_qk_prods_softmax.squeeze()[head_idx, :, :].detach().cpu().numpy()\n", + " ff_qkps = ff_qk_prods_softmax[:,:,head_idx]\n", + " assert(np.allclose(ff_qkps, hf_qkps, atol=1e-5))\n", + "\n", + "\n", + "hf_value_states = torch.load(hf_value_states)#.squeeze().T.detach().cpu().numpy()\n", + "print(hf_value_states.shape)\n", + "attn_output = torch.matmul(hf_qk_prods_softmax, hf_value_states)\n", + "print()\n", + "print(attn_output.shape)\n", + "print(attn_output.transpose(1, 2).contiguous().shape)\n", + "print(\"Hf attn heads\")\n", + "print(torch.load(\"/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_layers.11.self_attn.o_proj.input_0\").shape)\n", + "\n", + "print(\"Attn heads grads:\")\n", + "hf_attn_heads_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.o_proj.gi_0\"\n", + "print(torch.load(hf_attn_heads_grads).shape)\n", + "print(\"HF value grads:\")\n", + "vproj_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.v_proj.gi_0\"\n", + "print(torch.load(vproj_grads).shape)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([2, 3, 4])\n", + "torch.Size([4, 3, 2])\n" + ] + } + ], + "source": [ + "a = torch.randn(2,3,4)\n", + "print(a.shape)\n", + "print(a.T.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([[[ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000,\n", + " 0.0000],\n", + " [ 27.8890, -21.5089, 45.8214, ..., 5.4010, -10.8787,\n", + " 39.7619],\n", + " [ 19.2197, 27.4681, -68.7141, ..., 102.3280, 66.7925,\n", + " -160.8711],\n", + " ...,\n", + " [ 63.9532, 17.4273, -29.4416, ..., 101.6105, 67.5937,\n", + " -198.4432],\n", + " [ 31.2799, 13.0724, -44.7179, ..., 132.4898, 42.3135,\n", + " -194.4037],\n", + " [ 42.3453, -16.2693, -55.7386, ..., 90.5921, 52.2032,\n", + " -124.1802]]], device='cuda:0')\n", + "tensor([[[-1.1845e+06, -6.7460e+05, 7.4494e+05, ..., -9.1441e+05,\n", + " -1.4912e+05, 3.5769e+06],\n", + " [-7.3920e+01, -7.9389e+01, 1.1027e+02, ..., -7.3020e+01,\n", + " -2.3540e+01, 3.4587e+02],\n", + " [-5.3885e+01, -1.7373e+01, -1.9780e+01, ..., 4.1291e+01,\n", + " 5.5099e+01, 5.5910e+01],\n", + " ...,\n", + " [-2.1948e+01, -3.2109e+01, 2.8364e+01, ..., 3.4321e+01,\n", + " 5.0713e+01, 5.6592e+01],\n", + " [-4.4339e+01, -2.8339e+01, 1.4070e+01, ..., 6.2797e+01,\n", + " 3.0760e+01, 6.1743e+01],\n", + " [-1.6287e+01, -5.0413e+01, -1.9940e+01, ..., 4.3766e+01,\n", + " 4.7833e+01, 4.7295e+01]]], device='cuda:0')\n" + ] + } + ], + "source": [ + "a = \"./hf_peft_tensors/bwd_step_0_layers.11.post_attention_layernorm.gi_0\"\n", + "b = \"./hf_peft_tensors/bwd_step_0_layers.11.self_attn.o_proj.go_0\"\n", + "a = torch.load(a)\n", + "b = torch.load(b)\n", + "print(a)\n", + "print(b)" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": {}, + "outputs": [], + "source": [ + "# # Manual matmul checks\n", + "# ff_w2_grad_out_tensor = np.loadtxt(ff_BWD_w2_out, delimiter=',').reshape((768,128), order='F')\n", + "# ff_w2_weight_tensor = np.loadtxt(ff_w2_weight, delimiter=',').reshape((3072,768), order='F')\n", + "# ff_w2_gradin_tensor = np.matmul(ff_w2_weight_tensor, ff_w2_grad_out_tensor).reshape((3072,128), order='F')\n", + "\n", + "# ff_lora_gradout_tensor = np.loadtxt(ff_BWD_lora_B_out, delimiter=',').reshape((768,128), order='F')\n", + "# ff_lora_A_weight_tensor = np.loadtxt(ff_lora_A_weight, delimiter=',').reshape((3072,16), order='F')\n", + "# ff_lora_B_weight_tensor = np.loadtxt(ff_lora_B_weight, delimiter=',').reshape((16,768), order='F')\n", + "# ff_lora_int_grad_tensor = np.matmul(ff_lora_B_weight_tensor, ff_lora_gradout_tensor)\n", + "# ff_lora_gradint_tensor = np.matmul(ff_lora_A_weight_tensor, ff_lora_int_grad_tensor)\n", + "\n", + "# # ff_w2_gradin_tensor = ff_w2_gradin_tensor + ff_lora_gradint_tensor\n", + "# #print(ff_w2_gradin_tensor[:,:24])\n", + "# print(\"calculated LORA grad in\")\n", + "# print(ff_lora_gradint_tensor[:,:24])\n", + "# # ff_BWD_w2_in_pre_tensor = np.loadtxt(ff_BWD_w2_in_pre, delimiter=',').reshape((3072,128), order='F')\n", + "# ff_BWD_lora_A_in_tensor = np.loadtxt(ff_BWD_lora_A_in, delimiter=',').reshape((3072,128), order='F')\n", + "# print(\"FlexFlow LORA grad in\")\n", + "# print(ff_BWD_lora_A_in_tensor[:,:24])\n", + "# # print(ff_BWD_w2_in_pre_tensor[:,:24])\n", + "# print(\"HF lora grad in\")\n", + "# print(torch.load(hf_BWD_loraA_in).squeeze().T.detach().cpu().numpy())\n", + "# compare_tensors(hf_BWD_loraA_in, ff_BWD_lora_A_in)\n", + "\n", + "# simulate act_fn_grad\n", + "# ssm_out_grad_tensor = np.loadtxt(ff_BWD_ssm_out, delimiter=',').reshape((3072,128), order='F')\n", + "# w3_fwd_out_tensor = np.loadtxt(ff_FWD_w3_out, delimiter=',').reshape((3072,128), order='F')\n", + "# #print(ssm_out_grad_tensor.shape, w3_fwd_out_tensor.shape)\n", + "# act_fn_out_check = np.multiply(ssm_out_grad_tensor, w3_fwd_out_tensor)\n", + "# print(\"simulated act fn out - simulated\")\n", + "# print(act_fn_out_check[:,:24])\n", + "# print(\"simulated act fn out - HF\")\n", + "# print(torch.load(hf_BWD_act_fn_out).detach().cpu().numpy().squeeze().T)\n", + "\n", + "# Simulated w3_grad\n", + "# ssm_out_grad_tensor = np.loadtxt(ff_BWD_ssm_out, delimiter=',').reshape((3072,128), order='F')[:,:24]\n", + "# act_fnc_out_tensor = np.loadtxt(ff_FWD_act_fnc_out, delimiter=',').reshape((3072,24), order='F')\n", + "# w3_out_gard_check = np.multiply(ssm_out_grad_tensor, act_fnc_out_tensor)\n", + "# print(\"simulated w3 out - FF\")\n", + "# print(w3_out_gard_check)\n", + "# ff_BWD_w3_out_tensor = np.loadtxt(ff_BWD_w3_out, delimiter=',').reshape((3072,128), order='F')\n", + "# hf_BWD_w3_out_tensor = torch.load(hf_BWD_w3_out).detach().cpu().numpy().squeeze().T\n", + "# print(\"w3 out, FF\")\n", + "# print(ff_BWD_w3_out_tensor[:,:24])\n", + "# print(\"w3 out, HF\")\n", + "# print(hf_BWD_w3_out_tensor)\n", + "\n", + "# print_tensors(hf_BWD_w3_out, ff_BWD_w3_out, \"w3 out\")\n", + "# assert False\n", + "# print()\n", + "# print()\n", + "# print_tensors(hf_BWD_w3_out, ff_BWD_w3_out, \"w3 out\")\n", + "# print_tensors(hf_BWD_w3_in, ff_BWD_w3_in, \"w3 in\")\n", + "# print_tensors(hf_BWD_w1_out, ff_BWD_w1_out, \"w1 out\")\n", + "# print_tensors(hf_BWD_w1_in, ff_BWD_w1_in, \"w1 in\")\n", + "# print_tensors(hf_BWD_ffn_norm_out, ff_BWD_ffn_norm_out, \"ffn norm out\")\n", + "# print_tensors(hf_BWD_ffn_norm_in, 
ff_BWD_ffn_norm_in2, \"ffn norm in\")\n", + "# print()\n", + "# ff_w1_out_tensor = np.loadtxt(ff_BWD_w1_out, delimiter=',').reshape((3072,128), order='F')\n", + "# ff_w1_in_tensor = np.loadtxt(ff_BWD_w1_in, delimiter=',').reshape((768,128), order='F')\n", + "# ff_w1_in_pre_tensor = np.loadtxt(ff_BWD_w1_in_pre, delimiter=',').reshape((768,128), order='F')\n", + "# ff_w1_only_in_tensor = ff_w1_in_tensor - ff_w1_in_pre_tensor\n", + "# ff_w1_weight_tensor = np.loadtxt(ff_w1_weight, delimiter=',').reshape((768,3072), order='F')\n", + "# ff_w1_in_check_tensor = np.matmul(ff_w1_weight_tensor, ff_w1_out_tensor)\n", + "# print(\"W1 in (simulated):\")\n", + "# print(ff_w1_in_check_tensor[:,:24])\n", + "# print(\"W1 in (FF):\")\n", + "# print(ff_w1_only_in_tensor[:,:24])\n", + "# print(\"W1 in (HF):\")\n", + "# print(torch.load(hf_BWD_w1_in).squeeze().T.detach().cpu().numpy())\n", + "\n", + "# compare_tensors_difference(hf_BWD_w2_in, ff_BWD_w2_in, ff_BWD_lora_A_in)\n", + "# compare_tensors(hf_BWD_w3_out, ff_BWD_w3_out)\n", + "#compare_hf_tensors(hf_BWD_ffn_norm_in, hf_BWD_attn_out_out)\n", + "# print(\"\\nw1 out:\")\n", + "\n", + "# print_tensors(hf_BWD_w1_out, ff_BWD_w1_out)\n", + "# print(\"\\nW1 in\\n\")\n", + "# print_tensors(hf_BWD_w1_in, ff_BWD_w1_in)\n", + "# compare_tensors(hf_BWD_w1_in, ff_BWD_w1_in)\n", + "# print(\"\\nffn_norm\")\n", + "# compare_tensors(hf_BWD_ffn_norm_out, ff_BWD_ffn_norm_out)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "for layer_num in range(12):\n", + " hf_lora_A_weight_fp = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_A.default.weight\"\n", + " ff_lora_A_weight_fp = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_A\"\n", + " compare_tensors(hf_lora_A_weight_fp, ff_lora_A_weight_fp, tolerance=1e-5)\n", + " hf_lora_B_weight_fp = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_B.default.weight\"\n", + " ff_lora_B_weight_fp = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_B\"\n", + " compare_tensors(hf_lora_B_weight_fp, ff_lora_B_weight_fp, tolerance=1e-5)\n", + " hf_w1_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.gate_proj.weight\"\n", + " ff_w1_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_weight_0\"\n", + " compare_tensors(hf_w1_weight, ff_w1_weight, tolerance=1e-5)\n", + " hf_w3_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.up_proj.weight\"\n", + " ff_w3_weight 
= f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w3_shard-id_0_weight_0\"\n", + " compare_tensors(hf_w3_weight, ff_w3_weight, tolerance=1e-5)\n", + " hf_w2_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.weight\"\n", + " ff_w2_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_weight_0\"\n", + " compare_tensors(hf_w2_weight, ff_w2_weight, tolerance=1e-5)\n", + " " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tests/peft/qk_prods_alignment.ipynb b/tests/peft/qk_prods_alignment.ipynb new file mode 100644 index 0000000000..c2a3644b3d --- /dev/null +++ b/tests/peft/qk_prods_alignment.ipynb @@ -0,0 +1,24 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 20289009b26042bcd9527fc8b696e22c2e28ef75 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 9 Jan 2024 22:45:17 -0500 Subject: [PATCH 122/198] Fuse bias + relu in OPT (#1271) * fuse bias and relu in opt * fix --- include/flexflow/model.h | 3 ++- python/flexflow/serve/models/opt.py | 5 ++--- src/ops/kernels/linear_kernels.cu | 22 ++++++++++++++++++++++ src/runtime/model.cc | 27 ++++++++++++++++++++++++--- 4 files changed, 50 insertions(+), 7 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 7232cb3f0b..851fac94d2 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -1114,7 +1114,7 @@ class FFModel { std::unordered_map>> get_bwd_edge_map() const; - // Internal funcitons + // Internal functions Legion::IndexSpace get_or_create_task_is(ParallelConfig const &pc); Legion::IndexSpace get_or_create_task_is(MachineView const &view); Legion::IndexSpace get_or_create_task_is(Legion::Domain const &domain); @@ -1122,6 +1122,7 @@ class FFModel { Legion::IndexSpace get_task_is(Legion::Domain const &domain) const; Legion::IndexSpace get_task_is(ParallelConfig const &pc) const; Legion::IndexSpace get_task_is(MachineView const &view) const; + bool is_mlp_block(int layer_idx) const; void create_operators_from_layers(); Op *create_operator_from_layer(Layer *layer, std::vector const &inputs); diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index 8250c63a9a..4b0b613cca 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -216,13 +216,12 @@ def build_model(self, max_tokens_per_batch): fc1 = ffmodel.dense( ff_norm, self.opt_config.ffn_dim, - ActiMode.AC_MODE_NONE, + ActiMode.AC_MODE_RELU, True, name=f"layers_{i}_fc1", ) - activation = ffmodel.relu(fc1, False) fc2 = ffmodel.dense( - activation, + fc1, self.opt_config.hidden_size, ActiMode.AC_MODE_NONE, True, diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 
8cf5db3f11..51b5e1f6f5 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -306,6 +306,18 @@ Parameter* Linear::get_parameter(int index) */ namespace Internal { +template +__global__ void AddBiasWithReLU(DT *output_ptr, + DT const *bias_ptr, + int out_dim, + int batch_size) { + CUDA_KERNEL_LOOP(i, out_dim * batch_size) { + int bias_idx = i % out_dim; + DT value = output_ptr[i] + bias_ptr[bias_idx]; + output_ptr[i] = ((float)value > 0.0f) ? value : (DT)0.0f; + } +} + template void forward_kernel(LinearMeta const *m, void const *input_ptr, @@ -398,6 +410,16 @@ void forward_kernel(LinearMeta const *m, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); // use_bias = True if (bias_ptr != NULL) { + // fuse bias and relu + if (m->activation == AC_MODE_RELU) { + int parallelism = out_dim * batch_size; + AddBiasWithReLU<<>>( + static_cast
<DT *>(output_ptr), + static_cast<DT const *>
(bias_ptr), + out_dim, + batch_size); + return; + } checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 2ee4d4bc08..2048a2c6a2 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3249,6 +3249,27 @@ Op *FFModel::create_operator_from_layer( } } +bool FFModel::is_mlp_block(int layer_idx) const { + auto const &l = layers[layer_idx]; + if (l->op_type != OP_LINEAR) { + return false; + } + // standard opt relu + if (layer_idx >= 2 && layers[layer_idx - 1]->op_type == OP_RELU && + layers[layer_idx - 2]->op_type == OP_LINEAR) { + return true; + } + // mlp layer with relu embedded in first dense layer + long long value; + l->get_int_property("activation", value); + ActiMode activation = (ActiMode)value; + if (layer_idx >= 1 && layers[layer_idx - 1]->op_type == OP_LINEAR && + activation == AC_MODE_RELU) { + return true; + } + return false; +} + void FFModel::create_operators_from_layers() { std::map tensors_to_parallel_tensors; // for (auto const &l : layers) { @@ -3293,9 +3314,9 @@ void FFModel::create_operators_from_layers() { config.tensor_parallelism_degree > 1 && (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || - (l->op_type == OP_LINEAR && layer_idx >= 2 && - layers[layer_idx - 1]->op_type == OP_RELU && - layers[layer_idx - 2]->op_type == OP_LINEAR) || + // mlp layer + is_mlp_block(layer_idx) || + // llama mlp layer (l->op_type == OP_LINEAR && layer_idx >= 2 && layers[layer_idx - 1]->op_type == OP_GELU && layers[layer_idx - 2]->op_type == OP_LINEAR) || From 3bbde567361eb077e2178a38fa756eb199f9a8e2 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 10 Jan 2024 15:49:06 +0000 Subject: [PATCH 123/198] fix --- src/runtime/model.cc | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 2048a2c6a2..812a432ef1 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3251,21 +3251,21 @@ Op *FFModel::create_operator_from_layer( bool FFModel::is_mlp_block(int layer_idx) const { auto const &l = layers[layer_idx]; - if (l->op_type != OP_LINEAR) { - return false; - } // standard opt relu - if (layer_idx >= 2 && layers[layer_idx - 1]->op_type == OP_RELU && + if (l->op_type == OP_LINEAR && layer_idx >= 2 && + layers[layer_idx - 1]->op_type == OP_RELU && layers[layer_idx - 2]->op_type == OP_LINEAR) { return true; } // mlp layer with relu embedded in first dense layer - long long value; - l->get_int_property("activation", value); - ActiMode activation = (ActiMode)value; - if (layer_idx >= 1 && layers[layer_idx - 1]->op_type == OP_LINEAR && - activation == AC_MODE_RELU) { - return true; + if (l->op_type == OP_LINEAR && layer_idx >= 1 && + layers[layer_idx - 1]->op_type == OP_LINEAR) { + long long value; + layers[layer_idx - 1]->get_int_property("activation", value); + ActiMode activation = (ActiMode)value; + if (activation == AC_MODE_RELU) { + return true; + } } return false; } From 2ebd7f4d40661303f7097334618d52297e479f90 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 16 Jan 2024 21:44:24 -0500 Subject: [PATCH 124/198] fix --- include/flexflow/ops/kernels/linear_kernels.h | 9 ++ src/ops/kernels/linear_kernels.cu | 107 ++++++++++++++++++ src/ops/linear.cc | 29 ++--- 3 files changed, 127 insertions(+), 18 deletions(-) diff --git a/include/flexflow/ops/kernels/linear_kernels.h b/include/flexflow/ops/kernels/linear_kernels.h index ff33755780..bcce9a947a 100644 --- 
a/include/flexflow/ops/kernels/linear_kernels.h +++ b/include/flexflow/ops/kernels/linear_kernels.h @@ -50,6 +50,15 @@ void forward_kernel_wrapper(LinearMeta const *m, int in_dim, int out_dim, int batch_size); +void inference_kernel_wrapper(LinearMeta *m, + BatchConfig const *bc, + void const *input_ptr, + void *output_ptr, + void const *filter_ptr, + void const *bias_ptr, + int in_dim, + int out_dim, + int batch_size); void peft_bwd_kernel_wrapper(LinearMeta const *m, void *input_grad_ptr, void *output_grad_ptr, diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 51b5e1f6f5..5306be9bdf 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -170,6 +170,113 @@ void forward_kernel_wrapper(LinearMeta const *m, } } +void inference_kernel_wrapper(LinearMeta *m, + BatchConfig const *bc, + void const *input_ptr, + void *output_ptr, + void const *weight_ptr, + void const *bias_ptr, + int in_dim, + int out_dim, + int batch_size) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + if (m->input_type[0] == DT_FLOAT) { + Internal::forward_kernel(m, + input_ptr, + output_ptr, + weight_ptr, + bias_ptr, + in_dim, + out_dim, + batch_size, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::forward_kernel(m, + input_ptr, + output_ptr, + weight_ptr, + bias_ptr, + in_dim, + out_dim, + batch_size, + stream); + } + + + if (m->activation == AC_MODE_RELU || m->activation == AC_MODE_SIGMOID) { + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int first_token_offset = bc->requestsInfo[i].num_tokens_in_batch; + if (bc->requestsInfo[i].peft_bwd) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->output_activation_buffer = allocator->allocate_instance_untyped( + data_type_size(m->output_type[0]) * num_peft_tokens * out_dim); + // copy output activation + if (m->output_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->output_activation_buffer, + static_cast(output_ptr) + first_token_offset * out_dim, + data_type_size(m->output_type[0]) * num_peft_tokens * out_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (m->output_type[0] == DT_HALF) { + checkCUDA(cudaMemcpyAsync( + m->output_activation_buffer, + static_cast(output_ptr) + first_token_offset * out_dim, + data_type_size(m->output_type[0]) * num_peft_tokens * out_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + 
cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("%s [Linear] inference time = %.2lfms\n", m->op_name, elapsed); + } +} + void peft_bwd_kernel_wrapper(LinearMeta const *m, void *input_grad_ptr, void *output_grad_ptr, diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 15789ae2e9..e23a6f48ca 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -652,14 +652,15 @@ void Linear::inference_task(Task const *task, runtime); assert(bias.domain.get_volume() == static_cast(out_dim)); } - forward_kernel_wrapper(m, - input.ptr, - output.ptr, - weight.ptr, - bias.ptr, - in_dim, - out_dim, - batch_size); + inference_kernel_wrapper(m, + bc, + input.ptr, + output.ptr, + weight.ptr, + bias.ptr, + in_dim, + out_dim, + batch_size); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; @@ -719,14 +720,6 @@ FutureMap Linear::peft_bwd(FFModel const &ff, weights[0]->region, ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); launcher.add_field(2, FID_DATA); - if (use_bias) { - launcher.add_region_requirement(RegionRequirement(weights[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[1]->region)); - launcher.add_field(3, FID_DATA); - } return runtime->execute_index_space(ctx, launcher); } @@ -741,8 +734,8 @@ void Linear::peft_bwd_task(Task const *task, if (bc->num_active_peft_tokens() == 0) { return; } - assert(regions.size() == (3 + static_cast(m->use_bias))); - assert(task->regions.size() == (3 + static_cast(m->use_bias))); + assert(regions.size() == 3); + assert(task->regions.size() == 3 ); if (m->quantization_type == DT_NONE) { assert(m->input_type[0] == m->weight_type[0]); } From 1b2018b445fe49ea5cbb59fb5dcb30ad814340c8 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 16 Jan 2024 21:59:11 -0500 Subject: [PATCH 125/198] fix --- src/ops/add_bias_residual_layer_norm.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index 88a34b7eb5..a2b426ec0d 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -967,7 +967,7 @@ void AddBiasResidualLayerNorm::peft_bwd_task( assert(task->regions.size() == regions.size()); AddBiasResidualLayerNormMeta *m = *((AddBiasResidualLayerNormMeta **)task->local_args); - assert(regions.size() == 4 + m->elementwise_affine); + assert(regions.size() == 3 + m->elementwise_affine); int region_idx = 0, task_region_idx = 0; @@ -995,7 +995,6 @@ void AddBiasResidualLayerNorm::peft_bwd_task( GenericTensorAccessorR gamma; if (m->elementwise_affine) { - assert(m->use_bias == (regions.size() == 6)); gamma = helperGetGenericTensorAccessorRO(m->output_type[0], regions[region_idx++], task->regions[task_region_idx++], From bc61e9ddbe33ced6574fbf91fafc26212b8a6f56 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 26 Jan 2024 20:39:06 -0500 Subject: [PATCH 126/198] Peft alignment & debugging tools (#1288) * Revert "several hacks for performance measurement; some of the changes should be reverted" This reverts commit b9c392631b596db788ead74fe76d08d80a487b7c. * backup * backup * updates * update * backup * backup * backup * fix * cleanup * fix * fix * fix * update * simplify tensor names * fix * fixes and updates * fixes * fix * cleanup * . 
* restore softmax * cleanup * update alignment scripts * newline --- .gitignore | 3 +- include/flexflow/operator.h | 26 +- src/ops/add_bias_residual_layer_norm.cu | 84 +- src/ops/argmax.cc | 5 - src/ops/inc_multihead_self_attention.cu | 102 + src/ops/kernels/linear_kernels.cu | 21 +- src/ops/kernels/residual_rms_norm_kernels.cu | 45 +- src/ops/kernels/rms_norm_kernels.cu | 76 +- src/ops/layer_norm.cu | 69 +- src/ops/linear.cc | 20 +- src/ops/lora_linear.cc | 34 +- src/ops/residual_layer_norm.cu | 188 +- tests/peft/alignment/align_test_utils.py | 240 ++ .../alignment/llama_alignment_tests.ipynb | 2039 +++++++++++++++++ .../peft/alignment/opt_alignment_tests.ipynb | 450 ++++ tests/peft/alignment_tests.ipynb | 1427 ------------ tests/peft/hf_finetune.py | 70 +- tests/peft/hf_serve.py | 8 - tests/peft/qk_prods_alignment.ipynb | 24 - 19 files changed, 3161 insertions(+), 1770 deletions(-) create mode 100644 tests/peft/alignment/align_test_utils.py create mode 100644 tests/peft/alignment/llama_alignment_tests.ipynb create mode 100644 tests/peft/alignment/opt_alignment_tests.ipynb delete mode 100644 tests/peft/alignment_tests.ipynb delete mode 100644 tests/peft/qk_prods_alignment.ipynb diff --git a/.gitignore b/.gitignore index 0579eb5a74..23da3c5899 100644 --- a/.gitignore +++ b/.gitignore @@ -189,4 +189,5 @@ python/flexflow/version.txt inference_tensors hf_peft_tensors -Untitled-1.ipynb \ No newline at end of file +Untitled-1.ipynb +Untitled-2.ipynb diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index e3f28756ec..2dfba77b77 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -267,7 +267,7 @@ class Op { bool fwd_pass = true, bool before_kernel = false) { // Check if output directory exists, and create it if it does not - char const *folder_path = "./inference_tensors"; + char const *folder_path = "./inference_tensors/"; struct stat st = {0}; if (stat(folder_path, &st) == -1) { // Directory does not exist, create it @@ -275,20 +275,26 @@ class Op { } // output base filepath, shared by all tensors from the same operator std::string op_name_without_uid = get_op_name_without_uid(m); - std::string base_filepath = - "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + - (fwd_pass ? "_decoding-step_" : "_bwd-step_") + - (fwd_pass ? std::to_string(m->decoding_step) - : std::to_string(m->bwd_step)) + - "_layer-num_" + std::to_string(m->layer_guid.transformer_layer_id) + - "_layer-name_" + op_name_without_uid + "_shard-id_" + - std::to_string(shard_id); + std::cout << (fwd_pass ? 
"INF " : "BWD ") << op_name_without_uid + << std::endl; + std::string base_filepath = std::string(folder_path); + if (m->layer_guid.model_id > 0) { + base_filepath += "model_" + std::to_string(m->layer_guid.model_id) + "_"; + } + if (fwd_pass) { + base_filepath += "fwd_step_" + std::to_string(m->decoding_step); + } else { + base_filepath += "bwd_step_" + std::to_string(m->bwd_step); + } + base_filepath += "_layers_" + + std::to_string(m->layer_guid.transformer_layer_id) + "_" + + op_name_without_uid + "_shard_" + std::to_string(shard_id); if (before_kernel) { base_filepath += "_pre"; } // save batch config, if passed if (bc != nullptr) { - bc->save_to_file(base_filepath + "_batch-config"); + bc->save_to_file(base_filepath + "_batch_config"); } // save all inputs for (int i = 0; i < input_tensors.size(); i++) { diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu index ab017ed46c..505806a2b9 100644 --- a/src/ops/add_bias_residual_layer_norm.cu +++ b/src/ops/add_bias_residual_layer_norm.cu @@ -91,25 +91,6 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { return val; } -template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { - int const lid = threadIdx.x % C10_WARP_SIZE; - int const wid = threadIdx.x / C10_WARP_SIZE; - val = WarpReduceSum(val); - __syncthreads(); - if (lid == 0) { - shared[wid] = val; - } - __syncthreads(); - val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) - ? shared[lid] - : T(0); - if (wid == 0) { - val = WarpReduceSum(val); - } - return val; -} - template __global__ void LayerNormFusedForwardKernel(int64_t N, int64_t attn_bias_dim, @@ -128,20 +109,17 @@ __global__ void LayerNormFusedForwardKernel(int64_t N, const int64_t i = blockIdx.x; float sum1 = 0.0f; float sum2 = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const int64_t bias_idx = index % attn_bias_dim; X[index] = input_ptr[index] + attn_bias_ptr[bias_idx] + residual_ptr[index]; sum1 += static_cast(X[index]); sum2 += static_cast(X[index]) * static_cast(X[index]); } - if (threadIdx.x < kCUDABlockReduceNumThreads) { - sum1 = BlockReduceSum( - sum1, m_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - sum2 = BlockReduceSum( - sum2, v_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - } + + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); + if (threadIdx.x == 0) { float const scale = float(1) / static_cast(N); sum1 *= scale; @@ -153,7 +131,7 @@ __global__ void LayerNormFusedForwardKernel(int64_t N, __syncthreads(); using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const T_ACC gamma_v = gamma == nullptr ? 
T_ACC(1) : static_cast(gamma[j]); @@ -179,30 +157,22 @@ void AddBiasResidualLayerNorm::inference_kernel( T const *gamma_ptr, T const *beta_ptr, cudaStream_t stream) { - - std::pair kernel1_parallelism = - std::make_pair(m->effective_batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->effective_batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); - LayerNormFusedForwardKernel - <<>>(m->effective_num_elements, - attn_bias_dim, - m->eps, - input_ptr, - attn_bias_ptr, - residual_ptr, - added_output_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_ptr, - beta_ptr, - output_ptr); + <<effective_batch_size, + std::min(CUDA_NUM_THREADS, (int)m->effective_num_elements), + 0, + stream>>>(m->effective_num_elements, + attn_bias_dim, + m->eps, + input_ptr, + attn_bias_ptr, + residual_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + beta_ptr, + output_ptr); } /*static*/ @@ -242,20 +212,17 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( } assert(num_peft_requests <= 1); - int tokens_previous_requests = 0; for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } // Skip non-PEFT requests if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { - // FIXME: use the new approach to computing token offset - tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int in_dim = - added_output.domain.hi()[0] - added_output.domain.lo()[0] + 1; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { MemoryAllocator *allocator = m->handle.peft_activation_allocator; m->input_activation = allocator->allocate_instance_untyped( @@ -264,14 +231,14 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( if (m->input_type[0] == DT_FLOAT) { checkCUDA(cudaMemcpyAsync( m->input_activation, - added_output.get_float_ptr() + tokens_previous_requests * in_dim, + added_output.get_float_ptr() + first_token_offset * in_dim, data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, cudaMemcpyDeviceToDevice, stream)); } else if (m->input_type[0] == DT_HALF) { checkCUDA(cudaMemcpyAsync( m->input_activation, - added_output.get_half_ptr() + tokens_previous_requests * in_dim, + added_output.get_half_ptr() + first_token_offset * in_dim, data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, cudaMemcpyDeviceToDevice, stream)); @@ -281,6 +248,7 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( } } } + // inference kernel int attn_bias_dim = attn_bias.domain.hi()[0] - attn_bias.domain.lo()[0] + 1; int residual_volume = residual.domain.get_volume(); diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index dd0e2bb822..cabb8b204f 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -392,11 +392,6 @@ InferenceResult GenericTensorAccessorW parent; int batch_size = bc->num_active_infr_tokens(); ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size); - // Note that we free activation allocator here since argmax is the - // last operator in forward - if (m->handle.peft_activation_allocator != nullptr) { - m->handle.peft_activation_allocator->free_all(); - } InferenceResult ir; if 
(m->inference_debugging) { assert(task->index_point.get_dim() == 1); diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 452a8c09f6..4c3b0ee4b6 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -907,6 +907,22 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m, m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); } +std::string get_peft_dbg_folder(IncMultiHeadSelfAttentionMeta const *m, + int shard_id) { + std::string op_name_without_uid = + IncMultiHeadSelfAttention::get_op_name_without_uid(m); + char const *folder_path = "./inference_tensors/"; + std::string base_filepath = std::string(folder_path); + if (m->layer_guid.model_id > 0) { + base_filepath += "model_" + std::to_string(m->layer_guid.model_id) + "_"; + } + base_filepath += "bwd_step_" + std::to_string(m->bwd_step); + base_filepath += "_layers_" + + std::to_string(m->layer_guid.transformer_layer_id) + "_" + + op_name_without_uid + "_shard_" + std::to_string(shard_id); + return base_filepath; +} + template void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, @@ -934,6 +950,7 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; // } // #endif + for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; @@ -995,6 +1012,12 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->inference_debugging) { + // save result to file for checking + std::string filename = + get_peft_dbg_folder(m, shard_id) + "_o_proj_in_grad"; + save_tensor(C, m_ * n_, filename.c_str()); + } } // Step 2: compute gradients w.r.t. value { @@ -1046,6 +1069,15 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // save result to file for checking + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + "_v_proj_in_grad"; + save_tensor(C, m_ * n_ * m->num_q_heads, filename.c_str()); + std::string filename2 = + get_peft_dbg_folder(m, shard_id) + "_qk_prods_softmax"; + save_tensor(A, m_ * k_ * m->num_q_heads, filename2.c_str()); + } } // Step 3: compute gradients w.r.t. the qk_prods_softmax tensor { @@ -1094,6 +1126,15 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + "_qk_prods_softmax_grad"; + save_tensor( + C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + std::string filename2 = get_peft_dbg_folder(m, shard_id) + "_vcache"; + save_tensor( + B, m->vProjSize * m->num_q_heads * num_tokens, filename2.c_str()); + } } // Step 4: softmax backpropagation { @@ -1120,6 +1161,15 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, &beta, m->qk_tensor, m->qk_prods)); + + if (m->inference_debugging) { + DT *C = static_cast
(m->qk_prods); + std::string filename = + get_peft_dbg_folder(m, shard_id) + "_qk_prods_softmax_grad_in"; + save_tensor( + C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + } + // TODO: fill all elements above diagonal to force causal attention size_t entries_above_diagonal = num_tokens * (num_tokens - 1) / 2; if (entries_above_diagonal > 0) { @@ -1135,6 +1185,13 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, entries_above_diagonal, DT(0.0f)); } + if (m->inference_debugging) { + DT *C = static_cast
(m->qk_prods); + std::string filename = get_peft_dbg_folder(m, shard_id) + + "_qk_prods_softmax_grad_in_masked"; + save_tensor( + C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + } } // Step 5: compute gradients w.r.t. key { @@ -1189,6 +1246,16 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + "_query_activation"; + save_tensor( + B, m->qProjSize * m->num_q_heads * num_tokens, filename.c_str()); + std::string filename2 = + get_peft_dbg_folder(m, shard_id) + "_devkproj_pre"; + save_tensor( + C, num_tokens * (m->qProjSize * m->num_q_heads), filename2.c_str()); + } } // Step 6: compute gradients w.r.t query { @@ -1239,7 +1306,15 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + "_devQKVPRojArray_pre"; + save_tensor(C, + num_tokens * m->qProjSize * m->num_q_heads * 3, + filename.c_str()); + } } + // Step 7: perform rotary position embeddings (RoPE) bwd { if (*m->apply_rotary_embedding) { @@ -1257,8 +1332,30 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->qProjSize, num_tokens, m->hidden_size); + DT *C = static_cast
(m->devQKVProjArray); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + "_devQKVPRojArray"; + save_tensor(C, + num_tokens * m->qProjSize * m->num_q_heads * 3, + filename.c_str()); + } + } + + // matrix C: gradients for key (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = + static_cast
(m->devQKVProjArray) + + num_tokens * + (m->qProjSize * + m->num_q_heads); // skip over regions reserved for Q gradients + if (m->inference_debugging) { + std::string filename = get_peft_dbg_folder(m, shard_id) + "_devkproj"; + save_tensor( + C, num_tokens * (m->qProjSize * m->num_q_heads), filename.c_str()); } } + // Step 8: compute gradients w.r.t. input { float alpha = 1.0f, beta = 0.0f; @@ -1300,6 +1397,11 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + "_attn_final_grad_in"; + save_tensor(C, num_tokens * m->qSize, filename.c_str()); + } } } } diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 5306be9bdf..a3f5c797de 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -200,17 +200,16 @@ void inference_kernel_wrapper(LinearMeta *m, stream); } else if (m->input_type[0] == DT_HALF) { Internal::forward_kernel(m, - input_ptr, - output_ptr, - weight_ptr, - bias_ptr, - in_dim, - out_dim, - batch_size, - stream); + input_ptr, + output_ptr, + weight_ptr, + bias_ptr, + in_dim, + out_dim, + batch_size, + stream); } - if (m->activation == AC_MODE_RELU || m->activation == AC_MODE_SIGMOID) { // save input activation if needed for PEFT if (bc->num_active_peft_tokens() > 0) { @@ -247,14 +246,14 @@ void inference_kernel_wrapper(LinearMeta *m, if (m->output_type[0] == DT_FLOAT) { checkCUDA(cudaMemcpyAsync( m->output_activation_buffer, - static_cast(output_ptr) + first_token_offset * out_dim, + static_cast(output_ptr) + first_token_offset * out_dim, data_type_size(m->output_type[0]) * num_peft_tokens * out_dim, cudaMemcpyDeviceToDevice, stream)); } else if (m->output_type[0] == DT_HALF) { checkCUDA(cudaMemcpyAsync( m->output_activation_buffer, - static_cast(output_ptr) + first_token_offset * out_dim, + static_cast(output_ptr) + first_token_offset * out_dim, data_type_size(m->output_type[0]) * num_peft_tokens * out_dim, cudaMemcpyDeviceToDevice, stream)); diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index b12d105c1b..664c1ed13b 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -221,7 +221,28 @@ void inference_kernel_wrapper(ResidualRMSNormMeta *m, assert(weight.data_type == output.data_type); assert(residual_output.data_type == output.data_type); - // save input activation if needed for PEFT + if (output.data_type == DT_HALF) { + forward_kernel(m, + input1.get_half_ptr(), + input2.get_half_ptr(), + weight.get_half_ptr(), + residual_output.get_half_ptr(), + output.get_half_ptr(), + stream); + } else if (output.data_type == DT_FLOAT) { + forward_kernel(m, + input1.get_float_ptr(), + input2.get_float_ptr(), + weight.get_float_ptr(), + residual_output.get_float_ptr(), + output.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + // save input activation if needed for PEFT. 
This must be done after the + // forward kernel since that's where we add the residual if (bc->num_active_peft_tokens() > 0) { // Check that we have at most one request that requires peft_bwd int num_peft_requests = 0; @@ -247,7 +268,7 @@ void inference_kernel_wrapper(ResidualRMSNormMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int first_token_offset = bc->requestsInfo[i].num_tokens_in_batch; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { MemoryAllocator *allocator = m->handle.peft_activation_allocator; @@ -275,26 +296,6 @@ void inference_kernel_wrapper(ResidualRMSNormMeta *m, } } - if (output.data_type == DT_HALF) { - forward_kernel(m, - input1.get_half_ptr(), - input2.get_half_ptr(), - weight.get_half_ptr(), - residual_output.get_half_ptr(), - output.get_half_ptr(), - stream); - } else if (output.data_type == DT_FLOAT) { - forward_kernel(m, - input1.get_float_ptr(), - input2.get_float_ptr(), - weight.get_float_ptr(), - residual_output.get_float_ptr(), - output.get_float_ptr(), - stream); - } else { - assert(false && "Unsupported data type"); - } - if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index d0702d651e..b11e954622 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -201,53 +201,53 @@ void inference_kernel_wrapper(RMSNormMeta *m, // save input activation if needed for PEFT if (bc->num_active_peft_tokens() > 0) { - // check that at most one dimension after the first is > 1. TODO(goliaro): - // support case where this condition does not hold - int non_unit_dims_encountered = 0; - for (int i = 1; i < input.domain.get_dim(); i++) { - int dim_i = input.domain.hi()[i] - input.domain.lo()[i] + 1; - if (dim_i > 1) { - non_unit_dims_encountered++; + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; } } - assert(non_unit_dims_encountered <= 1); - - // allocate space for all peft tokens - MemoryAllocator *allocator = m->handle.peft_activation_allocator; - int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; - m->input_activation = allocator->allocate_instance_untyped( - data_type_size(input.data_type) * bc->num_active_peft_tokens() * - in_dim); - - int tokens_previous_requests = 0; + assert(num_peft_requests <= 1); for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } - // Skip non-PEFT requests and PEFT forward-only requests - if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || - !bc->requestsInfo[i].peft_bwd) { - tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - - if (input.data_type == DT_FLOAT) { - checkCUDA(cudaMemcpyAsync( - m->input_activation, - input.get_float_ptr() + tokens_previous_requests * in_dim, - data_type_size(input.data_type) * num_peft_tokens * in_dim, - cudaMemcpyDeviceToDevice, - stream)); - } 
else if (input.data_type == DT_HALF) { - checkCUDA(cudaMemcpyAsync( - m->input_activation, - input.get_half_ptr() + tokens_previous_requests * in_dim, - data_type_size(input.data_type) * num_peft_tokens * in_dim, - cudaMemcpyDeviceToDevice, - stream)); - } else { - assert(false && "unsupport datatype in layernorm"); + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = allocator->allocate_instance_untyped( + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim); + + if (input.data_type == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + input.get_float_ptr() + first_token_offset * in_dim, + data_type_size(input.data_type) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (input.data_type == DT_HALF) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + input.get_half_ptr() + first_token_offset * in_dim, + data_type_size(input.data_type) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } } } } diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index 1d4e94d7d5..bfbb2faae9 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -96,25 +96,6 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { return val; } -template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { - int const lid = threadIdx.x % C10_WARP_SIZE; - int const wid = threadIdx.x / C10_WARP_SIZE; - val = WarpReduceSum(val); - __syncthreads(); - if (lid == 0) { - shared[wid] = val; - } - __syncthreads(); - val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) - ? shared[lid] - : T(0); - if (wid == 0) { - val = WarpReduceSum(val); - } - return val; -} - template __global__ void LayerNormFusedForwardKernel(int64_t N, float eps, @@ -129,18 +110,13 @@ __global__ void LayerNormFusedForwardKernel(int64_t N, const int64_t i = blockIdx.x; float sum1 = 0.0f; float sum2 = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; sum1 += static_cast(X[index]); sum2 += static_cast(X[index]) * static_cast(X[index]); } - if (threadIdx.x < kCUDABlockReduceNumThreads) { - sum1 = BlockReduceSum( - sum1, m_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - sum2 = BlockReduceSum( - sum2, v_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - } + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); if (threadIdx.x == 0) { float const scale = float(1) / static_cast(N); sum1 *= scale; @@ -152,7 +128,7 @@ __global__ void LayerNormFusedForwardKernel(int64_t N, __syncthreads(); using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const T_ACC gamma_v = gamma == nullptr ? 
T_ACC(1) : static_cast(gamma[j]); @@ -173,25 +149,18 @@ void LayerNorm::forward_kernel(LayerNormMeta const *m, T const *beta_ptr, cudaStream_t stream) { - std::pair kernel1_parallelism = - std::make_pair(m->effective_batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->effective_batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); - LayerNormFusedForwardKernel - <<>>(m->effective_num_elements, - m->eps, - in_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_ptr, - beta_ptr, - out_ptr); + <<effective_batch_size, + std::min(CUDA_NUM_THREADS, (int)m->effective_num_elements), + 0, + stream>>>(m->effective_num_elements, + m->eps, + in_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + beta_ptr, + out_ptr); } /*static*/ @@ -276,18 +245,16 @@ void LayerNorm::inference_kernel_wrapper(LayerNormMeta *m, } assert(num_peft_requests <= 1); - int tokens_previous_requests = 0; for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } // Skip non-PEFT requests if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { - // FIXME: use the new approach to computing token offset - tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { MemoryAllocator *allocator = m->handle.peft_activation_allocator; @@ -297,14 +264,14 @@ void LayerNorm::inference_kernel_wrapper(LayerNormMeta *m, if (m->input_type[0] == DT_FLOAT) { checkCUDA(cudaMemcpyAsync( m->input_activation, - input.get_float_ptr() + tokens_previous_requests * in_dim, + input.get_float_ptr() + first_token_offset * in_dim, data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, cudaMemcpyDeviceToDevice, stream)); } else if (m->input_type[0] == DT_HALF) { checkCUDA(cudaMemcpyAsync( m->input_activation, - input.get_half_ptr() + tokens_previous_requests * in_dim, + input.get_half_ptr() + first_token_offset * in_dim, data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, cudaMemcpyDeviceToDevice, stream)); diff --git a/src/ops/linear.cc b/src/ops/linear.cc index e23a6f48ca..209f514f65 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -621,6 +621,8 @@ void Linear::inference_task(Task const *task, ctx, task->regions[0].region.get_index_space()); LinearMeta *m = *((LinearMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + std::string op_name_without_uid = Linear::get_op_name_without_uid(m); + printf("INF %s\n", op_name_without_uid.c_str()); if (bc->num_tokens == 0) { return; } @@ -653,14 +655,14 @@ void Linear::inference_task(Task const *task, assert(bias.domain.get_volume() == static_cast(out_dim)); } inference_kernel_wrapper(m, - bc, - input.ptr, - output.ptr, - weight.ptr, - bias.ptr, - in_dim, - out_dim, - batch_size); + bc, + input.ptr, + output.ptr, + weight.ptr, + bias.ptr, + in_dim, + out_dim, + batch_size); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; @@ -735,7 +737,7 @@ void Linear::peft_bwd_task(Task const *task, return; } assert(regions.size() == 3); - 
assert(task->regions.size() == 3 ); + assert(task->regions.size() == 3); if (m->quantization_type == DT_NONE) { assert(m->input_type[0] == m->weight_type[0]); } diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index e39b444af4..c02bddc5a6 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -473,7 +473,7 @@ void LoraLinear::inference_task(Task const *task, int shard_id = task->index_point.point_data[0]; // Check if output directory exists, and create it if it does not - char const *folder_path = "./inference_tensors"; + char const *folder_path = "./inference_tensors/"; struct stat st = {0}; if (stat(folder_path, &st) == -1) { // Directory does not exist, create it @@ -493,15 +493,18 @@ void LoraLinear::inference_task(Task const *task, lora_layername.substr(0, found + searchString.length()); // output base filepath, shared by all tensors from the same operator - std::string base_filepath = - "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + - "_decoding-step_" + std::to_string(m->decoding_step) + "_layer-num_" + - std::to_string(m->layer_guid.transformer_layer_id) + "_layer-name_" + - lora_layername_substr + "_shard-id_" + std::to_string(shard_id); + std::string base_filepath = std::string(folder_path); + if (m->layer_guid.model_id > 0) { + base_filepath += "model_" + std::to_string(m->layer_guid.model_id) + "_"; + } + base_filepath += "fwd_step_" + std::to_string(m->decoding_step); + base_filepath += + "_layers_" + std::to_string(m->layer_guid.transformer_layer_id) + "_" + + lora_layername_substr + "_shard_" + std::to_string(shard_id); // save batch config, if passed if (bc != nullptr) { - bc->save_to_file(base_filepath + "_batch-config"); + bc->save_to_file(base_filepath + "_batch_config"); } std::string filename = base_filepath + "_input_" + std::to_string(0); @@ -634,7 +637,7 @@ void LoraLinear::peft_bwd_task(Task const *task, int shard_id = task->index_point.point_data[0]; // Check if output directory exists, and create it if it does not - char const *folder_path = "./inference_tensors"; + char const *folder_path = "./inference_tensors/"; struct stat st = {0}; if (stat(folder_path, &st) == -1) { // Directory does not exist, create it @@ -654,15 +657,18 @@ void LoraLinear::peft_bwd_task(Task const *task, lora_layername.substr(0, found + searchString.length()); // output base filepath, shared by all tensors from the same operator - std::string base_filepath = - "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + - "_bwd-step_" + std::to_string(m->bwd_step) + "_layer-num_" + - std::to_string(m->layer_guid.transformer_layer_id) + "_layer-name_" + - lora_layername_substr + "_shard-id_" + std::to_string(shard_id); + std::string base_filepath = std::string(folder_path); + if (m->layer_guid.model_id > 0) { + base_filepath += "model_" + std::to_string(m->layer_guid.model_id) + "_"; + } + base_filepath += "bwd_step_" + std::to_string(m->bwd_step); + base_filepath += + "_layers_" + std::to_string(m->layer_guid.transformer_layer_id) + "_" + + lora_layername_substr + "_shard_" + std::to_string(shard_id); // save batch config, if passed if (bc != nullptr) { - bc->save_to_file(base_filepath + "_batch-config"); + bc->save_to_file(base_filepath + "_batch_config"); } std::string filename = base_filepath + "_input_" + std::to_string(0); diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index 1f87949234..0ba462cde5 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -91,25 +91,6 
@@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { return val; } -template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { - int const lid = threadIdx.x % C10_WARP_SIZE; - int const wid = threadIdx.x / C10_WARP_SIZE; - val = WarpReduceSum(val); - __syncthreads(); - if (lid == 0) { - shared[wid] = val; - } - __syncthreads(); - val = (threadIdx.x < min(blockDim.x, max_num_threads) / C10_WARP_SIZE) - ? shared[lid] - : 0; - if (wid == 0) { - val = WarpReduceSum(val); - } - return val; -} - template __global__ void ResidualLayerNormKernel(int64_t N, float eps, @@ -127,8 +108,7 @@ __global__ void ResidualLayerNormKernel(int64_t N, const int64_t i = blockIdx.x; float sum1 = 0.0f; float sum2 = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const T residual2_val = (residual2_ptr == nullptr) ? T(0) @@ -137,12 +117,10 @@ __global__ void ResidualLayerNormKernel(int64_t N, sum1 += static_cast(X[index]); sum2 += static_cast(X[index]) * static_cast(X[index]); } - if (threadIdx.x < kCUDABlockReduceNumThreads) { - sum1 = BlockReduceSum( - sum1, m_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - sum2 = BlockReduceSum( - sum2, v_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - } + + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); + if (threadIdx.x == 0) { float const scale = float(1) / static_cast(N); sum1 *= scale; @@ -154,7 +132,7 @@ __global__ void ResidualLayerNormKernel(int64_t N, __syncthreads(); using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const T_ACC gamma_v = gamma == nullptr ? 
T_ACC(1) : static_cast(gamma[j]); @@ -178,28 +156,51 @@ void ResidualLayerNorm::inference_kernel(ResidualLayerNormMeta const *m, T const *beta_ptr, cudaStream_t stream) { - std::pair kernel1_parallelism = - std::make_pair(m->effective_batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->effective_batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); - ResidualLayerNormKernel - <<>>(m->effective_num_elements, - m->eps, - input_ptr, - residual1_ptr, - residual2_ptr, - added_output_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_ptr, - beta_ptr, - output_ptr); + <<effective_batch_size, + std::min(CUDA_NUM_THREADS, (int)m->effective_num_elements), + 0, + stream>>>(m->effective_num_elements, + m->eps, + input_ptr, + residual1_ptr, + residual2_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + beta_ptr, + output_ptr); +} +template +void save_inference_tensors(ResidualLayerNormMeta const *m) { + if (m->inference_debugging) { + // save stuff here + std::string op_name_without_uid = + ResidualLayerNorm::get_op_name_without_uid(m); + char const *folder_path = "./inference_tensors/"; + std::string base_filepath = std::string(folder_path); + if (m->layer_guid.model_id > 0) { + base_filepath += "model_" + std::to_string(m->layer_guid.model_id) + "_"; + } + base_filepath += "fwd_step_" + std::to_string(m->decoding_step); + base_filepath += "_layers_" + + std::to_string(m->layer_guid.transformer_layer_id) + "_" + + op_name_without_uid + "_shard_" + std::to_string(0); + + std::string filename1 = base_filepath + "_mean"; + save_tensor(static_cast(m->mean_ptr), + m->effective_batch_size, + filename1.c_str()); + std::string filename2 = base_filepath + "_rstd"; + save_tensor(static_cast(m->rstd_ptr), + m->effective_batch_size, + filename2.c_str()); + std::string filename3 = base_filepath + "_input_activation"; + save_tensor(static_cast(m->input_activation), + m->effective_batch_size * m->effective_num_elements, + filename3.c_str()); + } } /*static*/ @@ -222,6 +223,33 @@ void ResidualLayerNorm::inference_kernel_wrapper( cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } + + if (m->input_type[0] == DT_FLOAT) { + ResidualLayerNorm::inference_kernel( + m, + input.get_float_ptr(), + residual1.get_float_ptr(), + m->use_two_residuals ? residual2.get_float_ptr() : nullptr, + added_output.get_float_ptr(), + output.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_float_ptr() : nullptr, + stream); + } else if (m->input_type[0] == DT_HALF) { + ResidualLayerNorm::inference_kernel( + m, + input.get_half_ptr(), + residual1.get_half_ptr(), + m->use_two_residuals ? residual2.get_half_ptr() : nullptr, + added_output.get_half_ptr(), + output.get_half_ptr(), + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? 
beta.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "unsupport datatype in layernorm"); + } + // save input activation if needed for PEFT if (bc->num_active_peft_tokens() > 0) { // Check that we have at most one request that requires peft_bwd @@ -248,7 +276,7 @@ void ResidualLayerNorm::inference_kernel_wrapper( continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int first_token_offset = bc->requestsInfo[i].num_tokens_in_batch; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { MemoryAllocator *allocator = m->handle.peft_activation_allocator; @@ -276,30 +304,14 @@ void ResidualLayerNorm::inference_kernel_wrapper( } } - if (m->input_type[0] == DT_FLOAT) { - ResidualLayerNorm::inference_kernel( - m, - input.get_float_ptr(), - residual1.get_float_ptr(), - m->use_two_residuals ? residual2.get_float_ptr() : nullptr, - added_output.get_float_ptr(), - output.get_float_ptr(), - m->elementwise_affine ? gamma.get_float_ptr() : nullptr, - (m->elementwise_affine && m->use_bias) ? beta.get_float_ptr() : nullptr, - stream); - } else if (m->input_type[0] == DT_HALF) { - ResidualLayerNorm::inference_kernel( - m, - input.get_half_ptr(), - residual1.get_half_ptr(), - m->use_two_residuals ? residual2.get_half_ptr() : nullptr, - added_output.get_half_ptr(), - output.get_half_ptr(), - m->elementwise_affine ? gamma.get_half_ptr() : nullptr, - (m->elementwise_affine && m->use_bias) ? beta.get_half_ptr() : nullptr, - stream); - } else { - assert(false && "unsupport datatype in layernorm"); + if (m->inference_debugging) { + if (m->input_type[0] == DT_FLOAT) { + save_inference_tensors(m); + } else if (m->input_type[0] == DT_HALF) { + save_inference_tensors(m); + } else { + assert(false && "unsupport datatype in layernorm"); + } } if (m->profiling) { @@ -740,6 +752,34 @@ void peft_bwd_kernel(ResidualLayerNormMeta const *m, const int64_t M = m->effective_batch_size; const int64_t N = m->effective_num_elements; + if (m->inference_debugging) { + // save stuff here + std::string op_name_without_uid = + ResidualLayerNorm::get_op_name_without_uid(m); + char const *folder_path = "./inference_tensors/"; + std::string base_filepath = std::string(folder_path); + if (m->layer_guid.model_id > 0) { + base_filepath += "model_" + std::to_string(m->layer_guid.model_id) + "_"; + } + base_filepath += "bwd_step_" + std::to_string(m->bwd_step); + base_filepath += "_layers_" + + std::to_string(m->layer_guid.transformer_layer_id) + "_" + + op_name_without_uid + "_shard_" + std::to_string(0); + + std::string filename1 = base_filepath + "_mean"; + save_tensor(static_cast(m->mean_ptr), + m->effective_batch_size, + filename1.c_str()); + std::string filename2 = base_filepath + "_rstd"; + save_tensor(static_cast(m->rstd_ptr), + m->effective_batch_size, + filename2.c_str()); + std::string filename3 = base_filepath + "_input_activation"; + save_tensor(static_cast(m->input_activation), + m->effective_batch_size * m->effective_num_elements, + filename3.c_str()); + } + int const warp_size = C10_WARP_SIZE; int const num_threads = 128; const dim3 blocks(M); diff --git a/tests/peft/alignment/align_test_utils.py b/tests/peft/alignment/align_test_utils.py new file mode 100644 index 0000000000..b0cb5fe428 --- /dev/null +++ b/tests/peft/alignment/align_test_utils.py @@ -0,0 +1,240 @@ +import os, re, torch +import numpy as np +abs_dirname = os.path.dirname(os.path.abspath(__file__)) +hf_path = 
os.path.join(abs_dirname, "hf_peft_tensors") +ff_path = os.path.join(os.path.dirname(os.path.dirname(abs_dirname)), "build", "inference_tensors") +def print_unique_files_list(dirname): + files_list = os.listdir(dirname) + for f in sorted(files_list): + match = re.search(r'layers.\d+', f) + if match: + if "layers." in match[0]: + layer_num = int(match[0].split(".")[1]) + if layer_num > 0: + files_list.remove(f) + elif "layers_" in match[0]: + layer_num = int(match[0].split("_")[1]) + if layer_num > 0 and layer_num != 100: + files_list.remove(f) + return sorted(files_list) +def compare_tensors(hf_tensor_filepath, ff_tensor_filepath, tolerance=1e-2): + if not (os.path.exists(hf_tensor_filepath) and os.path.exists(ff_tensor_filepath)): + print(hf_tensor_filepath, os.path.exists(hf_tensor_filepath)) + print(ff_tensor_filepath, os.path.exists(ff_tensor_filepath)) + assert False + hf_tensor = torch.load(hf_tensor_filepath) + if type(hf_tensor) == tuple or type(hf_tensor) == list: + assert(len(hf_tensor) == 1) + hf_tensor = hf_tensor[0] + hf_tensor = torch.nan_to_num(hf_tensor) + hf_tensor = hf_tensor.flatten().detach().cpu().numpy() + ff_tensor = np.loadtxt(ff_tensor_filepath, delimiter=',') + + len_hf_tensor = hf_tensor.shape[0] + ff_tensor = ff_tensor[:len_hf_tensor] + + mismatches = [] + if not np.allclose(ff_tensor, hf_tensor, atol=tolerance): + print(f"mismatch between {hf_tensor_filepath} and {ff_tensor_filepath}") + print(f"HF: {hf_tensor}\nFF:{ff_tensor}") + print(np.isclose(ff_tensor, hf_tensor, atol=tolerance)) + mismatches = np.where(~np.isclose(ff_tensor, hf_tensor, atol=tolerance))[0] + print(mismatches) + #print(np.nonzero(hf_tensor)[0]) + # print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0]) + # print(ff_tensor[36], hf_tensor[36]) + #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance)) + assert(len(mismatches) <= .05*len_hf_tensor) + print("Ok!") +def compare_tensors_difference(hf_tensor_filepath, ff_tensor1_filepath, ff_tensor2_filepath, tolerance=1e-2): + assert(os.path.exists(hf_tensor_filepath)) + assert(os.path.exists(ff_tensor1_filepath)) + assert(os.path.exists(ff_tensor2_filepath)) + hf_tensor = torch.load(hf_tensor_filepath) + if type(hf_tensor) == tuple or type(hf_tensor) == list: + assert(len(hf_tensor) == 1) + hf_tensor = hf_tensor[0] + hf_tensor = torch.nan_to_num(hf_tensor) + hf_tensor = hf_tensor.flatten().detach().cpu().numpy() + ff_tensor1 = np.loadtxt(ff_tensor1_filepath, delimiter=',') + ff_tensor2 = np.loadtxt(ff_tensor2_filepath, delimiter=',') + + len_hf_tensor = hf_tensor.shape[0] + ff_tensor1 = ff_tensor1[:len_hf_tensor] + ff_tensor2 = ff_tensor2[:len_hf_tensor] + ff_tensor = ff_tensor1 - ff_tensor2 + + mismatches = [] + if not np.allclose(ff_tensor, hf_tensor, atol=tolerance): + print(f"mismatch between {hf_tensor_filepath} and {ff_tensor1_filepath} - {ff_tensor2_filepath}") + print(f"HF: {hf_tensor}\nFF:{ff_tensor}") + print(np.isclose(ff_tensor, hf_tensor, atol=tolerance)) + mismatches = np.where(~np.isclose(ff_tensor, hf_tensor, atol=tolerance))[0] + print(mismatches) + #print(np.nonzero(hf_tensor)[0]) + # print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0]) + # print(ff_tensor[36], hf_tensor[36]) + #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance)) + assert(len(mismatches) <= .05*len_hf_tensor) + print("Ok!") +def compare_hf_tensors(tensor1_fp, tensor2_fp): + assert(os.path.exists(tensor1_fp) and os.path.exists(tensor2_fp)) + hf_tensor1 = torch.load(tensor1_fp) + hf_tensor2 = torch.load(tensor2_fp) + 
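The comparison helpers above tolerate small elementwise drift: they first try np.allclose with an absolute tolerance and, when that fails, count the indices where np.isclose is False, accepting the pair as long as at most 5% of elements disagree. A minimal, self-contained sketch of that acceptance rule on synthetic data (array names are illustrative only, not part of the patch):

import numpy as np

def mismatch_fraction(a, b, atol=1e-2):
    # fraction of elements where |a - b| exceeds the absolute tolerance
    return np.mean(~np.isclose(a, b, atol=atol))

rng = np.random.default_rng(0)
ref = rng.standard_normal(1000).astype(np.float32)
noisy = ref + rng.normal(scale=1e-3, size=ref.shape).astype(np.float32)
noisy[:30] += 1.0  # corrupt 3% of the entries

frac = mismatch_fraction(ref, noisy, atol=1e-2)
print(f"{100 * frac:.2f}% mismatch")
assert frac <= 0.05  # same 5% acceptance threshold used by compare_tensors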
if type(hf_tensor1) == tuple or type(hf_tensor1) == list: + assert(len(hf_tensor1) == 1) + hf_tensor1 = hf_tensor1[0] + if type(hf_tensor2) == tuple or type(hf_tensor2) == list: + assert(len(hf_tensor2) == 1) + hf_tensor2 = hf_tensor2[0] + assert(torch.squeeze(hf_tensor1).shape == torch.squeeze(hf_tensor2).shape) + hf_tensor1 = torch.nan_to_num(hf_tensor1) + hf_tensor2 = torch.nan_to_num(hf_tensor2) + if not (np.allclose(hf_tensor1.detach().cpu().numpy(), hf_tensor2.detach().cpu().numpy())): + print(f"mismatch between {tensor1_fp} and {tensor2_fp}") + print(hf_tensor1) + print(hf_tensor2) + print(np.isclose(hf_tensor1.detach().cpu().numpy(), hf_tensor2.detach().cpu().numpy())) + mismatches = np.where(~np.isclose(hf_tensor1.detach().cpu().numpy(), hf_tensor2.detach().cpu().numpy()))[0] + print(mismatches) + assert(False) + print("Ok!") + +def check_hf_sum_tensors(tensor_sum_fp, tensor1_fp, tensor2_fp): + assert(os.path.exists(tensor_sum_fp) and os.path.exists(tensor1_fp) and os.path.exists(tensor2_fp)) + hf_tensor_sum = torch.load(tensor_sum_fp) + hf_tensor1 = torch.load(tensor1_fp) + hf_tensor2 = torch.load(tensor2_fp) + if type(hf_tensor_sum) == tuple or type(hf_tensor_sum) == list: + assert(len(hf_tensor_sum) == 1) + hf_tensor_sum = hf_tensor_sum[0] + if type(hf_tensor1) == tuple or type(hf_tensor1) == list: + assert(len(hf_tensor1) == 1) + hf_tensor1 = hf_tensor1[0] + if type(hf_tensor2) == tuple or type(hf_tensor2) == list: + assert(len(hf_tensor2) == 1) + hf_tensor2 = hf_tensor2[0] + assert(torch.squeeze(hf_tensor_sum).shape == torch.squeeze(hf_tensor1).shape) + assert(torch.squeeze(hf_tensor1).shape == torch.squeeze(hf_tensor2).shape) + hf_tensor1 = torch.nan_to_num(hf_tensor1) + hf_tensor2 = torch.nan_to_num(hf_tensor2) + hf_tensor_sum = torch.nan_to_num(hf_tensor_sum) + sum_check_tensor = hf_tensor1 + hf_tensor2 + if not (np.allclose(sum_check_tensor.detach().cpu().numpy(), hf_tensor_sum.detach().cpu().numpy())): + print(f"mismatch between {sum_check_tensor} and {tensor1_fp} + {tensor2_fp}") + print(tensor_sum_fp) + print(sum_check_tensor) + print(hf_tensor1) + print(hf_tensor2) + print(np.isclose(sum_check_tensor.detach().cpu().numpy(), hf_tensor_sum.detach().cpu().numpy())) + mismatches = np.where(~np.isclose(sum_check_tensor.detach().cpu().numpy(), hf_tensor_sum.detach().cpu().numpy()))[0] + print(mismatches) + assert(False) + print("Ok!") +def check_hf_zero_tensor(hf_tensor_fp): + assert(os.path.exists(hf_tensor_fp)) + hf_tensor1 = torch.load(hf_tensor_fp) + if type(hf_tensor1) == tuple or type(hf_tensor1) == list: + assert(len(hf_tensor1) == 1) + hf_tensor1 = hf_tensor1[0] + assert(torch.count_nonzero(torch.nan_to_num(hf_tensor1)).sum() == 0) +def print_tensors(hf_tensor_filepath, ff_tensor_filepath, txt=""): + assert(os.path.exists(hf_tensor_filepath) and os.path.exists(ff_tensor_filepath)) + hf_tensor = torch.load(hf_tensor_filepath) + if type(hf_tensor) == tuple or type(hf_tensor) == list: + assert(len(hf_tensor) == 1) + hf_tensor = hf_tensor[0] + hf_tensor = torch.nan_to_num(hf_tensor) + hf_tensor = hf_tensor.flatten().detach().cpu().numpy() + ff_tensor = np.loadtxt(ff_tensor_filepath, delimiter=',') + + len_hf_tensor = hf_tensor.shape[0] + ff_tensor = ff_tensor[:len_hf_tensor] + + print(f"{txt} - HF tensor:") + print(hf_tensor) + print(f"{txt} - FF tensor: ") + print(ff_tensor) +def compare_flexflow_tensors(ff_tensor1_fp, ff_tensor2_fp, tolerance=1e-5, max_len=-1): + assert(os.path.exists(ff_tensor1_fp) and os.path.exists(ff_tensor2_fp)) + ff_tensor1 = 
np.loadtxt(ff_tensor1_fp, delimiter=',') + ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=',') + + if (ff_tensor1.shape != ff_tensor2.shape): + print(ff_tensor1.shape, ff_tensor2.shape) + assert(ff_tensor1.shape == ff_tensor2.shape) + + if max_len > -1: + ff_tensor1 = ff_tensor1[:max_len] + ff_tensor2 = ff_tensor2[:max_len] + + mismatches = [] + if not np.allclose(ff_tensor1, ff_tensor2, atol=tolerance): + print(f"mismatch between {ff_tensor1_fp} and {ff_tensor2_fp}") + print(f"Tensor1: {ff_tensor1}\nTensor2:{ff_tensor2}") + print(np.isclose(ff_tensor1, ff_tensor2, atol=tolerance)) + mismatches = np.where(~np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))[0] + print(mismatches) + #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance)) + assert(len(mismatches) <= .05*len(ff_tensor1)) + print("Ok!") +def compare_flexflow_tensors_shortest(ff_tensor1_fp, ff_tensor2_fp, tolerance=1e-5): + assert(os.path.exists(ff_tensor1_fp) and os.path.exists(ff_tensor2_fp)) + ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=',') + ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=',') + minlen = min(ff_tensor1.shape[0], ff_tensor2.shape[0]) + ff_tensor1 = ff_tensor1[:minlen] + ff_tensor2 = ff_tensor2[:minlen] + mismatches = [] + if not np.allclose(ff_tensor1, ff_tensor2, atol=tolerance): + print(f"mismatch between {ff_tensor1_fp} and {ff_tensor2_fp}") + print(f"Tensor1: {ff_tensor1}\nTensor2:{ff_tensor2}") + print(np.isclose(ff_tensor1, ff_tensor2, atol=tolerance)) + mismatches = np.where(~np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))[0] + print(mismatches) + #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance)) + assert(len(mismatches) <= .05*len(ff_tensor1)) + print("Ok!") +def check_flexflow_tensors_sum(ff_tensor_sum_fp, ff_tensor1_fp, ff_tensor2_fp, tolerance=1e-5): + assert(os.path.exists(ff_tensor1_fp) and os.path.exists(ff_tensor2_fp)) + ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=',') + ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=',') + ff_tensor_sum = np.loadtxt(ff_tensor_sum_fp, delimiter=',') + + ff_sum = ff_tensor1 + ff_tensor2 + assert(ff_tensor1.shape == ff_tensor2.shape) + + mismatches = [] + if not np.allclose(ff_tensor_sum, ff_sum, atol=tolerance): + print(f"mismatch between {ff_tensor_sum_fp} and sum of {ff_tensor1_fp} + {ff_tensor2_fp}") + print(f"Tensor1: {ff_tensor1}\nTensor2:{ff_tensor2}") + print(f"Sum Tensor: {ff_tensor_sum}\nActual sum:{ff_sum}") + print(np.isclose(ff_tensor_sum, ff_sum, atol=tolerance)) + mismatches = np.where(~np.isclose(ff_tensor_sum, ff_sum, atol=tolerance))[0] + print(mismatches) + #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance)) + assert(len(mismatches) <= .05*len(ff_tensor1)) + print("Ok!") +def load_ff_tensor(filename, shape): + if ff_path not in filename: + filename = os.path.join(ff_path, filename) + ff_tensor = np.loadtxt(filename, delimiter=',').reshape(shape, order = 'F') + return ff_tensor +def load_hf_tensor(filename): + if hf_path not in filename: + filename = os.path.join(hf_path, filename) + hf_tensor = torch.load(filename) + hf_tensor = hf_tensor.detach().cpu().numpy() + return hf_tensor +def compare_loaded_tensors(hf_tensor, ff_tensor, tolerance=1e-2): + assert(hf_tensor.shape == ff_tensor.shape) + mismatches = [] + if not np.allclose(hf_tensor, ff_tensor, atol=tolerance): + print(f"mismatch between hf_tensor and ff_tensor") + print(f"HF: {hf_tensor}\nFF:{ff_tensor}") + print(np.isclose(hf_tensor, ff_tensor, atol=tolerance)) + mismatches = np.where(~np.isclose(hf_tensor, ff_tensor, atol=tolerance))[0] + 
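The FlexFlow tensor dumps that these helpers read follow the naming scheme introduced by the C++ changes earlier in this patch: an optional model prefix, a fwd_step_/bwd_step_ counter, the transformer layer number, the operator name, the shard id, and a per-tensor suffix such as "input_0" or "output_0". A hedged sketch of a path builder following that layout; the operator name in the usage line is illustrative, not part of the patch:

import os

def ff_debug_tensor_path(step, layer_num, op_name, suffix,
                         shard_id=0, bwd=False, model_id=0,
                         folder="./inference_tensors/"):
    # Mirrors the base_filepath construction above: optional model prefix,
    # fwd_step_/bwd_step_ counter, layer number, operator name, shard id,
    # then the tensor-specific suffix.
    name = ""
    if model_id > 0:
        name += f"model_{model_id}_"
    name += f"{'bwd' if bwd else 'fwd'}_step_{step}"
    name += f"_layers_{layer_num}_{op_name}_shard_{shard_id}_{suffix}"
    return os.path.join(folder, name)

# e.g. the attention output of decoding step 0 in layer 11
print(ff_debug_tensor_path(0, 11, "layers_11_attention", "output_0"))
# ./inference_tensors/fwd_step_0_layers_11_layers_11_attention_shard_0_output_0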
print(mismatches) + len_hf_tensor = hf_tensor.flatten().shape[0] + assert(len(mismatches) <= .05*len_hf_tensor) + print("Ok!") \ No newline at end of file diff --git a/tests/peft/alignment/llama_alignment_tests.ipynb b/tests/peft/alignment/llama_alignment_tests.ipynb new file mode 100644 index 0000000000..414280cff5 --- /dev/null +++ b/tests/peft/alignment/llama_alignment_tests.ipynb @@ -0,0 +1,2039 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import os, torch\n", + "from align_test_utils import *" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "for i in range(tot_num_layers):\n", + " hf_input_ln_out = f\"{hf_path}/fwd_step_0_layers.{i}.input_layernorm.output_0\"\n", + " ff_input_ln_out = f\"{ff_path}/fwd_step_0_layers_{i}_RMSNorm_shard_0_output_0\"\n", + " if i > 0:\n", + " ff_input_ln_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_attention_norm_shard_0_output_1\"\n", + " compare_tensors(hf_input_ln_out, ff_input_ln_out)\n", + " hf_attn_out = f\"{hf_path}/fwd_step_0_layers.{i}.self_attn.o_proj.output_0\"\n", + " ff_attn_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_attention_shard_0_output_0\"\n", + " compare_tensors(hf_attn_out, ff_attn_out)\n", + " hf_ffn_norm_out = f\"{hf_path}/fwd_step_0_layers.{i}.post_attention_layernorm.output_0\"\n", + " ff_ffn_norm_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_ffn_norm_shard_0_output_1\"\n", + " compare_tensors(hf_ffn_norm_out, ff_ffn_norm_out)\n", + " # w1\n", + " hf_gate_proj_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.gate_proj.output_0\"\n", + " ff_gate_proj_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w1_shard_0_output_0\"\n", + " compare_tensors(hf_gate_proj_out, 
ff_gate_proj_out)\n", + " # w3\n", + " hf_up_proj_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.up_proj.output_0\" \n", + " ff_up_proj_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w3_shard_0_output_0\"\n", + " compare_tensors(hf_up_proj_out, ff_up_proj_out)\n", + " # w2\n", + " hf_down_proj_in = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.input_0\"\n", + " hf_down_proj_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.output_0\"\n", + " ff_down_proj_in = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_shard_0_input_0\"\n", + " ff_down_proj_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_shard_0_output_0\"\n", + " compare_tensors(hf_down_proj_in, ff_down_proj_in)\n", + " # compare_tensors(hf_down_proj_out, ff_down_proj_out)\n", + " # LORA input\n", + " hf_lora_A_in = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_A.default.input_0\"\n", + " ff_lora_A_in = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_input_0\"\n", + " compare_hf_tensors(hf_down_proj_in, hf_lora_A_in)\n", + " compare_tensors(hf_lora_A_in, ff_lora_A_in)\n", + " # LORA weights\n", + " hf_lora_A_weight_fp = f\"{hf_path}/layers.{i}.mlp.down_proj.lora_A.default.weight\"\n", + " ff_lora_A_weight_fp = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_weight_A\"\n", + " compare_tensors(hf_lora_A_weight_fp, ff_lora_A_weight_fp)\n", + " hf_lora_B_weight_fp = f\"{hf_path}/layers.{i}.mlp.down_proj.lora_B.default.weight\"\n", + " ff_lora_B_weight_fp = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_weight_B\"\n", + " compare_tensors(hf_lora_B_weight_fp, ff_lora_B_weight_fp)\n", + " # LORA intermediate hf\n", + " hf_lora_A_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_A.default.output_0\"\n", + " hf_lora_B_in = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_B.default.input_0\"\n", + " compare_hf_tensors(hf_lora_A_out, hf_lora_B_in)\n", + " # LORA output\n", + " hf_lora_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_B.default.output_0\"\n", + " ff_lora_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_output_0\"\n", + " # compare_tensors(hf_lora_out, ff_lora_out)\n", + " # compare_flexflow_tensors(ff_down_proj_out, ff_lora_out)\n", + " # compare_tensors(hf_down_proj_out, ff_lora_out)\n", + " compare_tensors_difference(hf_lora_out, ff_lora_out, ff_down_proj_out)\n", + " \n", + "\n", + "# After last layer only\n", + "hf_norm_out = f\"{hf_path}/fwd_step_0_norm.output_0\"\n", + "ff_norm_out = f\"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_norm_shard_0_output_1\"\n", + "compare_tensors(hf_norm_out, ff_norm_out)\n", + "hf_lm_head_out = f\"{hf_path}/fwd_step_0_base_model.model.lm_head.output_0\"\n", + "ff_lm_head_out = f\"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_output_shard_0_output_0\"\n", + "compare_tensors(hf_lm_head_out, ff_lm_head_out)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-- LM head --\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "-- Final Norm --\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "\n", + "# ff_BWD_softmax_in = f\"{ff_path}/model_0_bwd-step_0_layer-num_100_layer-name_Softmax_shard-id_0_input_0\"\n", + "print(\"-- LM head --\")\n", + "hf_BWD_lm_head_out = f\"{hf_path}/bwd_step_0_base_model.model.lm_head.go_0\"\n", + "ff_BWD_lm_head_out = 
f\"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_output_shard_0_output_0\"\n", + "compare_tensors(hf_BWD_lm_head_out, ff_BWD_lm_head_out, tolerance=1e-5)\n", + "# compare weights\n", + "hf_lm_head_weight = f\"{hf_path}/base_model.model.lm_head.weight\"\n", + "ff_lm_head_weight = f\"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_output_shard_0_weight_0\"\n", + "compare_tensors(hf_lm_head_weight, ff_lm_head_weight, tolerance=1e-5)\n", + "hf_BWD_lm_head_in = f\"{hf_path}/bwd_step_0_base_model.model.lm_head.gi_0\"\n", + "ff_BWD_lm_head_in = f\"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_output_shard_0_input_0\"\n", + "compare_tensors(hf_BWD_lm_head_in, ff_BWD_lm_head_in, tolerance=1e-5)\n", + "# # Manually check the matmul\n", + "# ff_tensor_out = np.loadtxt(ff_BWD_lm_head_out, delimiter=',')\n", + "# ff_weight = np.loadtxt(ff_lm_head_weight, delimiter=',').reshape((4096,32000), order='F')\n", + "# ff_tensor_out = ff_tensor_out[:32000*24].reshape((32000,24), order='F')\n", + "# print(ff_tensor_out.shape)\n", + "# print(ff_weight.shape)\n", + "# print(np.matmul(ff_weight, ff_tensor_out))\n", + "# compare_tensors(hf_BWD_lm_head_in, ff_BWD_lm_head_in)\n", + "# ff_tensor = np.loadtxt(ff_tensor_filepath, delimiter=',')\n", + "print(\"-- Final Norm --\")\n", + "hf_BWD_norm_out = f\"{hf_path}/bwd_step_0_norm.go_0\"\n", + "ff_BWD_norm_out = f\"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_norm_shard_0_output_0\"\n", + "compare_hf_tensors(hf_BWD_lm_head_in, hf_BWD_norm_out)\n", + "compare_tensors(hf_BWD_norm_out, ff_BWD_norm_out)\n", + "ff_BWD_norm_weight = f\"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_norm_shard_0_weight_0\"\n", + "hf_FWD_norm_weight = f\"{hf_path}/norm.weight\"\n", + "compare_tensors(hf_FWD_norm_weight, ff_BWD_norm_weight, tolerance=1e-5)\n", + "hf_BWD_norm_in = f\"{hf_path}/bwd_step_0_norm.gi_0\"\n", + "ff_BWD_norm_in = f\"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_norm_shard_0_input_1\"\n", + "compare_tensors(hf_BWD_norm_in, ff_BWD_norm_in, tolerance=1e-5)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from torch import nn\n", + "class LlamaRotaryEmbedding(nn.Module):\n", + " def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):\n", + " super().__init__()\n", + "\n", + " self.dim = dim\n", + " self.max_position_embeddings = max_position_embeddings\n", + " self.base = base\n", + " inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))\n", + " self.register_buffer(\"inv_freq\", inv_freq, persistent=False)\n", + "\n", + " # Build here to make `torch.jit.trace` work.\n", + " self._set_cos_sin_cache(\n", + " seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()\n", + " )\n", + "\n", + " def _set_cos_sin_cache(self, seq_len, device, dtype):\n", + " self.max_seq_len_cached = seq_len\n", + " t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)\n", + "\n", + " freqs = torch.einsum(\"i,j->ij\", t, self.inv_freq)\n", + " # Different from paper, but it uses a different permutation in order to obtain the same calculation\n", + " emb = torch.cat((freqs, freqs), dim=-1)\n", + " self.register_buffer(\"cos_cached\", emb.cos().to(dtype), persistent=False)\n", + " self.register_buffer(\"sin_cached\", emb.sin().to(dtype), persistent=False)\n", + "\n", + " def forward(self, x, seq_len=None):\n", + " # x: [bs, num_attention_heads, seq_len, head_size]\n", + " if seq_len > 
self.max_seq_len_cached:\n", + " self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)\n", + "\n", + " return (\n", + " self.cos_cached[:seq_len].to(dtype=x.dtype),\n", + " self.sin_cached[:seq_len].to(dtype=x.dtype),\n", + " )\n", + "def rotate_half(x):\n", + " \"\"\"Rotates half the hidden dims of the input.\"\"\"\n", + " x1 = x[..., : x.shape[-1] // 2] # first half\n", + " x2 = x[..., x.shape[-1] // 2 :] # second half\n", + " return torch.cat((x2, -x1), dim=-1)\n", + "def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):\n", + " \"\"\"Applies Rotary Position Embedding to the query and key tensors.\n", + "\n", + " Args:\n", + " q (`torch.Tensor`): The query tensor.\n", + " k (`torch.Tensor`): The key tensor.\n", + " cos (`torch.Tensor`): The cosine part of the rotary embedding.\n", + " sin (`torch.Tensor`): The sine part of the rotary embedding.\n", + " position_ids (`torch.Tensor`):\n", + " The position indices of the tokens corresponding to the query and key tensors. For example, this can be\n", + " used to pass offsetted position ids when working with a KV-cache.\n", + " unsqueeze_dim (`int`, *optional*, defaults to 1):\n", + " The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and\n", + " sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note\n", + " that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and\n", + " k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes\n", + " cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have\n", + " the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.\n", + " Returns:\n", + " `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.\n", + " \"\"\"\n", + " cos = cos[position_ids].unsqueeze(unsqueeze_dim)\n", + " sin = sin[position_ids].unsqueeze(unsqueeze_dim)\n", + " q_embed = (q * cos) + (rotate_half(q) * sin)\n", + " k_embed = (k * cos) + (rotate_half(k) * sin)\n", + " return q_embed, k_embed\n", + "head_dim = 64\n", + "max_position_embeddings = 2048\n", + "rope_theta=10_000\n", + "kv_seq_len = 24\n", + "rotary_emb = LlamaRotaryEmbedding(\n", + " head_dim,\n", + " max_position_embeddings=max_position_embeddings,\n", + " base=rope_theta,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Huggingface checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "Huggingface-FlexFlow checks:\n", + "-- W2 --\n", + "Ok!\n", + "Ok!\n", + "-- Lora --\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "-- W2/W1/W3 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_11_SigmoidSiluMulti_shard_0_output_0\n", + "HF: [ 6.4350547e+03 -6.4898600e+05 1.1761116e+05 ... 2.1410337e+01\n", + " 1.2096541e+01 3.6424692e+00]\n", + "FF:[ 6.43525000e+03 -6.48986062e+05 1.17611250e+05 ... 2.14103413e+01\n", + " 1.20965385e+01 3.64246368e+00]\n", + "[False True True ... 
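The rotary-embedding helpers defined in the cell above mirror the Hugging Face implementation. A short usage sketch, assuming the LlamaRotaryEmbedding instance (rotary_emb) and the head_dim / kv_seq_len values from that cell; the final assert checks the norm-preserving property of the rotation:

import torch

bs, n_heads = 1, 12
q = torch.randn(bs, n_heads, kv_seq_len, head_dim)
k = torch.randn(bs, n_heads, kv_seq_len, head_dim)
cos, sin = rotary_emb(q, seq_len=kv_seq_len)
position_ids = torch.arange(kv_seq_len).unsqueeze(0)
q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin, position_ids)
# RoPE rotates each (x1, x2) pair by a position-dependent angle,
# so per-head vector norms are preserved up to float error
assert torch.allclose(q.norm(dim=-1), q_rot.norm(dim=-1), atol=1e-4)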
True True True]\n", + "[ 0 162 185 308 339 745 747 820 830 909 933 968 1008 1156\n", + " 1160 1190 1212 1296 1304 1311 1323 1353 1395 1421 1523 1578 1689 1717\n", + " 1736 1748 1836 2074 2124 2192 2221 2313 2394 2515 2518 2693 2758 2825\n", + " 2888 2894 2937 3024]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_11_layers_11_feed_forward_w2_shard_0_input_0\n", + "HF: [ 6.4350547e+03 -6.4898600e+05 1.1761116e+05 ... 2.1410337e+01\n", + " 1.2096541e+01 3.6424692e+00]\n", + "FF:[ 6.43525000e+03 -6.48986062e+05 1.17611250e+05 ... 2.14103413e+01\n", + " 1.20965385e+01 3.64246368e+00]\n", + "[False True True ... True True True]\n", + "[ 0 162 185 308 339 745 747 820 830 909 933 968 1008 1156\n", + " 1160 1190 1212 1296 1304 1311 1323 1353 1395 1421 1523 1578 1689 1717\n", + " 1736 1748 1836 2074 2124 2192 2221 2313 2394 2515 2518 2693 2758 2825\n", + " 2888 2894 2937 3024]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "-- Attention --\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.self_attn.o_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_11_layers_11_attention_shard_0_o_proj_in_grad\n", + "HF: [ 1.2223595e+06 -2.6348565e+06 -5.0760525e+05 ... 6.8275871e+01\n", + " -5.8116108e+01 9.5347488e+01]\n", + "FF:[ 1.22235925e+06 -2.63485625e+06 -5.07605000e+05 ... 6.82758865e+01\n", + " -5.81161423e+01 9.53475494e+01]\n", + "[ True True True ... True True True]\n", + "[ 51 77 95 168 175 232 725]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[ 1.22235950e+06 9.93645859e+01 -2.82157593e+01 ... -3.94578514e+01\n", + " -1.98409653e+01 -1.33438044e+01]\n", + " [-2.63485650e+06 -1.13461929e+02 1.14223976e+02 ... 7.52578735e+01\n", + " 1.33362747e+02 6.78501587e+01]\n", + " [-5.07605250e+05 4.34111862e+01 8.10619354e+01 ... 4.70537224e+01\n", + " 4.02149696e+01 6.98045502e+01]\n", + " ...\n", + " [ 3.02792250e+06 3.31295319e+02 9.98417091e+00 ... 4.90895653e+01\n", + " 9.71413574e+01 6.82758713e+01]\n", + " [-3.64456375e+06 -2.43692596e+02 -6.85474396e+00 ... -3.71503868e+01\n", + " -1.34136658e+01 -5.81161079e+01]\n", + " [ 3.31921500e+06 2.24193970e+02 -6.64005566e+00 ... 2.11662292e+00\n", + " 3.37400856e+01 9.53474884e+01]]\n", + "FF:[[ 1.22235925e+06 9.93645630e+01 -2.82157211e+01 ... -3.94577713e+01\n", + " -1.98408775e+01 -1.33438234e+01]\n", + " [-2.63485625e+06 -1.13461960e+02 1.14224037e+02 ... 7.52577744e+01\n", + " 1.33362701e+02 6.78501205e+01]\n", + " [-5.07605000e+05 4.34111404e+01 8.10619278e+01 ... 4.70536804e+01\n", + " 4.02149124e+01 6.98045578e+01]\n", + " ...\n", + " [ 3.02792250e+06 3.31295227e+02 9.98412323e+00 ... 4.90895386e+01\n", + " 9.71413727e+01 6.82758865e+01]\n", + " [-3.64456400e+06 -2.43692627e+02 -6.85472488e+00 ... -3.71504822e+01\n", + " -1.34137001e+01 -5.81161423e+01]\n", + " [ 3.31921500e+06 2.24193970e+02 -6.64004517e+00 ... 2.11670875e+00\n", + " 3.37400322e+01 9.53475494e+01]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... 
True True True]]\n", + "[ 51 77 95 168 175 232 725]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[ 1.2223588e+06 -2.6348530e+06 -5.0760291e+05 ... 3.0279325e+06\n", + " -3.6445672e+06 3.3192180e+06]\n", + " [-4.2496326e+02 1.1576636e+03 9.8397858e+02 ... 1.6480791e+03\n", + " -5.9697235e+02 6.2627173e+02]\n", + " [-2.2012039e+01 6.6097900e+01 3.9933994e+01 ... 5.7103355e+01\n", + " -1.5968766e+01 3.6536639e+00]\n", + " ...\n", + " [-1.2302110e+00 5.3052688e+00 2.1982718e+00 ... 1.3990868e+00\n", + " -5.5132383e-01 4.8985812e-01]\n", + " [-1.0771493e+00 6.9571300e+00 2.7373023e+00 ... 4.9663010e+00\n", + " -9.9705428e-01 2.1829298e+00]\n", + " [-5.9534687e-01 3.0272012e+00 3.1143982e+00 ... 2.4072502e+00\n", + " -2.0490403e+00 3.3617332e+00]]\n", + "FF:[[ 1.22235850e+06 -2.63485275e+06 -5.07602656e+05 ... 3.02793250e+06\n", + " -3.64456750e+06 3.31921800e+06]\n", + " [-4.24962585e+02 1.15766296e+03 9.83978577e+02 ... 1.64807898e+03\n", + " -5.96972351e+02 6.26271790e+02]\n", + " [-2.20120354e+01 6.60979462e+01 3.99340210e+01 ... 5.71033745e+01\n", + " -1.59687757e+01 3.65366316e+00]\n", + " ...\n", + " [-1.23020661e+00 5.30526114e+00 2.19826817e+00 ... 1.39908671e+00\n", + " -5.51325083e-01 4.89858717e-01]\n", + " [-1.07714510e+00 6.95712519e+00 2.73729825e+00 ... 4.96630049e+00\n", + " -9.97055829e-01 2.18292713e+00]\n", + " [-5.95347941e-01 3.02720070e+00 3.11439991e+00 ... 2.40725493e+00\n", + " -2.04904509e+00 3.36174107e+00]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[0 0 0 0 0 0 0]\n", + "Ok!\n", + "7.4363425925925934% mismatch in QK prods softmax out grad\n", + "Ok!\n", + "hf_attn_in: (768, 24)\n", + "[[-7.52523500e+06 -1.27625415e+03 -4.39338150e+01 ... -3.34414902e+01\n", + " 2.38160934e+01 3.15938339e+01]\n", + " [-9.55138900e+06 6.71377197e+02 2.06871887e+02 ... -3.86393509e+01\n", + " 2.14816055e+01 -6.58599396e+01]\n", + " [ 1.14522670e+07 2.19898975e+03 -6.89673233e+00 ... 9.51593590e+00\n", + " -1.68612709e+01 6.02474251e+01]\n", + " ...\n", + " [ 2.10891925e+06 3.78648706e+03 1.02701221e+03 ... 3.59794388e+01\n", + " 5.03902206e+01 4.19777756e+01]\n", + " [ 2.11695300e+06 -2.36283508e+02 -1.08002625e+02 ... 9.36443710e+00\n", + " 3.84094887e+01 -7.51948738e+00]\n", + " [ 7.39155050e+06 1.11731885e+03 3.38369843e+02 ... 3.70399475e+01\n", + " 1.77629051e+01 9.76780853e+01]]\n", + "ff_attn_in: (768, 24)\n", + "[[-7.52523600e+06 -1.27625293e+03 -4.39336700e+01 ... -3.34414597e+01\n", + " 2.38162422e+01 3.15938187e+01]\n", + " [-9.55138900e+06 6.71377319e+02 2.06871674e+02 ... -3.86393127e+01\n", + " 2.14817867e+01 -6.58600464e+01]\n", + " [ 1.14522660e+07 2.19898950e+03 -6.89660644e+00 ... 9.51594448e+00\n", + " -1.68611774e+01 6.02474518e+01]\n", + " ...\n", + " [ 2.10891850e+06 3.78648633e+03 1.02701196e+03 ... 3.59794846e+01\n", + " 5.03901253e+01 4.19777679e+01]\n", + " [ 2.11695400e+06 -2.36282440e+02 -1.08002762e+02 ... 9.36448860e+00\n", + " 3.84096107e+01 -7.51954842e+00]\n", + " [ 7.39155000e+06 1.11731921e+03 3.38370087e+02 ... 
3.70398293e+01\n", + " 1.77627277e+01 9.76782227e+01]]\n", + "6.011284722222222% mismatch in attention input grads\n", + "\n", + "Huggingface checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "Huggingface-FlexFlow checks:\n", + "-- W2 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.mlp.down_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_layers_10_feed_forward_w2_shard_0_output_0\n", + "HF: [-9.4779546e+09 -1.2174155e+10 1.4899113e+10 ... 4.9057606e+01\n", + " 4.7770348e+01 5.8564331e+01]\n", + "FF:[-9.47795558e+09 -1.21741548e+10 1.48991119e+10 ... 4.90575981e+01\n", + " 4.77703362e+01 5.85643845e+01]\n", + "[ True True True ... True True True]\n", + "[ 88 138 187 203 232 242 493 657 750 900 1198 1249\n", + " 1287 1305 1414 1428 1490 1588 1600 1612 1625 1657 1676 1677\n", + " 1692 1694 1724 1730 1772 1822 1825 1838 1853 1910 2035 2043\n", + " 2053 2059 2073 2078 2123 2145 2214 2238 2241 2285 2292 2389\n", + " 2542 2582 2589 2599 2674 2688 2711 2840 2856 2961 2963 2980\n", + " 3064 3176 3192 3255 3262 3278 3338 3341 3412 3419 3492 3590\n", + " 3624 3646 3657 3807 3840 3842 3846 3883 3887 4005 4049 4071\n", + " 4076 4077 4079 4137 4142 4192 4193 4202 4218 4224 4273 4355\n", + " 4358 4381 4401 4435 4469 4499 4514 4546 4598 4619 4747 4846\n", + " 4872 4916 4952 4966 5016 5067 5107 5112 5116 5194 5225 5350\n", + " 5364 5403 5515 5537 5550 5578 5650 5653 5654 5736 5751 5837\n", + " 5870 5881 5972 5998 6006 6051 6061 6107 6129 6204 6236 6292\n", + " 6296 6327 6382 6393 6403 6420 6424 6436 6468 6542 6599 6675\n", + " 6681 6711 6723 6767 6823 6914 6983 7047 7064 7133 7167 7197\n", + " 7198 7209 7528 7537 7538 7686 7850 7855 7889 7910 7919 7927\n", + " 7937 7939 8089 8101 8157 8169 8175 8223 8292 8304 8306 8342\n", + " 8351 8414 8475 8500 8543 8558 8609 8656 8687 8704 8724 8726\n", + " 8777 8816 8826 8871 8904 8934 8983 9012 9033 9043 9068 9093\n", + " 9125 9133 9144 9151 9154 9217 9222 9320 9335 9367 9398 9421\n", + " 9434 9521 9547 9633 9702 9726 9763 9949 10018 10053 10062 10079\n", + " 10137 10149 10203 10261 10269 10292 10312 10332 10471 10478 10514 10596\n", + " 10645 10676 10678 10781 10795 10810 10833 10891 10904 10935 10957 10977\n", + " 10982 11028 11095 11172 11223 11251 11283 11303 11319 11374 11392 11437\n", + " 11486 11627 11678 11750 11759 11979 11996 12019 12126 12237 12262 12288\n", + " 12303 12309 12315 12387 12543 12569 12613 12648 12786 12852 12866 12879\n", + " 12947 12963 13037 13058 13261 13284 13312 13394 13399 13427 13526 13527\n", + " 13592 13695 13741 13752 13775 13803 13812 13866 13902 14049 14170 14241\n", + " 14354 14382 14426 14451 14455 14486 14502 14582 14820 14934 14961 14976\n", + " 15000 15003 15014 15077 15096 15108 15135 15148 15165 15219 15232 15290\n", + " 15339 15345 15819 15945 15994 16077 16135 16218 16231 16233 16239 16243\n", + " 16295 16311 16339 16356 16366 16417 16456 16498 16502 16503 16506 16547\n", + " 16585 16603 16611 16633 16661 16683 16704 16710 16723 16724 16745 16754\n", + " 16773 16787 16789 16818 16829 16833 16913 16933 17025 17033 17037 17055\n", + " 17084 17098 17109 17176 17225 17240 17292 17294 17339 17390 17427 17437\n", + " 17579 17626 17630 17654 17719 17902 17912 18023 18025 18124 18203 18339\n", + " 18344]\n", + "Ok!\n", + "Ok!\n", + "-- Lora --\n", + "Ok!\n", + "Ok!\n", + "mismatch between 
/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.mlp.down_proj.lora_B.default.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_layers_10_feed_forward_w2_lora_shard_0_output_0\n", + "HF: [-9.4779546e+09 -1.2174155e+10 1.4899113e+10 ... 4.9057606e+01\n", + " 4.7770348e+01 5.8564331e+01]\n", + "FF:[-9.47795558e+09 -1.21741548e+10 1.48991119e+10 ... 4.90575981e+01\n", + " 4.77703362e+01 5.85643845e+01]\n", + "[ True True True ... True True True]\n", + "[ 88 138 187 203 232 242 493 657 750]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.mlp.down_proj.lora_A.default.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_layers_10_feed_forward_w2_lora_shard_0_input_0\n", + "HF: [ 4.7819588e+07 3.8833264e+07 4.7789860e+07 ... 1.0804405e+00\n", + " 2.7186510e-01 -2.9918199e+00]\n", + "FF:[ 4.78195960e+07 3.88332640e+07 4.77898600e+07 ... 1.08044124e+00\n", + " 2.71864563e-01 -2.99182224e+00]\n", + "[ True True True ... True True True]\n", + "[ 109 211 312 422 590 832 835 1016 1053 1076 1268 1353 1374 1693\n", + " 1701 1710 1722 1832 1954 1965 1997 2076 2124 2146 2378 2520 2605 2624\n", + " 2967 3007 3015]\n", + "Ok!\n", + "-- W2/W1/W3 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_SigmoidSiluMulti_shard_0_output_0\n", + "HF: [ 3.3558659e+09 1.3409817e+10 -1.4671958e+10 ... 7.2100967e+01\n", + " 6.5979071e+00 -2.1230124e+01]\n", + "FF:[ 3.35586406e+09 1.34098166e+10 -1.46719611e+10 ... 7.21009750e+01\n", + " 6.59790993e+00 -2.12301121e+01]\n", + "[ True True True ... True True True]\n", + "[ 4 95 111 163 179 191 279 305 363 406 447 487 489 494\n", + " 517 617 703 713 735 796 805 819 826 858 882 959 964 967\n", + " 986 1020 1035 1054 1067 1070 1077 1081 1095 1097 1123 1139 1181 1238\n", + " 1296 1342 1369 1489 1550 1557 1623 1669 1752 1757 1783 1819 1876 1949\n", + " 1963 1993 2034 2047 2091 2115 2153 2170 2306 2381 2419 2431 2456 2501\n", + " 2503 2591 2653 2768 2778 2791 2970 2980 3053 3067]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_layers_10_feed_forward_w2_shard_0_input_0\n", + "HF: [ 3.3558659e+09 1.3409817e+10 -1.4671958e+10 ... 7.2100967e+01\n", + " 6.5979071e+00 -2.1230124e+01]\n", + "FF:[ 3.35586406e+09 1.34098166e+10 -1.46719611e+10 ... 7.21009750e+01\n", + " 6.59790993e+00 -2.12301121e+01]\n", + "[ True True True ... 
True True True]\n", + "[ 4 95 111 163 179 191 279 305 363 406 447 487 489 494\n", + " 517 617 703 713 735 796 805 819 826 858 882 959 964 967\n", + " 986 1020 1035 1054 1067 1070 1077 1081 1095 1097 1123 1139 1181 1238\n", + " 1296 1342 1369 1489 1550 1557 1623 1669 1752 1757 1783 1819 1876 1949\n", + " 1963 1993 2034 2047 2091 2115 2153 2170 2306 2381 2419 2431 2456 2501\n", + " 2503 2591 2653 2768 2778 2791 2970 2980 3053 3067]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "-- Attention --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.self_attn.o_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_layers_10_attention_shard_0_output_0\n", + "HF: [-9.4779546e+09 -1.2174155e+10 1.4899113e+10 ... 9.3464905e+01\n", + " 7.5613129e+01 7.6598846e+01]\n", + "FF:[-9.47795558e+09 -1.21741548e+10 1.48991119e+10 ... 9.34649200e+01\n", + " 7.56131058e+01 7.65989227e+01]\n", + "[ True True True ... True True True]\n", + "[ 88 138 187 203 232 242 493 657 750]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.self_attn.o_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_layers_10_attention_shard_0_o_proj_in_grad\n", + "HF: [-9.4470595e+09 -7.3870331e+09 1.2659395e+10 ... -2.8149616e+01\n", + " 1.7019112e+02 -7.7236428e+00]\n", + "FF:[-9.44706150e+09 -7.38703309e+09 1.26593966e+10 ... -2.81496239e+01\n", + " 1.70191177e+02 -7.72364044e+00]\n", + "[ True True True ... True True True]\n", + "[ 11 98 109 134 262 266 274 309 310 327 328 364 398 409 429 605 645]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[-9.44705946e+09 2.28078384e+01 3.18554016e+02 ... 1.17267204e+02\n", + " 2.06791725e+01 1.13138672e+02]\n", + " [-7.38703309e+09 -7.36898804e+00 7.93705673e+01 ... 2.04039650e+01\n", + " 3.18331490e+01 5.44241562e+01]\n", + " [ 1.26593946e+10 1.77534424e+02 -2.97175941e+01 ... 1.16716766e+01\n", + " 7.70214081e+01 2.81902496e+02]\n", + " ...\n", + " [ 4.51210445e+10 3.63867615e+02 -8.04915466e+01 ... -1.34332123e+02\n", + " -1.22151840e+02 -2.81496162e+01]\n", + " [-1.39591885e+10 1.59216873e+02 6.11343079e+01 ... 1.56675262e+02\n", + " 9.68551483e+01 1.70191116e+02]\n", + " [-1.29442345e+10 -2.39441833e+02 2.73647644e+02 ... -4.41197014e+01\n", + " -9.48526230e+01 -7.72364283e+00]]\n", + "FF:[[-9.44706150e+09 2.28079376e+01 3.18553864e+02 ... 1.17267227e+02\n", + " 2.06791859e+01 1.13138741e+02]\n", + " [-7.38703309e+09 -7.36921692e+00 7.93703690e+01 ... 2.04038925e+01\n", + " 3.18332825e+01 5.44241333e+01]\n", + " [ 1.26593966e+10 1.77534454e+02 -2.97174206e+01 ... 1.16717224e+01\n", + " 7.70213699e+01 2.81902618e+02]\n", + " ...\n", + " [ 4.51210527e+10 3.63867554e+02 -8.04915695e+01 ... -1.34332092e+02\n", + " -1.22151901e+02 -2.81496239e+01]\n", + " [-1.39591834e+10 1.59216995e+02 6.11343040e+01 ... 1.56675293e+02\n", + " 9.68551559e+01 1.70191177e+02]\n", + " [-1.29442304e+10 -2.39441772e+02 2.73647644e+02 ... -4.41196594e+01\n", + " -9.48526916e+01 -7.72364044e+00]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... 
True True True]]\n", + "[ 11 98 109 134 262 266 274 309 310 327 328 364 398 409 429 605 645]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[-9.44705946e+09 -7.38703309e+09 1.26593946e+10 ... 4.51210445e+10\n", + " -1.39591885e+10 -1.29442345e+10]\n", + " [ 1.14852783e+03 4.39543152e+02 1.07877356e+03 ... -2.42416113e+03\n", + " 2.64504834e+03 4.68633453e+02]\n", + " [ 5.72417107e+01 4.12602806e+01 -2.27319489e+01 ... -3.40788422e+01\n", + " 4.86237946e+01 1.25752163e+01]\n", + " ...\n", + " [ 6.76848269e+00 8.23165894e+00 2.10253639e+01 ... -3.19590777e-01\n", + " 3.68098617e-01 -1.95310101e-01]\n", + " [ 4.08574820e+00 5.33035660e+00 1.41003275e+01 ... -1.35607815e+00\n", + " 4.06074905e+00 -7.67630756e-01]\n", + " [ 2.03186665e+01 9.77407932e+00 5.06271019e+01 ... -6.80029154e-01\n", + " 4.11142111e+00 -1.86585218e-01]]\n", + "FF:[[-9.44706150e+09 -7.38703309e+09 1.26593966e+10 ... 4.51210527e+10\n", + " -1.39591834e+10 -1.29442304e+10]\n", + " [ 1.14852808e+03 4.39542755e+02 1.07877344e+03 ... -2.42416138e+03\n", + " 2.64504932e+03 4.68633698e+02]\n", + " [ 5.72415771e+01 4.12602005e+01 -2.27318707e+01 ... -3.40787392e+01\n", + " 4.86236725e+01 1.25752039e+01]\n", + " ...\n", + " [ 6.76847696e+00 8.23167515e+00 2.10253181e+01 ... -3.19590837e-01\n", + " 3.68098557e-01 -1.95310280e-01]\n", + " [ 4.08574867e+00 5.33037567e+00 1.41003180e+01 ... -1.35607564e+00\n", + " 4.06074095e+00 -7.67629445e-01]\n", + " [ 2.03186874e+01 9.77407932e+00 5.06271439e+01 ... -6.80029511e-01\n", + " 4.11142349e+00 -1.86585203e-01]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", + "Ok!\n", + "6.640625% mismatch in QK prods softmax out grad\n", + "Ok!\n", + "hf_attn_in: (768, 24)\n", + "[[-5.1505955e+10 -4.7166772e+03 -1.3288132e+02 ... -3.0123844e+00\n", + " -5.5234032e+01 6.0299168e+00]\n", + " [-3.5960029e+10 -5.3263096e+03 -1.9434322e+02 ... -5.6601189e+01\n", + " -1.0787462e+02 -6.0718418e+01]\n", + " [ 4.8131662e+10 1.1578307e+04 1.7744476e+02 ... -5.6970375e+01\n", + " -1.7497168e+01 -7.2297249e+00]\n", + " ...\n", + " [-9.0346426e+08 6.4752144e+03 3.2408417e+02 ... 6.1075470e+01\n", + " 8.5356834e+01 8.3221588e+01]\n", + " [-5.0754217e+09 -2.2929268e+03 -1.4913528e+02 ... 8.6639397e+01\n", + " 1.1156468e+02 1.0695674e+02]\n", + " [ 5.5844772e+09 3.0225920e+03 -6.3137859e+01 ... -6.5270996e+01\n", + " 8.2730171e+01 -1.0107367e+02]]\n", + "ff_attn_in: (768, 24)\n", + "[[-5.15059548e+10 -4.71667773e+03 -1.32881012e+02 ... -3.01225996e+00\n", + " -5.52339973e+01 6.02991867e+00]\n", + " [-3.59600292e+10 -5.32630957e+03 -1.94343079e+02 ... -5.66010437e+01\n", + " -1.07874649e+02 -6.07182846e+01]\n", + " [ 4.81316659e+10 1.15783076e+04 1.77444519e+02 ... -5.69703102e+01\n", + " -1.74972763e+01 -7.22990799e+00]\n", + " ...\n", + " [-9.03455232e+08 6.47521484e+03 3.24083832e+02 ... 6.10753632e+01\n", + " 8.53567886e+01 8.32217255e+01]\n", + " [-5.07543654e+09 -2.29292749e+03 -1.49135025e+02 ... 8.66392517e+01\n", + " 1.11564789e+02 1.06956917e+02]\n", + " [ 5.58446592e+09 3.02259229e+03 -6.31376152e+01 ... 
-6.52709351e+01\n", + " 8.27302551e+01 -1.01073837e+02]]\n", + "7.025824652777778% mismatch in attention input grads\n", + "\n", + "Huggingface checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "Huggingface-FlexFlow checks:\n", + "-- W2 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.mlp.down_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_layers_9_feed_forward_w2_shard_0_output_0\n", + "HF: [-6.33203254e+13 -4.43651289e+13 6.35509366e+13 ... 1.08435585e+02\n", + " 9.42303467e+01 5.89958420e+01]\n", + "FF:[-6.33203296e+13 -4.43651289e+13 6.35509408e+13 ... 1.08435623e+02\n", + " 9.42303467e+01 5.89958954e+01]\n", + "[ True True True ... True True True]\n", + "[ 26 51 66 85 259 262 272 296 298 329 392 415\n", + " 428 482 492 514 526 531 671 731 763 777 893 927\n", + " 984 1105 1184 1206 1418 1541 1548 1572 1577 1613 1619 1643\n", + " 1658 1661 1691 1701 1706 1726 1757 1784 1815 1833 1849 1856\n", + " 1880 1891 1921 1956 1969 2012 2021 2028 2030 2059 2065 2144\n", + " 2149 2183 2210 2238 2292 2342 2357 2384 2414 2495 2531 2565\n", + " 2597 2662 2713 2781 2821 2829 2877 2904 2921 2927 2962 2973\n", + " 3044 3066 3094 3100 3106 3159 3193 3251 3377 3389 3397 3427\n", + " 3436 3570 3594 3703 3729 3770 3772 3780 3811 3840 3842 3860\n", + " 3907 3920 3929 3946 3955 3969 4005 4009 4034 4048 4077 4089\n", + " 4104 4129 4134 4178 4202 4212 4219 4239 4245 4256 4273 4373\n", + " 4407 4463 4464 4465 4481 4511 4537 4541 4543 4549 4597 4599\n", + " 4633 4759 4760 4789 4846 4884 4901 4930 4954 4971 4993 5024\n", + " 5030 5041 5050 5116 5130 5163 5207 5224 5282 5313 5322 5349\n", + " 5363 5403 5410 5412 5454 5543 5581 5590 5654 5673 5784 5821\n", + " 5849 5880 5911 5917 5982 6000 6062 6165 6178 6193 6200 6272\n", + " 6322 6351 6366 6376 6380 6382 6393 6412 6420 6430 6433 6446\n", + " 6476 6482 6488 6490 6519 6527 6540 6556 6563 6567 6577 6600\n", + " 6619 6680 6709 6735 6768 6777 6780 6823 6825 6826 6830 6863\n", + " 6880 6912 6988 7006 7030 7071 7077 7102 7123 7244 7264 7367\n", + " 7389 7390 7434 7451 7452 7455 7505 7532 7539 7589 7598 7620\n", + " 7651 7653 7659 7709 7714 7740 7751 7759 7803 7808 7820 7917\n", + " 7923 7926 7949 7962 7966 7978 8002 8004 8040 8050 8052 8068\n", + " 8180 8223 8250 8253 8265 8341 8344 8375 8376 8386 8449 8468\n", + " 8501 8509 8522 8535 8585 8590 8593 8642 8657 8674 8687 8707\n", + " 8714 8726 8729 8737 8756 8769 8801 8846 8850 8865 8907 8998\n", + " 9018 9043 9059 9066 9083 9093 9098 9130 9131 9165 9189 9216\n", + " 9285 9337 9368 9526 9539 9563 9620 9659 9723 9793 9804 9817\n", + " 9820 9827 9908 9995 10053 10128 10135 10143 10205 10253 10274 10292\n", + " 10300 10311 10327 10356 10406 10441 10491 10494 10551 10562 10563 10634\n", + " 10649 10674 10710 10734 10821 10831 10833 10838 10845 10911 10966 10981\n", + " 10988 10990 10998 11008 11044 11049 11100 11127 11141 11197 11250 11269\n", + " 11285 11308 11361 11383 11437 11460 11494 11502 11511 11522 11546 11557\n", + " 11564 11588 11649 11658 11671 11674 11703 11729 11749 11759 11832 11892\n", + " 11979 11988 12000 12038 12063 12078 12107 12119 12165 12259 12269 12270\n", + " 12347 12369 12386 12415 12475 12518 12566 12569 12574 12652 12693 12792\n", + " 12833 12834 12852 12872 12900 12946 13117 13121 13124 13321 13345 13357\n", + " 13427 13431 13446 13473 13526 13635 13638 13662 
13706 13733 13803 13807\n", + " 13852 13882 13912 13924 13962 13969 13986 14023 14036 14046 14085 14110\n", + " 14130 14141 14175 14183 14191 14220 14222 14223 14285 14310 14331 14336\n", + " 14354 14375 14425 14427 14451 14482 14493 14516 14560 14563 14581 14623\n", + " 14671 14677 14679 14680 14685 14688 14742 14799 14860 14868 14870 14872\n", + " 14900 14909 14916 14940 14964 14991 15003 15023 15027 15033 15038 15051\n", + " 15086 15100 15184 15214 15232 15290 15352 15363 15365 15407 15433 15451\n", + " 15522 15577 15707 15720 15725 15739 15830 15837 15875 15937 15965 15985\n", + " 16017 16054 16113 16136 16142 16169 16191 16232 16238 16250 16268 16282\n", + " 16285 16290 16295 16304 16327 16334 16353 16356 16363 16382 16403 16407\n", + " 16408 16409 16458 16459 16495 16497 16499 16500 16516 16532 16595 16603\n", + " 16611 16657 16678 16680 16695 16701 16704 16754 16768 16807 16818 16856\n", + " 16870 16951 16971 16986 16989 16992 17048 17134 17181 17208 17217 17236\n", + " 17243 17319 17363 17398 17448 17471 17497 17557 17646 17654 17659 17692\n", + " 17754 17947 17957 17969 17975 18029 18128 18146 18196 18206 18207 18250\n", + " 18265 18313 18406]\n", + "Ok!\n", + "Ok!\n", + "-- Lora --\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.mlp.down_proj.lora_B.default.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_layers_9_feed_forward_w2_lora_shard_0_output_0\n", + "HF: [-6.33203254e+13 -4.43651289e+13 6.35509366e+13 ... 1.08435585e+02\n", + " 9.42303467e+01 5.89958420e+01]\n", + "FF:[-6.33203296e+13 -4.43651289e+13 6.35509408e+13 ... 1.08435623e+02\n", + " 9.42303467e+01 5.89958954e+01]\n", + "[ True True True ... True True True]\n", + "[ 26 51 66 85 259 262 272 296 298 329 392 415 428 482 492 514 526 531\n", + " 671 731 763]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.mlp.down_proj.lora_A.default.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_layers_9_feed_forward_w2_lora_shard_0_input_0\n", + "HF: [ 5.0590863e+10 3.7823513e+11 -5.0394451e+11 ... -5.5814421e-01\n", + " 2.2970559e-01 -1.2293311e+00]\n", + "FF:[ 5.05906831e+10 3.78235290e+11 -5.03944544e+11 ... -5.58144033e-01\n", + " 2.29705781e-01 -1.22933090e+00]\n", + "[ True True True ... True True True]\n", + "[ 189 254 317 418 515 546 577 634 636 675 712 808 1011 1030\n", + " 1080 1091 1132 1168 1254 1265 1285 1287 1354 1381 1427 1459 1506 1620\n", + " 1654 1752 1887 1897 1900 1937 1981 1985 1986 2003 2029 2152 2181 2295\n", + " 2395 2426 2445 2673 2687 2859 2947 2977 3037]\n", + "Ok!\n", + "-- W2/W1/W3 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_SigmoidSiluMulti_shard_0_output_0\n", + "HF: [ 2.5211001e+13 -5.6630301e+13 -2.3639437e+13 ... -4.6000423e+01\n", + " 1.2655228e+01 7.1020460e+00]\n", + "FF:[ 2.52109673e+13 -5.66302930e+13 -2.36394182e+13 ... -4.60003510e+01\n", + " 1.26551876e+01 7.10206795e+00]\n", + "[ True True True ... 
True True True]\n", + "[ 9 49 113 174 243 267 271 288 323 335 397 399 438 439\n", + " 457 475 506 568 569 652 680 689 715 735 739 758 766 777\n", + " 785 837 842 852 865 884 893 919 930 932 936 939 957 1018\n", + " 1095 1105 1112 1114 1129 1168 1217 1220 1229 1230 1233 1237 1283 1304\n", + " 1354 1453 1532 1542 1547 1550 1592 1597 1603 1615 1647 1679 1698 1699\n", + " 1712 1770 1819 1835 1875 1977 2007 2016 2039 2066 2078 2102 2153 2245\n", + " 2403 2447 2621 2698 2704 2728 2736 2743 2774 2792 2836 2858 2870 2881\n", + " 2932 2948 3018 3034 3066]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_layers_9_feed_forward_w2_shard_0_input_0\n", + "HF: [ 2.5211001e+13 -5.6630301e+13 -2.3639437e+13 ... -4.6000423e+01\n", + " 1.2655228e+01 7.1020460e+00]\n", + "FF:[ 2.52109673e+13 -5.66302930e+13 -2.36394182e+13 ... -4.60003510e+01\n", + " 1.26551876e+01 7.10206795e+00]\n", + "[ True True True ... True True True]\n", + "[ 9 49 113 174 243 267 271 288 323 335 397 399 438 439\n", + " 457 475 506 568 569 652 680 689 715 735 739 758 766 777\n", + " 785 837 842 852 865 884 893 919 930 932 936 939 957 1018\n", + " 1095 1105 1112 1114 1129 1168 1217 1220 1229 1230 1233 1237 1283 1304\n", + " 1354 1453 1532 1542 1547 1550 1592 1597 1603 1615 1647 1679 1698 1699\n", + " 1712 1770 1819 1835 1875 1977 2007 2016 2039 2066 2078 2102 2153 2245\n", + " 2403 2447 2621 2698 2704 2728 2736 2743 2774 2792 2836 2858 2870 2881\n", + " 2932 2948 3018 3034 3066]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "-- Attention --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.self_attn.o_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_layers_9_attention_shard_0_output_0\n", + "HF: [-6.3320325e+13 -4.4365129e+13 6.3550937e+13 ... 7.2449814e+01\n", + " 8.6617142e+01 8.3981407e+01]\n", + "FF:[-6.33203296e+13 -4.43651289e+13 6.35509408e+13 ... 7.24498901e+01\n", + " 8.66170959e+01 8.39814606e+01]\n", + "[ True True True ... True True True]\n", + "[ 26 51 66 85 259 262 272 296 298 329 392 415 428 482 492 514 526 531\n", + " 671 731 763]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.self_attn.o_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_layers_9_attention_shard_0_o_proj_in_grad\n", + "HF: [ 7.2885461e+13 -6.0835821e+13 -7.9732612e+13 ... 2.5297220e+02\n", + " -8.1722275e+01 -7.0014725e+01]\n", + "FF:[ 7.28854608e+13 -6.08357832e+13 -7.97326201e+13 ... 2.52972260e+02\n", + " -8.17222137e+01 -7.00146637e+01]\n", + "[ True True True ... True True True]\n", + "[ 6 36 43 55 60 82 101 110 117 217 221 229 236 256 289 392 421 429\n", + " 433 454 486 518 523 565 568 629 639 648 707 725 744]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[ 7.28854608e+13 6.37500977e+02 2.96775421e+02 ... 8.35403061e+01\n", + " 1.72460327e+02 2.90482426e+01]\n", + " [-6.08358210e+13 -5.23222847e+01 -2.34542664e+02 ... -1.87500763e+01\n", + " -8.99429398e+01 8.64021378e+01]\n", + " [-7.97326117e+13 -4.24736328e+02 -1.82208099e+02 ... 3.21808720e+00\n", + " -5.87415466e+01 -2.08511108e+02]\n", + " ...\n", + " [-1.13411917e+14 -3.48418640e+02 1.52205795e+02 ... 
1.51519928e+02\n", + " 2.45651031e+02 2.52972198e+02]\n", + " [-3.75985275e+12 2.39696625e+02 1.51989685e+02 ... -2.85605354e+01\n", + " -1.79121232e+00 -8.17222748e+01]\n", + " [ 1.11016038e+14 -1.96372967e+01 -1.27668396e+02 ... 3.35008011e+01\n", + " -7.46116943e+01 -7.00147247e+01]]\n", + "FF:[[ 7.28854608e+13 6.37500977e+02 2.96775513e+02 ... 8.35403976e+01\n", + " 1.72460068e+02 2.90483646e+01]\n", + " [-6.08357832e+13 -5.23225098e+01 -2.34542755e+02 ... -1.87501526e+01\n", + " -8.99431992e+01 8.64022217e+01]\n", + " [-7.97326201e+13 -4.24736572e+02 -1.82207733e+02 ... 3.21793270e+00\n", + " -5.87416573e+01 -2.08511139e+02]\n", + " ...\n", + " [-1.13411925e+14 -3.48418640e+02 1.52205902e+02 ... 1.51519714e+02\n", + " 2.45650864e+02 2.52972260e+02]\n", + " [-3.75988630e+12 2.39696686e+02 1.51989319e+02 ... -2.85606136e+01\n", + " -1.79138493e+00 -8.17222137e+01]\n", + " [ 1.11016046e+14 -1.96372318e+01 -1.27668480e+02 ... 3.35009079e+01\n", + " -7.46116791e+01 -7.00146637e+01]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[ 6 36 43 55 60 82 101 110 117 217 221 229 236 256 289 392 421 429\n", + " 433 454 486 518 523 565 568 629 639 648 707 725 744]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[ 7.2885461e+13 -6.0835821e+13 -7.9732612e+13 ... -1.1341192e+14\n", + " -3.7598527e+12 1.1101604e+14]\n", + " [ 3.3241980e+03 -6.3044128e+02 -3.0447307e+03 ... 3.0137921e+02\n", + " 3.8262988e+02 -4.2889914e+02]\n", + " [ 3.5639046e+01 -1.6155790e+01 -2.4461178e+01 ... 2.7450909e+02\n", + " 1.6181946e+02 -2.5407137e+02]\n", + " ...\n", + " [ 4.6487908e+00 -9.6633381e-01 -2.7078497e-01 ... 3.6374569e+01\n", + " -1.7563061e+00 -7.1206141e+00]\n", + " [ 1.8901447e+00 8.9006472e-01 -4.3125896e+00 ... 2.6014965e+01\n", + " -3.7720141e-01 -7.8855257e+00]\n", + " [ 1.9513500e+00 5.8041654e+00 -1.4006979e+01 ... 7.2743622e+01\n", + " -2.3499712e+01 -2.0133139e+01]]\n", + "FF:[[ 7.28854608e+13 -6.08357832e+13 -7.97326201e+13 ... -1.13411925e+14\n", + " -3.75988630e+12 1.11016046e+14]\n", + " [ 3.32419922e+03 -6.30442505e+02 -3.04472998e+03 ... 3.01379364e+02\n", + " 3.82629669e+02 -4.28898712e+02]\n", + " [ 3.56390572e+01 -1.61558037e+01 -2.44611683e+01 ... 2.74509308e+02\n", + " 1.61819229e+02 -2.54071594e+02]\n", + " ...\n", + " [ 4.64879847e+00 -9.66338813e-01 -2.70792574e-01 ... 3.63745117e+01\n", + " -1.75632846e+00 -7.12060070e+00]\n", + " [ 1.89013767e+00 8.90062451e-01 -4.31257772e+00 ... 2.60149212e+01\n", + " -3.77217919e-01 -7.88551569e+00]\n", + " [ 1.95135939e+00 5.80417490e+00 -1.40069904e+01 ... 7.27435226e+01\n", + " -2.34996586e+01 -2.01330910e+01]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", + "Ok!\n", + "7.609953703703703% mismatch in QK prods softmax out grad\n", + "Ok!\n", + "hf_attn_in: (768, 24)\n", + "[[-1.17282076e+14 -2.12461621e+03 8.80099030e+01 ... 4.34470520e+01\n", + " 7.55885468e+01 -2.88791332e+01]\n", + " [-2.07757936e+14 -3.81796265e+02 -2.33774780e+02 ... 
8.11984329e+01\n", + " -4.41825638e+01 7.35064125e+00]\n", + " [ 4.11484165e+13 2.50572113e+02 1.91601822e+02 ... 1.00269365e+01\n", + " -3.41638985e+01 1.20433075e+02]\n", + " ...\n", + " [ 7.95562329e+13 1.55007373e+03 1.70351212e+02 ... -1.80320053e+01\n", + " 8.77533417e+01 2.14678173e+01]\n", + " [-1.86546485e+14 -5.18847070e+03 -3.34331085e+02 ... 2.51586838e+01\n", + " -4.06135368e+01 -6.27860641e+00]\n", + " [ 1.89751705e+14 -3.09853809e+03 -1.18278351e+01 ... -1.24640663e+02\n", + " 1.59719009e+01 -6.47173615e+01]]\n", + "ff_attn_in: (768, 24)\n", + "[[-1.17282034e+14 -2.12461694e+03 8.80101547e+01 ... 4.34468918e+01\n", + " 7.55886002e+01 -2.88791542e+01]\n", + " [-2.07757920e+14 -3.81795776e+02 -2.33774765e+02 ... 8.11985397e+01\n", + " -4.41825829e+01 7.35066986e+00]\n", + " [ 4.11484543e+13 2.50570099e+02 1.91601196e+02 ... 1.00270777e+01\n", + " -3.41638451e+01 1.20433121e+02]\n", + " ...\n", + " [ 7.95562413e+13 1.55007288e+03 1.70350784e+02 ... -1.80321960e+01\n", + " 8.77533112e+01 2.14678249e+01]\n", + " [-1.86546469e+14 -5.18847070e+03 -3.34331268e+02 ... 2.51588135e+01\n", + " -4.06132622e+01 -6.27861023e+00]\n", + " [ 1.89751521e+14 -3.09853711e+03 -1.18275299e+01 ... -1.24640862e+02\n", + " 1.59719791e+01 -6.47173767e+01]]\n", + "7.530381944444445% mismatch in attention input grads\n", + "\n", + "Huggingface checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "Huggingface-FlexFlow checks:\n", + "-- W2 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.mlp.down_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_layers_8_feed_forward_w2_shard_0_output_0\n", + "HF: [-1.3223293e+17 -2.3794983e+17 4.7027590e+16 ... 7.7873253e+01\n", + " 8.6085976e+01 6.8200005e+01]\n", + "FF:[-1.32232886e+17 -2.37949812e+17 4.70276284e+16 ... 7.78733292e+01\n", + " 8.60859299e+01 6.82000580e+01]\n", + "[ True True True ... 
True True True]\n", + "[ 3 24 66 71 94 95 124 134 141 150 163 181\n", + " 226 261 284 318 320 378 382 385 391 395 403 422\n", + " 434 495 515 523 524 549 579 610 644 710 764 772\n", + " 870 984 987 1045 1249 1330 1362 1489 1517 1550 1556 1588\n", + " 1595 1659 1672 1684 1689 1768 1792 1799 1808 1818 1842 1871\n", + " 1889 1899 1910 1915 1925 1936 1993 1997 2033 2041 2059 2062\n", + " 2066 2098 2111 2124 2129 2130 2146 2153 2159 2166 2197 2206\n", + " 2210 2212 2222 2234 2237 2320 2321 2357 2359 2362 2385 2428\n", + " 2518 2539 2553 2568 2598 2683 2689 2694 2711 2714 2733 2787\n", + " 2788 2795 2811 2815 2853 2881 2890 2917 2981 2997 3021 3037\n", + " 3089 3149 3163 3191 3196 3217 3225 3248 3277 3287 3292 3305\n", + " 3327 3361 3385 3402 3417 3425 3456 3479 3516 3521 3528 3555\n", + " 3587 3599 3608 3684 3702 3733 3770 3779 3819 3822 3823 3898\n", + " 3921 3942 3950 4012 4053 4077 4086 4091 4139 4185 4198 4225\n", + " 4241 4296 4347 4349 4368 4403 4407 4418 4453 4471 4472 4473\n", + " 4494 4537 4549 4555 4558 4598 4623 4648 4666 4698 4729 4782\n", + " 4848 4866 4886 4943 4959 5008 5010 5012 5057 5079 5177 5178\n", + " 5186 5211 5271 5281 5296 5313 5328 5356 5364 5409 5429 5440\n", + " 5453 5455 5457 5476 5529 5563 5591 5621 5625 5631 5654 5661\n", + " 5692 5705 5720 5740 5751 5758 5787 5799 5813 5835 5836 5867\n", + " 5872 5893 5953 5974 5980 5982 6000 6055 6082 6086 6102 6107\n", + " 6123 6159 6172 6193 6220 6230 6231 6263 6286 6297 6362 6396\n", + " 6401 6430 6436 6485 6497 6499 6502 6510 6537 6554 6555 6563\n", + " 6564 6579 6586 6598 6615 6625 6626 6649 6651 6661 6754 6764\n", + " 6776 6852 6863 6874 6883 6892 6913 6945 6969 7036 7057 7066\n", + " 7082 7138 7147 7150 7157 7197 7202 7231 7234 7235 7240 7270\n", + " 7278 7287 7322 7327 7345 7348 7361 7390 7402 7490 7539 7573\n", + " 7610 7714 7721 7758 7794 7812 7827 7829 7837 7839 7882 7894\n", + " 7943 7948 7952 7969 7975 7996 8024 8027 8037 8043 8055 8078\n", + " 8079 8088 8090 8095 8154 8258 8264 8283 8297 8313 8329 8336\n", + " 8359 8361 8376 8383 8416 8421 8428 8454 8475 8502 8521 8613\n", + " 8642 8653 8696 8756 8764 8777 8791 8837 8849 8859 8878 8955\n", + " 8991 8997 9006 9012 9040 9066 9093 9097 9098 9131 9158 9162\n", + " 9165 9214 9216 9280 9297 9301 9316 9355 9371 9412 9421 9475\n", + " 9510 9580 9620 9645 9696 9713 9732 9768 9802 9817 9819 9826\n", + " 9839 9846 9947 10004 10062 10065 10072 10103 10107 10108 10138 10167\n", + " 10173 10228 10262 10292 10326 10356 10360 10372 10421 10446 10466 10468\n", + " 10499 10505 10513 10517 10589 10606 10612 10645 10664 10669 10726 10777\n", + " 10835 10838 10839 10848 10855 10877 10897 10941 10963 10971 10977 10997\n", + " 11030 11060 11065 11076 11088 11140 11167 11174 11231 11252 11257 11259\n", + " 11275 11297 11302 11319 11331 11333 11357 11358 11380 11382 11402 11423\n", + " 11446 11447 11500 11501 11522 11585 11623 11670 11728 11736 11759 11761\n", + " 11772 11785 11839 11894 11916 11924 11936 11962 11968 11969 11977 11984\n", + " 12008 12030 12054 12074 12123 12175 12182 12194 12237 12262 12282 12285\n", + " 12341 12348 12351 12370 12376 12386 12399 12449 12507 12513 12518 12522\n", + " 12549 12572 12643 12648 12663 12689 12696 12710 12769 12780 12788 12792\n", + " 12793 12852 12864 12879 12884 12985 13018 13041 13057 13176 13264 13272\n", + " 13274 13275 13292 13303 13333 13379 13427 13428 13442 13451 13454 13500\n", + " 13510 13533 13564 13588 13607 13640 13655 13686 13687 13688 13732 13747\n", + " 13786 13801 13803 13826 13841 13846 13850 13892 13909 13946 14036 14040\n", + 
" 14046 14060 14080 14152 14161 14183 14195 14210 14240 14278 14331 14354\n", + " 14370 14372 14386 14395 14409 14432 14434 14497 14506 14531 14559 14589\n", + " 14648 14663 14686 14698 14715 14743 14757 14799 14808 14810 14849 14893\n", + " 14902 14929 14937 14947 14953 14958 15005 15012 15018 15036 15066 15069\n", + " 15083 15152 15154 15196 15197 15212 15292 15309 15323 15340 15343 15375\n", + " 15389 15396 15408 15410 15454 15499 15532 15557 15605 15647 15677 15736\n", + " 15745 15756 15769 15809 15824 15876 15882 15900 15906 15941 16027 16030\n", + " 16040 16116 16190 16192 16205 16207 16239 16279 16285 16295 16348 16358\n", + " 16367 16384 16386 16394 16399 16455 16457 16458 16471 16495 16500 16502\n", + " 16520 16541 16542 16598 16623 16643 16651 16665 16673 16679 16713 16725\n", + " 16734 16736 16739 16751 16756 16768 16861 16870 16939 16976 17007 17028\n", + " 17040 17069 17087 17108 17125 17139 17151 17158 17174 17175 17178 17182\n", + " 17189 17221 17258 17341 17360 17370 17381 17395 17396 17415 17432 17450\n", + " 17463 17470 17472 17473 17496 17507 17536 17608 17626 17627 17649 17653\n", + " 17664 17771 17815 17822 17831 17864 17883 17931 17994 17999 18035 18174\n", + " 18209 18250 18274 18307 18327 18403 18423]\n", + "Ok!\n", + "Ok!\n", + "-- Lora --\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.mlp.down_proj.lora_B.default.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_layers_8_feed_forward_w2_lora_shard_0_output_0\n", + "HF: [-1.3223293e+17 -2.3794983e+17 4.7027590e+16 ... 7.7873253e+01\n", + " 8.6085976e+01 6.8200005e+01]\n", + "FF:[-1.32232886e+17 -2.37949812e+17 4.70276284e+16 ... 7.78733292e+01\n", + " 8.60859299e+01 6.82000580e+01]\n", + "[ True True True ... True True True]\n", + "[ 3 24 66 71 94 95 124 134 141 150 163 181 226 261 284 318 320 378\n", + " 382 385 391 395 403 422 434 495 515 523 524 549 579 610 644 710 764]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.mlp.down_proj.lora_A.default.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_layers_8_feed_forward_w2_lora_shard_0_input_0\n", + "HF: [ 6.5550952e+14 4.9376585e+14 3.8510841e+14 ... 1.6802770e+00\n", + " -1.1248941e+00 -1.1701980e+00]\n", + "FF:[ 6.55509317e+14 4.93765882e+14 3.85108377e+14 ... 1.68027747e+00\n", + " -1.12489426e+00 -1.17019880e+00]\n", + "[ True True True ... 
True True True]\n", + "[ 6 79 111 149 155 168 187 195 220 223 252 261 329 343\n", + " 347 369 386 392 403 438 439 450 461 524 535 643 656 659\n", + " 661 668 722 727 732 742 754 801 816 820 835 837 849 850\n", + " 978 993 997 1012 1019 1034 1044 1071 1088 1094 1114 1135 1151 1170\n", + " 1190 1212 1273 1275 1277 1289 1290 1308 1311 1337 1364 1379 1394 1430\n", + " 1454 1460 1469 1474 1703 1725 1728 1732 1733 1741 1754 1757 1804 1806\n", + " 1856 1862 1932 1945 1996 2030 2044 2045 2065 2071 2075 2094 2149 2152\n", + " 2163 2180 2182 2215 2254 2357 2362 2370 2392 2398 2428 2484 2519 2521\n", + " 2524 2582 2618 2641 2645 2664 2674 2681 2691 2735 2747 2779 2872 2899\n", + " 2909 2935 2957 3000 3033]\n", + "Ok!\n", + "-- W2/W1/W3 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_SigmoidSiluMulti_shard_0_output_0\n", + "HF: [-1.3871785e+17 -8.3164397e+16 4.9509505e+16 ... 4.3806694e+01\n", + " 9.4386072e+00 -2.4460859e+01]\n", + "FF:[-1.38717840e+17 -8.31644654e+16 4.95094495e+16 ... 4.38065948e+01\n", + " 9.43864822e+00 -2.44608364e+01]\n", + "[ True True True ... True True True]\n", + "[ 80 83 172 173 176 184 215 285 329 338 341 395 403 465\n", + " 468 565 572 601 614 636 639 651 660 749 750 806 828 844\n", + " 873 952 971 988 992 1014 1082 1083 1085 1123 1152 1195 1200 1227\n", + " 1391 1397 1462 1546 1548 1563 1584 1629 1704 1706 1759 1764 1820 1833\n", + " 1851 1857 1864 1899 1929 1943 1958 1967 1980 1985 2002 2030 2069 2076\n", + " 2120 2127 2130 2157 2180 2187 2195 2212 2243 2249 2256 2299 2393 2505\n", + " 2516 2525 2546 2562 2604 2702 2712 2731 2745 2764 2789 2821 2873 2915\n", + " 2936 2945 2951 3013 3016]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_layers_8_feed_forward_w2_shard_0_input_0\n", + "HF: [-1.3871785e+17 -8.3164397e+16 4.9509505e+16 ... 4.3806694e+01\n", + " 9.4386072e+00 -2.4460859e+01]\n", + "FF:[-1.38717840e+17 -8.31644654e+16 4.95094495e+16 ... 4.38065948e+01\n", + " 9.43864822e+00 -2.44608364e+01]\n", + "[ True True True ... True True True]\n", + "[ 80 83 172 173 176 184 215 285 329 338 341 395 403 465\n", + " 468 565 572 601 614 636 639 651 660 749 750 806 828 844\n", + " 873 952 971 988 992 1014 1082 1083 1085 1123 1152 1195 1200 1227\n", + " 1391 1397 1462 1546 1548 1563 1584 1629 1704 1706 1759 1764 1820 1833\n", + " 1851 1857 1864 1899 1929 1943 1958 1967 1980 1985 2002 2030 2069 2076\n", + " 2120 2127 2130 2157 2180 2187 2195 2212 2243 2249 2256 2299 2393 2505\n", + " 2516 2525 2546 2562 2604 2702 2712 2731 2745 2764 2789 2821 2873 2915\n", + " 2936 2945 2951 3013 3016]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "-- Attention --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.self_attn.o_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_layers_8_attention_shard_0_output_0\n", + "HF: [-1.3223293e+17 -2.3794983e+17 4.7027590e+16 ... 3.5121140e+01\n", + " -3.5587997e+00 9.5641022e+01]\n", + "FF:[-1.32232886e+17 -2.37949812e+17 4.70276284e+16 ... 3.51211472e+01\n", + " -3.55898285e+00 9.56410980e+01]\n", + "[ True True True ... 
True True True]\n", + "[ 3 24 66 71 94 95 124 134 141 150 163 181 226 261 284 318 320 378\n", + " 382 385 391 395 403 422 434 495 515 523 524 549 579 610 644 710 764]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.self_attn.o_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_layers_8_attention_shard_0_o_proj_in_grad\n", + "HF: [-1.6186993e+17 -3.5698813e+17 3.4442975e+16 ... -2.5844165e+02\n", + " 2.0677340e+01 -2.4573349e+01]\n", + "FF:[-1.61869621e+17 -3.56988336e+17 3.44430865e+16 ... -2.58441467e+02\n", + " 2.06775093e+01 -2.45735531e+01]\n", + "[ True True True ... True True True]\n", + "[ 93 99 114 137 141 142 160 193 235 259 269 299 307 316 350 364 400 523\n", + " 608 702 720 731 759]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[-1.6186993e+17 -2.1968115e+02 8.5754425e+01 ... -6.9909119e+01\n", + " -2.6478451e+01 -7.4195160e+01]\n", + " [-3.5698813e+17 3.9582391e+02 5.5431940e+02 ... 1.9529277e+02\n", + " 1.2558211e+02 6.7965935e+01]\n", + " [ 3.4442975e+16 2.8310864e+02 -8.1522171e+01 ... -2.3606525e+01\n", + " -2.0410315e+01 -1.5228156e+02]\n", + " ...\n", + " [ 4.0923264e+16 -2.4507169e+02 -8.2614380e+02 ... -2.6583340e+02\n", + " -1.9878247e+02 -2.5844165e+02]\n", + " [ 6.9156258e+17 1.3969666e+02 -7.5639044e+02 ... -1.5231053e+02\n", + " -3.3650037e+02 2.0677340e+01]\n", + " [ 9.9511712e+16 -3.2348724e+01 3.0624988e+02 ... 1.0391423e+02\n", + " 6.0626881e+01 -2.4573349e+01]]\n", + "FF:[[-1.61869621e+17 -2.19681122e+02 8.57541504e+01 ... -6.99092026e+01\n", + " -2.64783611e+01 -7.41952515e+01]\n", + " [-3.56988336e+17 3.95823853e+02 5.54319275e+02 ... 1.95292725e+02\n", + " 1.25582062e+02 6.79659348e+01]\n", + " [ 3.44430865e+16 2.83108551e+02 -8.15224686e+01 ... -2.36064014e+01\n", + " -2.04101429e+01 -1.52281570e+02]\n", + " ...\n", + " [ 4.09233933e+16 -2.45071564e+02 -8.26143555e+02 ... -2.65833405e+02\n", + " -1.98782272e+02 -2.58441467e+02]\n", + " [ 6.91562577e+17 1.39696579e+02 -7.56390808e+02 ... -1.52310455e+02\n", + " -3.36500092e+02 2.06775093e+01]\n", + " [ 9.95114373e+16 -3.23486938e+01 3.06250122e+02 ... 1.03914482e+02\n", + " 6.06264191e+01 -2.45735531e+01]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[ 93 99 114 137 141 142 160 193 235 259 269 299 307 316 350 364 400 523\n", + " 608 702 720 731 759]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[-1.6186993e+17 -3.5698813e+17 3.4442975e+16 ... 4.0923264e+16\n", + " 6.9156258e+17 9.9511712e+16]\n", + " [-5.3483575e+02 2.6249797e+03 -6.7268573e+02 ... -6.1204077e+03\n", + " -4.3047915e+03 -9.5139771e+01]\n", + " [-1.2200641e+01 1.0347147e+02 -2.6777636e+01 ... -1.4766699e+02\n", + " -9.8514114e+01 1.2616925e+01]\n", + " ...\n", + " [-3.2097631e+00 9.1431990e+00 -1.6333975e+00 ... -6.9996667e+00\n", + " -6.4008064e+00 1.9126304e+00]\n", + " [-3.0982289e+00 1.2355285e+01 -3.1715555e+00 ... -4.6754313e+00\n", + " -6.2553053e+00 1.0515085e+00]\n", + " [-2.9516125e+00 2.7038031e+00 -6.0580249e+00 ... -1.6555168e+01\n", + " 1.3245420e+00 -1.5741113e+00]]\n", + "FF:[[-1.61869621e+17 -3.56988336e+17 3.44430865e+16 ... 
4.09233933e+16\n", + " 6.91562577e+17 9.95114373e+16]\n", + " [-5.34834961e+02 2.62497900e+03 -6.72686401e+02 ... -6.12040576e+03\n", + " -4.30479297e+03 -9.51402283e+01]\n", + " [-1.22006664e+01 1.03471611e+02 -2.67777309e+01 ... -1.47666946e+02\n", + " -9.85141525e+01 1.26169167e+01]\n", + " ...\n", + " [-3.20977211e+00 9.14321709e+00 -1.63339353e+00 ... -6.99966621e+00\n", + " -6.40081263e+00 1.91262615e+00]\n", + " [-3.09821057e+00 1.23552399e+01 -3.17152786e+00 ... -4.67541933e+00\n", + " -6.25528765e+00 1.05149710e+00]\n", + " [-2.95161533e+00 2.70380235e+00 -6.05802393e+00 ... -1.65551491e+01\n", + " 1.32455230e+00 -1.57412362e+00]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", + "Ok!\n", + "8.101851851851851% mismatch in QK prods softmax out grad\n", + "Ok!\n", + "hf_attn_in: (768, 24)\n", + "[[-7.3778828e+16 1.0956941e+03 1.1773144e+02 ... -4.0466427e+01\n", + " -3.1198654e+01 -1.7603550e+01]\n", + " [-1.2087128e+18 6.9384756e+03 6.1327003e+01 ... 1.5329468e+01\n", + " 7.6757736e+00 -4.5589094e+00]\n", + " [-6.7892266e+17 5.4895034e+03 7.6927376e+01 ... 9.1396770e+00\n", + " 2.3195824e+01 -6.1995559e+00]\n", + " ...\n", + " [ 2.6452032e+17 9.9761787e+03 2.2349066e+02 ... 5.7504387e+01\n", + " -8.6791611e-01 4.6890911e+01]\n", + " [-6.7528534e+16 3.3856902e+03 2.5189743e+02 ... 2.2824722e+01\n", + " 8.7917282e+01 -2.1569672e+01]\n", + " [-2.1779064e+17 5.2511855e+03 6.6282043e+01 ... 9.9689598e+00\n", + " -5.5022659e+00 -3.2573143e+01]]\n", + "ff_attn_in: (768, 24)\n", + "[[-7.37791458e+16 1.09569678e+03 1.17731285e+02 ... -4.04664154e+01\n", + " -3.11988506e+01 -1.76035423e+01]\n", + " [-1.20871251e+18 6.93847900e+03 6.13275528e+01 ... 1.53295393e+01\n", + " 7.67594433e+00 -4.55900288e+00]\n", + " [-6.78922523e+17 5.48950342e+03 7.69272308e+01 ... 9.13961220e+00\n", + " 2.31957569e+01 -6.19959354e+00]\n", + " ...\n", + " [ 2.64520284e+17 9.97617871e+03 2.23490509e+02 ... 5.75044785e+01\n", + " -8.67943764e-01 4.68908234e+01]\n", + " [-6.75287400e+16 3.38569165e+03 2.51897339e+02 ... 2.28247147e+01\n", + " 8.79171448e+01 -2.15696106e+01]\n", + " [-2.17790679e+17 5.25118652e+03 6.62821960e+01 ... 9.96885872e+00\n", + " -5.50213098e+00 -3.25731125e+01]]\n", + "9.809027777777777% mismatch in attention input grads\n", + "\n", + "Huggingface checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "Huggingface-FlexFlow checks:\n", + "-- W2 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.7.mlp.down_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_7_layers_7_feed_forward_w2_shard_0_output_0\n", + "HF: [-7.5522525e+19 -1.3283726e+21 -7.2549753e+20 ... 4.9017162e+01\n", + " -9.7436657e+00 8.5870697e+01]\n", + "FF:[-7.55228501e+19 -1.32837218e+21 -7.25497390e+20 ... 4.90171394e+01\n", + " -9.74382782e+00 8.58707886e+01]\n", + "[ True True True ... True False True]\n", + "[ 19 64 75 ... 
18418 18428 18430]\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[23], line 95\u001b[0m\n\u001b[1;32m 93\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mHuggingface-FlexFlow checks:\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 94\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m-- W2 --\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 95\u001b[0m \u001b[43mcompare_tensors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhf_BWD_w2_out\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mff_BWD_w2_out\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtolerance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1e-5\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 96\u001b[0m compare_tensors(hf_w2_weight, ff_w2_weight, tolerance\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1e-5\u001b[39m)\n\u001b[1;32m 98\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m-- Lora --\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/Desktop/FlexFlow/tests/peft/align_test_utils.py:47\u001b[0m, in \u001b[0;36mcompare_tensors\u001b[0;34m(hf_tensor_filepath, ff_tensor_filepath, tolerance)\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[38;5;28mprint\u001b[39m(mismatches)\n\u001b[1;32m 43\u001b[0m \u001b[38;5;66;03m#print(np.nonzero(hf_tensor)[0])\u001b[39;00m\n\u001b[1;32m 44\u001b[0m \u001b[38;5;66;03m# print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0])\u001b[39;00m\n\u001b[1;32m 45\u001b[0m \u001b[38;5;66;03m# print(ff_tensor[36], hf_tensor[36])\u001b[39;00m\n\u001b[1;32m 46\u001b[0m \u001b[38;5;66;03m#assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\u001b[39;00m\n\u001b[0;32m---> 47\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(\u001b[38;5;28mlen\u001b[39m(mismatches) \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m.05\u001b[39m\u001b[38;5;241m*\u001b[39mlen_hf_tensor)\n\u001b[1;32m 48\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOk!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "attention_tests=True\n", + "for i in range(tot_num_layers-1, -1, -1):\n", + " # HuggingFace filepaths\n", + " hf_BWD_norm_in = f\"{hf_path}/bwd_step_0_norm.gi_0\"\n", + " hf_BWD_loraB_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_B.default.go_0\"\n", + " hf_BWD_loraB_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_B.default.gi_0\"\n", + " hf_BWD_loraA_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_A.default.go_0\"\n", + " hf_BWD_loraA_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_A.default.gi_0\"\n", + " hf_loraA_weight = f\"{hf_path}/layers.{i}.mlp.down_proj.lora_A.default.weight\"\n", + " hf_loraB_weight = f\"{hf_path}/layers.{i}.mlp.down_proj.lora_B.default.weight\"\n", + " hf_BWD_lora_dropout_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_dropout.default.go_0\"\n", + " hf_BWD_lora_dropout_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_dropout.default.gi_0\"\n", + " hf_BWD_w2_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.go_0\"\n", + " 
hf_BWD_w2_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.gi_0\"\n", + " hf_w2_weight = f\"{hf_path}/layers.{i}.mlp.down_proj.weight\"\n", + " hf_BWD_w3_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.up_proj.go_0\"\n", + " hf_BWD_w3_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.up_proj.gi_0\"\n", + " hf_BWD_w1_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.gate_proj.go_0\"\n", + " hf_BWD_w1_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.gate_proj.gi_0\"\n", + " hf_BWD_act_fn_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.act_fn.gi_0\"\n", + " hf_BWD_act_fn_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.act_fn.go_0\"\n", + " hf_BWD_ffn_norm_out = f\"{hf_path}/bwd_step_0_layers.{i}.post_attention_layernorm.go_0\"\n", + " hf_BWD_ffn_norm_in = f\"{hf_path}/bwd_step_0_layers.{i}.post_attention_layernorm.gi_0\"\n", + " hf_BWD_attn_out_out = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.o_proj.go_0\"\n", + " hf_BWD_attn_q_in = f\"{hf_path}/bwd_step_0_layers.11.self_attn.q_proj.gi_0\"\n", + " hf_FWD_w1_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.gate_proj.output_0\"\n", + " hf_FWD_w3_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.up_proj.output_0\"\n", + " hf_FWD_act_fn_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.act_fn.output_0\"\n", + " hf_BWD_attn_oproj_in = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.o_proj.gi_0\"\n", + " hf_attn_qproj_weight = f\"{hf_path}/layers.{i}.self_attn.q_proj.weight\"\n", + " hf_attn_kproj_weight = f\"{hf_path}/layers.{i}.self_attn.k_proj.weight\"\n", + " hf_attn_vproj_weight = f\"{hf_path}/layers.{i}.self_attn.v_proj.weight\"\n", + " hf_attn_oproj_weight = f\"{hf_path}/layers.{i}.self_attn.o_proj.weight\"\n", + " \n", + " # FlexFlow filepaths\n", + " ff_BWD_w2_out = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_shard_0_output_0\"\n", + " ff_BWD_w2_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_shard_0_input_0\"\n", + " ff_BWD_w2_in_pre = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_shard_0_pre_input_0\"\n", + " ff_w2_weight = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_shard_0_weight_0\"\n", + " ff_BWD_ssm_out = f\"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_output_0\"\n", + " ff_BWD_ssm_in1 = f\"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_input_0\"\n", + " ff_BWD_ssm_in2 = f\"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_input_1\"\n", + " ff_BWD_w3_out = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w3_shard_0_output_0\"\n", + " ff_BWD_w3_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w3_shard_0_input_0\"\n", + " ff_BWD_lora_A_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_input_0\"\n", + " ff_BWD_lora_B_out = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_output_0\"\n", + " ff_lora_A_weight = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_weight_A\"\n", + " ff_lora_B_weight = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_weight_B\"\n", + " ff_BWD_w1_out = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w1_shard_0_output_0\"\n", + " ff_BWD_w1_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w1_shard_0_input_0\"\n", + " ff_BWD_w1_in_pre = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w1_shard_0_pre_input_0\"\n", + " ff_w1_weight = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w1_shard_0_weight_0\"\n", + " ff_BWD_ffn_norm_in1 = 
f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_ffn_norm_shard_0_input_0\"\n", + " ff_BWD_ffn_norm_in2 = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_ffn_norm_shard_0_input_1\"\n", + " ff_BWD_ffn_norm_out = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_ffn_norm_shard_0_output_0\"\n", + " ff_BWD_attn_out = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_output_0\"\n", + " ff_BWD_attn_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_input_0\"\n", + " ff_BWD_ssm_cached_w1_input = f\"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_cached_w1_output\"\n", + " ff_BWD_ssm_cached_w3_input = f\"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_cached_w3_output\"\n", + " ff_FWD_w1_out = f\"{ff_path}/fwd_step_0_layers_0_layers_0_feed_forward_w1_shard_0_output_0\"\n", + " ff_FWD_w3_out = f\"{ff_path}/fwd_step_0_layers_0_layers_0_feed_forward_w3_shard_0_output_0\"\n", + " ff_FWD_act_fnc_out = f\"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_act_fn_output\"\n", + " ff_BWD_attn_o_proj_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_o_proj_in_grad\"\n", + " ff_attn_oproj_weight = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_attention_shard_0_weight_0\"\n", + " \n", + " \n", + " # HuggingFace checks\n", + " print(\"\\nHuggingface checks:\")\n", + " if i == tot_num_layers-1:\n", + " compare_hf_tensors(hf_BWD_norm_in, hf_BWD_loraB_out)\n", + " compare_hf_tensors(hf_BWD_norm_in, hf_BWD_w2_out)\n", + " compare_hf_tensors(hf_BWD_loraB_out, hf_BWD_w2_out)\n", + " compare_hf_tensors(hf_BWD_loraB_in, hf_BWD_loraA_out)\n", + "\n", + " compare_hf_tensors(hf_BWD_act_fn_in, hf_BWD_w1_out)\n", + " check_hf_sum_tensors(hf_BWD_ffn_norm_out, hf_BWD_w1_in, hf_BWD_w3_in)\n", + " if i == tot_num_layers-1:\n", + " check_hf_sum_tensors(hf_BWD_attn_out_out, hf_BWD_ffn_norm_in, hf_BWD_norm_in)\n", + "\n", + " # FlexFlow checks\n", + " print(\"\\nFlexFlow checks:\")\n", + " compare_flexflow_tensors(ff_BWD_w2_out, ff_BWD_lora_B_out)\n", + " compare_flexflow_tensors(ff_BWD_w2_in_pre, ff_BWD_lora_A_in)\n", + " compare_flexflow_tensors(ff_BWD_w2_in, ff_BWD_ssm_out)\n", + " compare_flexflow_tensors(ff_BWD_ssm_in2, ff_BWD_w3_out)\n", + " compare_flexflow_tensors(ff_BWD_ssm_in1, ff_BWD_w1_out)\n", + " compare_flexflow_tensors(ff_BWD_w1_in, ff_BWD_ffn_norm_out)\n", + " compare_flexflow_tensors(ff_BWD_w1_in_pre, ff_BWD_w3_in)\n", + " compare_flexflow_tensors(ff_BWD_ffn_norm_in1, ff_BWD_ffn_norm_in2, max_len=24*768)\n", + " \n", + " # HF-FlexFlow checks\n", + " print(\"\\nHuggingface-FlexFlow checks:\")\n", + " print(\"-- W2 --\")\n", + " compare_tensors(hf_BWD_w2_out, ff_BWD_w2_out, tolerance=1e-5)\n", + " compare_tensors(hf_w2_weight, ff_w2_weight, tolerance=1e-5)\n", + " \n", + " print(\"-- Lora --\")\n", + " compare_tensors(hf_loraA_weight, ff_lora_A_weight, tolerance=1e-5)\n", + " compare_tensors(hf_loraB_weight, ff_lora_B_weight, tolerance=1e-5)\n", + "\n", + " compare_tensors(hf_BWD_loraB_out, ff_BWD_lora_B_out)\n", + " compare_tensors(hf_BWD_loraA_in, ff_BWD_lora_A_in)\n", + " \n", + " print(\"-- W2/W1/W3 --\")\n", + " compare_tensors(hf_BWD_w2_in, ff_BWD_ssm_out)\n", + " compare_tensors(hf_BWD_w2_in, ff_BWD_w2_in)\n", + " compare_tensors(hf_BWD_w1_out, ff_BWD_w1_out)\n", + " compare_tensors_difference(hf_BWD_w1_in, ff_BWD_w1_in, ff_BWD_w1_in_pre)\n", + " compare_tensors(hf_BWD_w3_out, ff_BWD_w3_out)\n", + " compare_tensors(hf_BWD_w3_in, ff_BWD_w3_in)\n", + " compare_tensors(hf_BWD_w1_out, ff_BWD_w1_out)\n", + " \n", + " print(\"-- Attention 
--\")\n", + " compare_tensors(hf_BWD_attn_out_out, ff_BWD_attn_out)\n", + " hidden_size = 768\n", + " qProjSize = 64\n", + " num_heads = 12\n", + " num_new_tokens = num_tokens = 24\n", + " if attention_tests:\n", + " # compare attn weight tensors\n", + " ff_attn_weight_tensor = np.loadtxt(ff_attn_oproj_weight, delimiter=',')\n", + " ff_attn_qproj_weight_tensor = ff_attn_weight_tensor[:hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F')\n", + " ff_attn_kproj_weight_tensor = ff_attn_weight_tensor[hidden_size*qProjSize*num_heads:2*hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F')\n", + " ff_attn_vproj_weight_tensor = ff_attn_weight_tensor[2*hidden_size*qProjSize*num_heads:3*hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F')\n", + " ff_attn_oproj_weight_tensor = ff_attn_weight_tensor[3*hidden_size*qProjSize*num_heads:].reshape((qProjSize*num_heads,hidden_size), order='F')\n", + " \n", + " hf_attn_qproj_weight_tensor = torch.load(hf_attn_qproj_weight).T.detach().cpu().numpy()\n", + " hf_attn_kproj_weight_tensor = torch.load(hf_attn_kproj_weight).T.detach().cpu().numpy()\n", + " hf_attn_vproj_weight_tensor = torch.load(hf_attn_vproj_weight).T.detach().cpu().numpy()\n", + " hf_attn_oproj_weight_tensor = torch.load(hf_attn_oproj_weight).T.detach().cpu().numpy()\n", + " \n", + " assert(np.allclose(ff_attn_qproj_weight_tensor, hf_attn_qproj_weight_tensor, atol=1e-5))\n", + " assert(np.allclose(ff_attn_kproj_weight_tensor, hf_attn_kproj_weight_tensor, atol=1e-5))\n", + " assert(np.allclose(ff_attn_vproj_weight_tensor, hf_attn_vproj_weight_tensor, atol=1e-5))\n", + " assert(np.allclose(ff_attn_oproj_weight_tensor, hf_attn_oproj_weight_tensor, atol=1e-5))\n", + " \n", + " # Compare attn outproj grad in tensors\n", + " compare_tensors(hf_BWD_attn_oproj_in, ff_BWD_attn_o_proj_in)\n", + " \n", + " ########### Compare value projs grads ######################\n", + " # 1. compare qk prods softmax\n", + " hf_qk_prods_softmax = f\"{hf_path}/fwd_step_0_layers.{i}.self_attn.qk_prods_softmax.output_0\"\n", + " ff_attn_qk_prods_softmax = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_qk_prods_softmax\"\n", + " \n", + " hf_qk_prods_softmax = torch.load(hf_qk_prods_softmax)\n", + " ff_qk_prods_softmax = np.loadtxt(ff_attn_qk_prods_softmax, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + "\n", + " for head_idx in range(num_heads):\n", + " hf_qkps = hf_qk_prods_softmax.squeeze()[head_idx, :, :].detach().cpu().numpy()\n", + " ff_qkps = ff_qk_prods_softmax[:,:,head_idx]\n", + " assert(np.allclose(ff_qkps, hf_qkps, atol=1e-5))\n", + " \n", + " # 2. compare attn heads grads\n", + " hf_attn_heads_grads = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.o_proj.gi_0\"\n", + " ff_attn_heads_grads = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_o_proj_in_grad\"\n", + "\n", + " hf_attn_heads_grads = torch.load(hf_attn_heads_grads).T.squeeze().detach().cpu().numpy()\n", + " ff_attn_heads_grads = np.loadtxt(ff_attn_heads_grads, delimiter=',').reshape((qProjSize*num_heads, num_new_tokens), order = 'F')\n", + " # NEED TO VISUALLY INSPECT\n", + " compare_loaded_tensors(hf_attn_heads_grads, ff_attn_heads_grads)\n", + "\n", + " # 3. 
vproj grads\n", + " hf_vproj_grads = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.v_proj.go_0\"\n", + " ff_vproj_grads = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_v_proj_in_grad\"\n", + "\n", + " hf_vproj_grads = torch.load(hf_vproj_grads).squeeze().detach().cpu().numpy()\n", + " ff_vproj_grads = np.loadtxt(ff_vproj_grads, delimiter=',').reshape((num_tokens, qProjSize*num_heads), order='F')\n", + " compare_loaded_tensors(hf_vproj_grads, ff_vproj_grads)\n", + " \n", + " \n", + " ##############################\n", + " hf_value_states = f\"{hf_path}/fwd_step_0_layers.{i}.self_attn.value_states.output_0\"\n", + " hf_value_states = torch.load(hf_value_states).squeeze().permute(2,0,1).detach().cpu().numpy()\n", + " # print(hf_value_states.shape)\n", + " ff_value_states = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_vcache\"\n", + " ff_value_states = np.loadtxt(ff_value_states, delimiter=',').reshape((qProjSize, num_heads, num_tokens), order='F')\n", + " # print(ff_value_states.shape)\n", + " assert(np.allclose(hf_value_states, ff_value_states, atol=1e-2))\n", + " \n", + " \n", + " \n", + " ########## Compare key and query projs grads ##################\n", + " ff_devQKVPRojArray = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_devQKVPRojArray\"\n", + " ff_devQKVPRojArray = np.loadtxt(ff_devQKVPRojArray, delimiter=',').reshape((num_tokens, qProjSize*num_heads, 3), order = 'F')\n", + " ff_qProjGrads = ff_devQKVPRojArray[:,:,0]\n", + " ff_kProjGrads = ff_devQKVPRojArray[:,:,1]\n", + " ff_vProjGrads = ff_devQKVPRojArray[:,:,2]\n", + " assert(np.allclose(ff_vProjGrads, ff_vproj_grads, atol=1e-5))\n", + "\n", + " # simulate qk_prods_softmax\n", + " ff_attn_heads_grads = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_o_proj_in_grad\"\n", + " ff_attn_heads_grads = np.loadtxt(ff_attn_heads_grads, delimiter=',').reshape((qProjSize,num_heads, num_new_tokens), order = 'F')\n", + " ff_attn_heads_grads = torch.from_numpy(ff_attn_heads_grads)\n", + " ff_attn_heads_grads = ff_attn_heads_grads.permute(1,2,0)\n", + " ff_value_states = torch.from_numpy(ff_value_states)\n", + " ff_value_states = ff_value_states.permute(1,0,2)\n", + " # print(ff_attn_heads_grads.shape)\n", + " # print(ff_value_states.shape)\n", + " simulated_qk_prods_softmax_grads = torch.matmul(ff_attn_heads_grads, ff_value_states)\n", + " #simulated_qk_prods_softmax_grads = simulated_qk_prods_softmax_grads\n", + " #print(\"Simulated QK prods grads:\")\n", + " #print(simulated_qk_prods_softmax_grads[0,:,:])\n", + "\n", + " # qk prods softmax right before softmax\n", + " hf_qk_prods_softmax2 = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.qk_prods_softmax.go_0\"\n", + " hf_qk_prods_softmax2 = torch.load(hf_qk_prods_softmax2)\n", + " ff_qk_prods_softmax2 = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_qk_prods_softmax_grad\"\n", + " ff_qk_prods_softmax2 = np.loadtxt(ff_qk_prods_softmax2, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.squeeze().permute(1,2,0)\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.detach().cpu().numpy()\n", + " \n", + " mismatches = np.where(~np.isclose(ff_qk_prods_softmax2, hf_qk_prods_softmax2))\n", + " mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (hf_qk_prods_softmax2.shape[0] * hf_qk_prods_softmax2.shape[1] * hf_qk_prods_softmax2.shape[2])\n", 
+ " print(f\"{pct_mismatch*100}% mismatch in QK prods softmax out grad\")\n", + " # print(hf_qk_prods_softmax2[:2,:,0])\n", + " # print(ff_qk_prods_softmax2[:2,:,0])\n", + " assert(pct_mismatch <= 0.1)\n", + "\n", + " # qk prods softmax right after softmax\n", + " hf_qk_prods_softmax2 = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.pre_softmax.gi_0\"\n", + " hf_qk_prods_softmax2 = torch.load(hf_qk_prods_softmax2)\n", + " ff_qk_prods_softmax2 = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_qk_prods_softmax_grad_in\"\n", + " ff_qk_prods_softmax2 = np.loadtxt(ff_qk_prods_softmax2, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.squeeze().permute(1,2,0)\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.detach().cpu().numpy()\n", + " compare_loaded_tensors(hf_qk_prods_softmax2, ff_qk_prods_softmax2)\n", + " \n", + " # qk prods softmax after mask\n", + " hf_qk_prods_softmax2 = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.matmul_op.go_0\"\n", + " hf_qk_prods_softmax2 = torch.load(hf_qk_prods_softmax2)\n", + " ff_qk_prods_softmax2 = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_qk_prods_softmax_grad_in_masked\"\n", + " ff_qk_prods_softmax2 = np.loadtxt(ff_qk_prods_softmax2, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.squeeze().permute(1,2,0)\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.detach().cpu().numpy()\n", + " assert(np.allclose(ff_qk_prods_softmax2, hf_qk_prods_softmax2, atol=1e-2))\n", + "\n", + " # Compare query activation\n", + " hf_query_activation = hf_path + f\"/fwd_step_0_layers.11.self_attn.query_activation.output_0\"\n", + " hf_query_activation = torch.load(hf_query_activation)\n", + " ff_query_activation = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_query_activation\"\n", + " ff_query_activation = np.loadtxt(ff_query_activation, delimiter=',').reshape((qProjSize, num_heads, num_new_tokens), order = 'F')\n", + " hf_query_activation = hf_query_activation.squeeze().permute(2,0,1).detach().cpu().numpy()\n", + " # assert(np.allclose(ff_query_activation, hf_query_activation, atol=1e-2))\n", + " # print(hf_query_activation[:,0,:])\n", + " # print()\n", + " # print(ff_query_activation[:,0,:])\n", + " # assert False\n", + " # compare_loaded_tensors(hf_query_activation, ff_query_activation)\n", + " check_rope = False\n", + " if check_rope:\n", + " ########################################## ROPE and Kproj ##########################################\n", + "\n", + " # Compare FF kproj with intermediate kproj data from HF\n", + " hf_kproj_grads_post_rotary = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.identity_kv_post_rotary.go_0\"\n", + " hf_kproj_grads_post_rotary = torch.load(hf_kproj_grads_post_rotary)\n", + " hf_kproj_grads_post_rotary_copy = hf_kproj_grads_post_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", + " # print(\"hf_kproj_grads_post_rotary: \", hf_kproj_grads_post_rotary_copy.shape)\n", + " # print(hf_kproj_grads_post_rotary_copy[:,:,0])\n", + " # Check hf ROPE \n", + " cos, sin = rotary_emb(hf_kproj_grads_post_rotary, seq_len=24)\n", + " cos = cos.cuda()\n", + " sin = sin.cuda()\n", + " # query_states: torch.Size([1, 12, 24, 64])\n", + " # key_states: torch.Size([1, 12, 24, 64])\n", + " # position_ids: torch.Size([1, 24])\n", + " # tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", + " # 18, 19, 20, 21, 22, 23]], 
device='cuda:0')\n", + " query_states = torch.zeros([1, 12, 24, 64]).cuda()\n", + " position_ids = torch.arange(24).unsqueeze(0).cuda()\n", + " query_states, hf_kproj_grads_post_rotary = apply_rotary_pos_emb(query_states, hf_kproj_grads_post_rotary, cos, sin, position_ids)\n", + " hf_kproj_grads_post_rotary = hf_kproj_grads_post_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", + " # print(\"hf_kproj_grads_post_rotary: \", hf_kproj_grads_post_rotary.shape)\n", + " # print(hf_kproj_grads_post_rotary[:,:,0])\n", + " \n", + " hf_kproj_grads_before_rotary = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.identity_kv_before_rotary.go_0\"\n", + " hf_kproj_grads_before_rotary = torch.load(hf_kproj_grads_before_rotary)\n", + " hf_kproj_grads_before_rotary = hf_kproj_grads_before_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", + " # print(\"hf_kproj_grads_before_rotary: \", hf_kproj_grads_before_rotary.shape)\n", + " # print(hf_kproj_grads_before_rotary[:,:,0])\n", + " # Compare HF rope with manual ROPE\n", + " assert(np.allclose(hf_kproj_grads_post_rotary, hf_kproj_grads_before_rotary, atol=1e-5))\n", + " # Compare HF Kproj with FF Kproj (before ROPE) \n", + " ff_kproj_pre = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_devkproj_pre\"\n", + " ff_kproj_pre = np.loadtxt(ff_kproj_pre, delimiter=',').reshape((num_tokens, qProjSize, num_heads), order = 'F')\n", + " # print(\"ff_kproj_pre: \", ff_kproj_pre.shape)\n", + " #print(ff_kproj_pre[:,:,0])\n", + " mismatches = np.where(~np.isclose(ff_kproj_pre, hf_kproj_grads_post_rotary_copy, atol=1e-5))\n", + " mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (ff_kproj_pre.shape[0] * ff_kproj_pre.shape[1] * ff_kproj_pre.shape[2])\n", + " print(f\"{pct_mismatch*100}% mismatch between HF and FF for kproj (before applying ROPE)\")\n", + " assert(pct_mismatch <= 0.05)\n", + " #assert(np.allclose(ff_kproj_pre, hf_kproj_grads_post_rotary_copy, atol=1e-5))\n", + " \n", + " ff_kproj = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_devkproj\"\n", + " ff_kproj = np.loadtxt(ff_kproj, delimiter=',').reshape((num_tokens, qProjSize, num_heads), order = 'F')\n", + " # print(\"ff_kproj: \", ff_kproj.shape)\n", + " #print(ff_kproj[:,:,0])\n", + " mismatches = np.where(~np.isclose(ff_kproj, hf_kproj_grads_before_rotary, atol=1e-5))\n", + " mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (ff_kproj.shape[0] * ff_kproj.shape[1] * ff_kproj.shape[2])\n", + " print(f\"{pct_mismatch*100}% mismatch between HF and FF for kproj (after applying ROPE)\")\n", + " assert(pct_mismatch <= 0.05)\n", + " #assert(np.allclose(ff_kproj, hf_kproj_grads_before_rotary, atol=1e-5))\n", + " \n", + " \n", + " #assert(np.allclose(hf_kproj_grads_post_rotary, hf_kproj_grads_before_rotary, atol=1e-2))\n", + " hf_kproj_grads = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.k_proj.go_0\"\n", + " hf_kproj_grads = torch.load(hf_kproj_grads).squeeze()\n", + " #print(\"hf_kproj_grads: \", hf_kproj_grads.shape)\n", + " #print(hf_kproj_grads[:,:64])\n", + " reshaped_tensor = hf_kproj_grads.view(24, 12, 64).transpose(1, 2).contiguous().detach().cpu().numpy()\n", + " #print(reshaped_tensor.shape)\n", + " assert(np.allclose(ff_kproj, reshaped_tensor, atol=1e-2))\n", + "\n", + " ########################################## Qproj (with ROPE) 
##########################################\n", + "\n", + " # Compare QProj\n", + " hf_qproj_grads = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.q_proj.go_0\"\n", + " hf_qproj_grads = torch.load(hf_qproj_grads).squeeze()\n", + " # print(\"HF Qproj:\")\n", + " # print(hf_qproj_grads.shape)\n", + " reshaped_tensor = hf_qproj_grads.view(24, 12, 64).transpose(1, 2).contiguous().detach().cpu().numpy()\n", + " # print(\"\\t reshaped: \", reshaped_tensor.shape)\n", + " # print(reshaped_tensor[:,:,0])\n", + " ff_qproj = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_devQKVPRojArray\"\n", + " ff_qproj = np.loadtxt(ff_qproj, delimiter=',').reshape((num_tokens, qProjSize, num_heads, 3), order = 'F')[:,:,:,0]\n", + " # print(\"FF Qproj:\")\n", + " # print(ff_qproj.shape)\n", + " # print(ff_qproj[:,:,0])\n", + " assert(np.allclose(ff_qproj, reshaped_tensor, atol=1e-2))\n", + "\n", + " hf_attn_in = f\"{hf_path}/bwd_step_0_layers.{i}.input_layernorm.go_0\"\n", + " hf_attn_in = torch.load(hf_attn_in)\n", + " hf_attn_in = hf_attn_in.squeeze().T\n", + " hf_attn_in = hf_attn_in.detach().cpu().numpy()\n", + " print(\"hf_attn_in: \", hf_attn_in.shape)\n", + " print(hf_attn_in)\n", + "\n", + " ff_attn_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_attn_final_grad_in\"\n", + " ff_attn_in = np.loadtxt(ff_attn_in, delimiter=',').reshape((768,num_tokens), order = 'F')\n", + " print(\"ff_attn_in: \", ff_attn_in.shape)\n", + " print(ff_attn_in)\n", + " #assert(np.allclose(ff_attn_in, hf_attn_in, atol=1e-2))\n", + "\n", + " mismatches = np.where(~np.isclose(ff_attn_in, hf_attn_in))\n", + " mismatches = [(mismatches[0][i], mismatches[1][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (hf_attn_in.shape[0] * hf_attn_in.shape[1])\n", + " print(f\"{pct_mismatch*100}% mismatch in attention input grads\")\n", + " assert(pct_mismatch <= 0.1)\n", + " \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[-0.01614726 0.01363804 0.01768043 ... 
0.00724926 -0.00149747\n", + " -0.01781223]\n" + ] + } + ], + "source": [ + "a = np.fromfile(\"/usr0/home/goliaro/.cache/flexflow/weights/goliaro/llama-160m-lora-full/full-precision/layers_11_feed_forward_w2_lora_A_weight\", dtype=np.float32)\n", + "print(a)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# value states: torch.Size([1, 12, 24, 64])\n", + "value_states=torch.from_numpy(hf_kproj_grads_post_rotary).permute(2,0,1).unsqueeze(0)\n", + "key_states = value_states\n", + "cos, sin = rotary_emb(value_states, seq_len=kv_seq_len)\n", + "# query_states: torch.Size([1, 12, 24, 64])\n", + "# key_states: torch.Size([1, 12, 24, 64])\n", + "# position_ids: torch.Size([1, 24])\n", + "# tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", + "# 18, 19, 20, 21, 22, 23]], device='cuda:0')\n", + "query_states = torch.zeros([1, 12, 24, 64])\n", + "position_ids = torch.arange(24).unsqueeze(0)\n", + "query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)\n", + "key_states = key_states.squeeze()\n", + "print(key_states.shape)\n", + "print(key_states[0,:,:])\n", + "print(hf_kproj_grads_before_rotary.shape)\n", + "print(hf_kproj_grads_before_rotary[:,:,0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", + " 18, 19, 20, 21, 22, 23]], device='cuda:0')" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "torch.arange(24).unsqueeze(0).cuda()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([1, 12, 24, 24])\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/alignment_tests.ipynb Cell 6\u001b[0m line \u001b[0;36m1\n\u001b[1;32m 17\u001b[0m ff_qkps \u001b[39m=\u001b[39m ff_qk_prods_softmax[:,:,head_idx]\n\u001b[1;32m 18\u001b[0m \u001b[39massert\u001b[39;00m(np\u001b[39m.\u001b[39mallclose(ff_qkps, hf_qkps, atol\u001b[39m=\u001b[39m\u001b[39m1e-5\u001b[39m))\n\u001b[0;32m---> 19\u001b[0m \u001b[39massert\u001b[39;00m(\u001b[39mFalse\u001b[39;00m)\n\u001b[1;32m 21\u001b[0m hf_value_states \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39mload(hf_value_states)\u001b[39m#.squeeze().T.detach().cpu().numpy()\u001b[39;00m\n\u001b[1;32m 22\u001b[0m \u001b[39mprint\u001b[39m(hf_value_states\u001b[39m.\u001b[39mshape)\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "layer_num = 11\n", + "hf_qk_prods_softmax = f\"{hf_path}/fwd_step_0_layers.11.self_attn.qk_prods_softmax\"\n", + "ff_qk_prods_softmax = f\"{ff_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax\"\n", + "\n", + "hf_value_states = f\"{hf_path}/fwd_step_0_layers.11.self_attn.value_states\"\n", + "\n", + "hf_qk_prods_softmax = torch.load(hf_qk_prods_softmax)#.squeeze().T.detach().cpu().numpy()\n", + "ff_qk_prods_softmax = np.loadtxt(ff_qk_prods_softmax, delimiter=',').reshape((24, 24, 12), order = 'F')\n", + 
"print(hf_qk_prods_softmax.shape)\n", + "#print(ff_qk_prods_softmax.shape)\n", + "#print(hf_qk_prods_softmax[:,:,0])\n", + "#print()\n", + "#print(ff_qk_prods_softmax[:,:,0])\n", + "\n", + "for head_idx in range(12):\n", + " hf_qkps = hf_qk_prods_softmax.squeeze()[head_idx, :, :].detach().cpu().numpy()\n", + " ff_qkps = ff_qk_prods_softmax[:,:,head_idx]\n", + " assert(np.allclose(ff_qkps, hf_qkps, atol=1e-5))\n", + "\n", + "\n", + "hf_value_states = torch.load(hf_value_states)#.squeeze().T.detach().cpu().numpy()\n", + "print(hf_value_states.shape)\n", + "attn_output = torch.matmul(hf_qk_prods_softmax, hf_value_states)\n", + "print()\n", + "print(attn_output.shape)\n", + "print(attn_output.transpose(1, 2).contiguous().shape)\n", + "print(\"Hf attn heads\")\n", + "print(torch.load(\"/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_layers.11.self_attn.o_proj.input_0\").shape)\n", + "\n", + "print(\"Attn heads grads:\")\n", + "hf_attn_heads_grads = f\"{hf_path}/bwd_step_0_layers.{layer_num}.self_attn.o_proj.gi_0\"\n", + "print(torch.load(hf_attn_heads_grads).shape)\n", + "print(\"HF value grads:\")\n", + "vproj_grads = f\"{hf_path}/bwd_step_0_layers.{layer_num}.self_attn.v_proj.gi_0\"\n", + "print(torch.load(vproj_grads).shape)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([2, 3, 4])\n", + "torch.Size([4, 3, 2])\n" + ] + } + ], + "source": [ + "a = torch.randn(2,3,4)\n", + "print(a.shape)\n", + "print(a.T.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([[[ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000,\n", + " 0.0000],\n", + " [ 27.8890, -21.5089, 45.8214, ..., 5.4010, -10.8787,\n", + " 39.7619],\n", + " [ 19.2197, 27.4681, -68.7141, ..., 102.3280, 66.7925,\n", + " -160.8711],\n", + " ...,\n", + " [ 63.9532, 17.4273, -29.4416, ..., 101.6105, 67.5937,\n", + " -198.4432],\n", + " [ 31.2799, 13.0724, -44.7179, ..., 132.4898, 42.3135,\n", + " -194.4037],\n", + " [ 42.3453, -16.2693, -55.7386, ..., 90.5921, 52.2032,\n", + " -124.1802]]], device='cuda:0')\n", + "tensor([[[-1.1845e+06, -6.7460e+05, 7.4494e+05, ..., -9.1441e+05,\n", + " -1.4912e+05, 3.5769e+06],\n", + " [-7.3920e+01, -7.9389e+01, 1.1027e+02, ..., -7.3020e+01,\n", + " -2.3540e+01, 3.4587e+02],\n", + " [-5.3885e+01, -1.7373e+01, -1.9780e+01, ..., 4.1291e+01,\n", + " 5.5099e+01, 5.5910e+01],\n", + " ...,\n", + " [-2.1948e+01, -3.2109e+01, 2.8364e+01, ..., 3.4321e+01,\n", + " 5.0713e+01, 5.6592e+01],\n", + " [-4.4339e+01, -2.8339e+01, 1.4070e+01, ..., 6.2797e+01,\n", + " 3.0760e+01, 6.1743e+01],\n", + " [-1.6287e+01, -5.0413e+01, -1.9940e+01, ..., 4.3766e+01,\n", + " 4.7833e+01, 4.7295e+01]]], device='cuda:0')\n" + ] + } + ], + "source": [ + "a = \"./hf_peft_tensors/bwd_step_0_layers.11.post_attention_layernorm.gi_0\"\n", + "b = \"./hf_peft_tensors/bwd_step_0_layers.11.self_attn.o_proj.go_0\"\n", + "a = torch.load(a)\n", + "b = torch.load(b)\n", + "print(a)\n", + "print(b)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + 
"Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "for layer_num in range(12):\n", + " hf_lora_A_weight_fp = f\"{hf_path}/layers.{layer_num}.mlp.down_proj.lora_A.default.weight\"\n", + " ff_lora_A_weight_fp = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_A\"\n", + " compare_tensors(hf_lora_A_weight_fp, ff_lora_A_weight_fp, tolerance=1e-5)\n", + " hf_lora_B_weight_fp = f\"{hf_path}/layers.{layer_num}.mlp.down_proj.lora_B.default.weight\"\n", + " ff_lora_B_weight_fp = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_B\"\n", + " compare_tensors(hf_lora_B_weight_fp, ff_lora_B_weight_fp, tolerance=1e-5)\n", + " hf_w1_weight = f\"{hf_path}/layers.{layer_num}.mlp.gate_proj.weight\"\n", + " ff_w1_weight = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_weight_0\"\n", + " compare_tensors(hf_w1_weight, ff_w1_weight, tolerance=1e-5)\n", + " hf_w3_weight = f\"{hf_path}/layers.{layer_num}.mlp.up_proj.weight\"\n", + " ff_w3_weight = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w3_shard-id_0_weight_0\"\n", + " compare_tensors(hf_w3_weight, ff_w3_weight, tolerance=1e-5)\n", + " hf_w2_weight = f\"{hf_path}/layers.{layer_num}.mlp.down_proj.weight\"\n", + " ff_w2_weight = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_weight_0\"\n", + " compare_tensors(hf_w2_weight, ff_w2_weight, tolerance=1e-5)\n", + " " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tests/peft/alignment/opt_alignment_tests.ipynb b/tests/peft/alignment/opt_alignment_tests.ipynb new file mode 100644 index 0000000000..ca679b1857 --- /dev/null +++ b/tests/peft/alignment/opt_alignment_tests.ipynb @@ -0,0 +1,450 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import os, torch\n", + "from align_test_utils import *" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + 
"Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "\n", + "--- LM head ---\n", + "Ok!\n", + "Ok!\n", + "\n", + "--- Final Norm ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "qProjSize = 64\n", + "num_heads = 12\n", + "num_tokens = 25\n", + "for i in range(tot_num_layers):\n", + " hf_base = os.path.join(hf_path, f\"fwd_step_0_decoder.layers.{i}.\")\n", + " ff_base = os.path.join(ff_path, f\"fwd_step_0_layers_{i}_layers_{i}_\")\n", + " \n", + " # LayerNorm\n", + " hf_tensor = hf_base + \"self_attn_layer_norm.input_0\"\n", + " ff_tensor = ff_base + \"attention_layer_norm_shard_0_output_0\"\n", + " compare_tensors(hf_tensor, ff_tensor)\n", + " hf_tensor = hf_base + \"self_attn_layer_norm.output_0\"\n", + " ff_tensor = ff_base + \"attention_layer_norm_shard_0_output_1\"\n", + " compare_tensors(hf_tensor, ff_tensor)\n", + "\n", + " # # Attention QKV proj\n", + " # print(\"---Attn---\")\n", + " # ff_tensor = ff_base + \"attention_shard_0_qkv_proj_output\"\n", + " # ff_tensor = load_ff_tensor(ff_tensor, [qProjSize, num_heads, 3, num_tokens])\n", + " # ff_q_proj = ff_tensor[:,:,0,:]\n", + " # ff_k_proj = ff_tensor[:,:,1,:]\n", + " # ff_v_proj = ff_tensor[:,:,2,:]\n", + " # hf_q_proj = hf_base + \"self_attn.q_proj.output_0\"\n", + " # hf_q_proj = load_hf_tensor(hf_q_proj).squeeze().T\n", + " # hf_q_proj = hf_q_proj.reshape(12,64,25)\n", + " # hf_q_proj = np.transpose(hf_q_proj, (1,0,2))\n", + " # hf_k_proj = hf_base + \"self_attn.k_proj.output_0\"\n", + " # hf_k_proj = load_hf_tensor(hf_k_proj).squeeze().T\n", + " # hf_k_proj = hf_k_proj.reshape(12,64,25)\n", + " # hf_k_proj = np.transpose(hf_k_proj, (1,0,2))\n", + " # hf_v_proj = hf_base + \"self_attn.v_proj.output_0\"\n", + " # hf_v_proj = load_hf_tensor(hf_v_proj).squeeze().T\n", + " # hf_v_proj = hf_v_proj.reshape(12,64,25)\n", + " # hf_v_proj = np.transpose(hf_v_proj, (1,0,2))\n", + " # compare_loaded_tensors(hf_q_proj/np.sqrt(qProjSize), ff_q_proj)\n", + " # compare_loaded_tensors(hf_k_proj, ff_k_proj)\n", + " # compare_loaded_tensors(hf_v_proj, ff_v_proj)\n", + "\n", + " # Compare attn bias, residuals\n", + " print(\"--- Attn bias + residual ---\")\n", + " ff_residual1 = ff_path + f\"/fwd_step_0_layers_{i}_AddBiasResidualLayerNorm_shard_0_input_1\"\n", + " ff_residual2 = ff_base + \"attention_layer_norm_shard_0_output_0\"\n", + " compare_flexflow_tensors(ff_residual1, ff_residual2)\n", + " hf_tensor = hf_base + 
\"self_attn_layer_norm.input_0\"\n", + " compare_tensors(hf_tensor, ff_residual2)\n", + " ff_tensor = ff_path + f\"/fwd_step_0_layers_{i}_AddBiasResidualLayerNorm_shard_0_output_0\"\n", + " hf_tensor = hf_base + \"final_layer_norm.input_0\"\n", + " compare_tensors(hf_tensor, ff_tensor)\n", + " \n", + " print(\"--- MLP ---\")\n", + " hf_tensor = hf_base + \"fc1.input_0\"\n", + " ff_tensor = ff_base + \"fc1_shard_0_input_0\"\n", + " compare_tensors(hf_tensor, ff_tensor)\n", + " hf_tensor = hf_base + \"fc2.input_0\"\n", + " ff_tensor = ff_base + \"fc2_shard_0_input_0\"\n", + " compare_tensors(hf_tensor, ff_tensor)\n", + "# LM head\n", + "print(\"\\n--- LM head ---\")\n", + "hf_tensor = hf_path + \"/fwd_step_0_base_model.model.lm_head.input_0\"\n", + "ff_tensor = ff_path + \"/fwd_step_0_layers_11_embed_tokens_weight_lm_head_shard_0_input_0\"\n", + "compare_tensors(hf_tensor, ff_tensor)\n", + "hf_tensor = hf_path + \"/fwd_step_0_base_model.model.lm_head.output_0\"\n", + "ff_tensor = ff_path + \"/fwd_step_0_layers_11_embed_tokens_weight_lm_head_shard_0_output_0\"\n", + "compare_tensors(hf_tensor, ff_tensor)\n", + "# Final layer norm\n", + "print(\"\\n--- Final Norm ---\")\n", + "hf_tensor = hf_path + \"/fwd_step_0_decoder.final_layer_norm.input_0\"\n", + "ff_tensor = ff_path + \"/fwd_step_0_layers_11_final_layer_norm_shard_0_output_0\"\n", + "compare_tensors(hf_tensor, ff_tensor)\n", + "ff_tensor1 = ff_path + \"/fwd_step_0_layers_11_final_layer_norm_shard_0_input_activation\"\n", + "# compare_flexflow_tensors_shortest(ff_tensor, ff_tensor1)\n", + "hf_tensor = hf_path + \"/fwd_step_0_decoder.final_layer_norm.output_0\"\n", + "ff_tensor = ff_path + \"/fwd_step_0_layers_11_final_layer_norm_shard_0_output_1\"\n", + "compare_tensors(hf_tensor, ff_tensor)\n", + "hf_tensor = hf_path + \"/fwd_step_0_decoder.final_layer_norm.saved_result_1\"\n", + "ff_tensor = ff_path + \"/fwd_step_0_layers_11_final_layer_norm_shard_0_mean\"\n", + "compare_tensors(hf_tensor, ff_tensor)\n", + "hf_tensor = hf_path + \"/fwd_step_0_decoder.final_layer_norm.saved_result_2\"\n", + "ff_tensor = ff_path + \"/fwd_step_0_layers_11_final_layer_norm_shard_0_rstd\"\n", + "compare_tensors(hf_tensor, ff_tensor)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[17], line 22\u001b[0m\n\u001b[1;32m 19\u001b[0m compare_flexflow_tensors(ff_tensor, ff_tensor1)\n\u001b[1;32m 20\u001b[0m compare_tensors(hf_tensor, ff_tensor) \u001b[38;5;66;03m# fails\u001b[39;00m\n\u001b[0;32m---> 22\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;66;03m# Compare fwd input/output of layernorm\u001b[39;00m\n\u001b[1;32m 25\u001b[0m hf_FWD_norm_in \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/fwd_step_0_decoder.final_layer_norm.input_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "# Compare backward pass\n", + "hf_tensor = hf_path 
+ \"/bwd_step_0_base_model.model.lm_head.go_0\"\n", + "ff_tensor = ff_path + \"/bwd_step_0_layers_11_embed_tokens_weight_lm_head_shard_0_output_0\"\n", + "compare_tensors(hf_tensor, ff_tensor, tolerance=1e-5)\n", + "hf_tensor = hf_path + \"/bwd_step_0_base_model.model.lm_head.gi_0\"\n", + "ff_tensor = ff_path + \"/bwd_step_0_layers_11_embed_tokens_weight_lm_head_shard_0_input_0\"\n", + "compare_tensors(hf_tensor, ff_tensor, tolerance=1e-5)\n", + "\n", + "hf_tensor1 = hf_path + \"/bwd_step_0_decoder.final_layer_norm.go_0\"\n", + "compare_hf_tensors(hf_tensor, hf_tensor1)\n", + "ff_tensor = ff_path + \"/bwd_step_0_layers_11_final_layer_norm_shard_0_output_0\"\n", + "compare_tensors(hf_tensor1, ff_tensor)\n", + "\n", + "hf_tensor = hf_path + \"/bwd_step_0_decoder.final_layer_norm.gi_0\"\n", + "ff_tensor = ff_path + \"/bwd_step_0_layers_11_final_layer_norm_shard_0_input_0\"\n", + "ff_tensor1 = ff_path + \"/bwd_step_0_layers_11_final_layer_norm_shard_0_input_1\"\n", + "compare_flexflow_tensors(ff_tensor, ff_tensor1)\n", + "compare_tensors(hf_tensor, ff_tensor) # fails" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_decoder.layers.0.fc1.input_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_decoding-step_0_layer-num_0_layer-name_layers_0_fc1_shard-id_0_input_0\n", + "HF: [ 0.0193019 -1.0467215 0.21579844 ... 0.04534929 -0.25642633\n", + " 0.10879952]\n", + "FF:[ 0.01458706 -1.02212262 0.20589906 ... 0.04446212 -0.25625792\n", + " 0.108039 ]\n", + "[ True False True ... True True True]\n", + "[ 1 3 7 ... 19170 19174 19188]\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[11], line 16\u001b[0m\n\u001b[1;32m 14\u001b[0m hf_fc1_in \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_decoder.layers.0.fc1.input_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 15\u001b[0m ff_fc1_in \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_decoding-step_0_layer-num_0_layer-name_layers_0_fc1_shard-id_0_input_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 16\u001b[0m \u001b[43mcompare_tensors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhf_fc1_in\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mff_fc1_in\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;66;03m# LORA input\u001b[39;00m\n\u001b[1;32m 20\u001b[0m hf_lora_A_in \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/fwd_step_0_layers.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlayer_num\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.mlp.down_proj.lora_A.default.input_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "File \u001b[0;32m~/Desktop/FlexFlow/tests/peft/align_test_utils.py:32\u001b[0m, in \u001b[0;36mcompare_tensors\u001b[0;34m(hf_tensor_filepath, ff_tensor_filepath, tolerance)\u001b[0m\n\u001b[1;32m 
27\u001b[0m \u001b[38;5;28mprint\u001b[39m(mismatches)\n\u001b[1;32m 28\u001b[0m \u001b[38;5;66;03m#print(np.nonzero(hf_tensor)[0])\u001b[39;00m\n\u001b[1;32m 29\u001b[0m \u001b[38;5;66;03m# print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0])\u001b[39;00m\n\u001b[1;32m 30\u001b[0m \u001b[38;5;66;03m# print(ff_tensor[36], hf_tensor[36])\u001b[39;00m\n\u001b[1;32m 31\u001b[0m \u001b[38;5;66;03m#assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\u001b[39;00m\n\u001b[0;32m---> 32\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(\u001b[38;5;28mlen\u001b[39m(mismatches) \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m.05\u001b[39m\u001b[38;5;241m*\u001b[39mlen_hf_tensor)\n\u001b[1;32m 33\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOk!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "for layer_num in range(tot_num_layers):\n", + " hf_input_ln_out = f\"{hf_path}/fwd_step_0_decoder.layers.{layer_num}.self_attn_layer_norm.output_0\"\n", + " ff_input_ln_out = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_layer_norm_shard-id_0_output_1\"\n", + " compare_tensors(hf_input_ln_out, ff_input_ln_out)\n", + " \n", + " hf_ffn_norm_in = f\"{hf_path}/fwd_step_0_decoder.layers.{layer_num}.final_layer_norm.input_0\"\n", + " ff_ffn_norm_in = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_AddBiasResidualLayerNorm_shard-id_0_output_0\"\n", + " # compare_tensors(hf_ffn_norm_in, ff_ffn_norm_in)\n", + " \n", + " hf_ffn_norm_out = f\"{hf_path}/fwd_step_0_decoder.layers.{layer_num}.final_layer_norm.output_0\"\n", + " ff_ffn_norm_out = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_AddBiasResidualLayerNorm_shard-id_0_output_1\"\n", + " # compare_tensors(hf_ffn_norm_out, ff_ffn_norm_out)\n", + " hf_fc1_in = \"/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_decoder.layers.0.fc1.input_0\"\n", + " ff_fc1_in = \"/usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_decoding-step_0_layer-num_0_layer-name_layers_0_fc1_shard-id_0_input_0\"\n", + " compare_tensors(hf_fc1_in, ff_fc1_in)\n", + "\n", + "\n", + " # LORA input\n", + " hf_lora_A_in = f\"{hf_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_A.default.input_0\"\n", + " ff_lora_A_in = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_input_0\"\n", + " compare_hf_tensors(hf_down_proj_in, hf_lora_A_in)\n", + " compare_tensors(hf_lora_A_in, ff_lora_A_in)\n", + " # LORA weights\n", + " hf_lora_A_weight_fp = f\"{hf_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_A.default.weight\"\n", + " ff_lora_A_weight_fp = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_A\"\n", + " compare_tensors(hf_lora_A_weight_fp, ff_lora_A_weight_fp)\n", + " hf_lora_B_weight_fp = f\"{hf_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_B.default.weight\"\n", + " ff_lora_B_weight_fp = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_B\"\n", + " compare_tensors(hf_lora_B_weight_fp, ff_lora_B_weight_fp)\n", + " # LORA intermediate hf\n", + " hf_lora_A_out = 
f\"{hf_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_A.default.output_0\"\n", + " hf_lora_B_in = f\"{hf_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_B.default.input_0\"\n", + " compare_hf_tensors(hf_lora_A_out, hf_lora_B_in)\n", + " # LORA output\n", + " hf_lora_out = f\"{hf_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_B.default.output_0\"\n", + " ff_lora_out = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_output_0\"\n", + " # compare_tensors(hf_lora_out, ff_lora_out)\n", + " # compare_flexflow_tensors(ff_down_proj_out, ff_lora_out)\n", + " # compare_tensors(hf_down_proj_out, ff_lora_out)\n", + " compare_tensors_difference(hf_lora_out, ff_lora_out, ff_down_proj_out)\n", + " \n", + "\n", + "# After last layer only\n", + "hf_norm_out = f\"{hf_path}/fwd_step_0_norm.output_0\"\n", + "ff_norm_out = f\"{ff_path}/model_0_decoding-step_0_layer-num_{tot_num_layers-1}_layer-name_norm_shard-id_0_output_1\"\n", + "compare_tensors(hf_norm_out, ff_norm_out)\n", + "hf_lm_head_out = f\"{hf_path}/fwd_step_0_base_model.model.lm_head.output_0\"\n", + "ff_lm_head_out = f\"{ff_path}/model_0_decoding-step_0_layer-num_{tot_num_layers-1}_layer-name_output_shard-id_0_output_0\"\n", + "compare_tensors(hf_lm_head_out, ff_lm_head_out)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_decoder.final_layer_norm.input_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_decoding-step_0_layer-num_11_layer-name_final_layer_norm_shard-id_0_output_0\n", + "HF: [-0.00542103 -1.781267 0.16552497 ... -0.77217525 -0.5760026\n", + " 0.04363118]\n", + "FF:[ 0.03817766 -1.5644939 0.22477378 ... -0.94569921 -0.43960798\n", + " -0.06447437]\n", + "[False False False ... False False False]\n", + "[ 0 1 2 ... 
19197 19198 19199]\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[10], line 22\u001b[0m\n\u001b[1;32m 20\u001b[0m ff_FWD_norm_in \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mff_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/model_0_decoding-step_0_layer-num_11_layer-name_final_layer_norm_shard-id_0_output_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 21\u001b[0m ff_FWD_norm_out \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mff_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/model_0_decoding-step_0_layer-num_11_layer-name_final_layer_norm_shard-id_0_output_1\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 22\u001b[0m \u001b[43mcompare_tensors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhf_FWD_norm_in\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mff_FWD_norm_in\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 23\u001b[0m compare_tensors(hf_FWD_norm_out, ff_FWD_norm_out)\n\u001b[1;32m 25\u001b[0m hf_BWD_norm_in \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/bwd_step_0_decoder.final_layer_norm.gi_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "File \u001b[0;32m~/Desktop/FlexFlow/tests/peft/align_test_utils.py:29\u001b[0m, in \u001b[0;36mcompare_tensors\u001b[0;34m(hf_tensor_filepath, ff_tensor_filepath, tolerance)\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28mprint\u001b[39m(mismatches)\n\u001b[1;32m 25\u001b[0m \u001b[38;5;66;03m#print(np.nonzero(hf_tensor)[0])\u001b[39;00m\n\u001b[1;32m 26\u001b[0m \u001b[38;5;66;03m# print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0])\u001b[39;00m\n\u001b[1;32m 27\u001b[0m \u001b[38;5;66;03m# print(ff_tensor[36], hf_tensor[36])\u001b[39;00m\n\u001b[1;32m 28\u001b[0m \u001b[38;5;66;03m#assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\u001b[39;00m\n\u001b[0;32m---> 29\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(\u001b[38;5;28mlen\u001b[39m(mismatches) \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m.05\u001b[39m\u001b[38;5;241m*\u001b[39mlen_hf_tensor)\n\u001b[1;32m 30\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOk!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "\n", + "ff_BWD_softmax_in = f\"{ff_path}/model_0_bwd-step_0_layer-num_100_layer-name_Softmax_shard-id_0_input_0\"\n", + "\n", + "hf_BWD_lm_head_out = f\"{hf_path}/bwd_step_0_base_model.model.lm_head.go_0\"\n", + "ff_BWD_lm_head_out = f\"{ff_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_embed_tokens_weight_lm_head_shard-id_0_output_0\"\n", + "compare_tensors(hf_BWD_lm_head_out, ff_BWD_lm_head_out, tolerance=1e-5)\n", + "hf_BWD_lm_head_in = f\"{hf_path}/bwd_step_0_base_model.model.lm_head.gi_0\"\n", + "ff_BWD_lm_head_in = f\"{ff_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_embed_tokens_weight_lm_head_shard-id_0_input_0\"\n", + "compare_tensors(hf_BWD_lm_head_in, ff_BWD_lm_head_in, 
tolerance=1e-5)\n", + "\n", + "hf_BWD_norm_out = f\"{hf_path}/bwd_step_0_decoder.final_layer_norm.go_0\"\n", + "ff_BWD_norm_out = f\"{ff_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_final_layer_norm_shard-id_0_output_0\"\n", + "compare_hf_tensors(hf_BWD_lm_head_in, hf_BWD_norm_out)\n", + "compare_tensors(hf_BWD_norm_out, ff_BWD_norm_out)\n", + "\n", + "# Compare fwd input/output of layernorm\n", + "hf_FWD_norm_in = f\"{hf_path}/fwd_step_0_decoder.final_layer_norm.input_0\"\n", + "hf_FWD_norm_out = f\"{hf_path}/fwd_step_0_decoder.final_layer_norm.output_0\"\n", + "ff_FWD_norm_in = f\"{ff_path}/model_0_decoding-step_0_layer-num_11_layer-name_final_layer_norm_shard-id_0_output_0\"\n", + "ff_FWD_norm_out = f\"{ff_path}/model_0_decoding-step_0_layer-num_11_layer-name_final_layer_norm_shard-id_0_output_1\"\n", + "compare_tensors(hf_FWD_norm_in, ff_FWD_norm_in)\n", + "compare_tensors(hf_FWD_norm_out, ff_FWD_norm_out)\n", + "\n", + "hf_BWD_norm_in = f\"{hf_path}/bwd_step_0_decoder.final_layer_norm.gi_0\"\n", + "ff_BWD_norm_in = f\"{ff_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_final_layer_norm_shard-id_0_input_1\"\n", + "compare_tensors(hf_BWD_norm_in, ff_BWD_norm_in, tolerance=1e-5)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tests/peft/alignment_tests.ipynb b/tests/peft/alignment_tests.ipynb deleted file mode 100644 index e2a8978ea3..0000000000 --- a/tests/peft/alignment_tests.ipynb +++ /dev/null @@ -1,1427 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import os, torch" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "hf_weight_base_path = \"/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors\"\n", - "ff_weight_base_path = \"/usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors\"\n", - "def compare_tensors(hf_tensor_filepath, ff_tensor_filepath, tolerance=1e-2):\n", - " assert(os.path.exists(hf_tensor_filepath) and os.path.exists(ff_tensor_filepath))\n", - " hf_tensor = torch.load(hf_tensor_filepath)\n", - " if type(hf_tensor) == tuple or type(hf_tensor) == list:\n", - " assert(len(hf_tensor) == 1)\n", - " hf_tensor = hf_tensor[0]\n", - " hf_tensor = torch.nan_to_num(hf_tensor)\n", - " hf_tensor = hf_tensor.flatten().detach().cpu().numpy()\n", - " ff_tensor = np.loadtxt(ff_tensor_filepath, delimiter=',')\n", - "\n", - " len_hf_tensor = hf_tensor.shape[0]\n", - " ff_tensor = ff_tensor[:len_hf_tensor]\n", - " \n", - " mismatches = []\n", - " if not np.allclose(ff_tensor, hf_tensor, atol=tolerance):\n", - " print(f\"mismatch between {hf_tensor_filepath} and {ff_tensor_filepath}\")\n", - " print(f\"HF: {hf_tensor}\\nFF:{ff_tensor}\")\n", - " print(np.isclose(ff_tensor, hf_tensor, atol=tolerance))\n", - " mismatches = np.where(~np.isclose(ff_tensor, hf_tensor, atol=tolerance))[0]\n", - " print(mismatches)\n", - " #print(np.nonzero(hf_tensor)[0])\n", - " # print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0])\n", - " # print(ff_tensor[36], hf_tensor[36])\n", - " 
#assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\n", - " assert(len(mismatches) <= .05*len_hf_tensor)\n", - " print(\"Ok!\")\n", - "def compare_tensors_difference(hf_tensor_filepath, ff_tensor1_filepath, ff_tensor2_filepath, tolerance=1e-2):\n", - " assert(os.path.exists(hf_tensor_filepath))\n", - " assert(os.path.exists(ff_tensor1_filepath))\n", - " assert(os.path.exists(ff_tensor2_filepath))\n", - " hf_tensor = torch.load(hf_tensor_filepath)\n", - " if type(hf_tensor) == tuple or type(hf_tensor) == list:\n", - " assert(len(hf_tensor) == 1)\n", - " hf_tensor = hf_tensor[0]\n", - " hf_tensor = torch.nan_to_num(hf_tensor)\n", - " hf_tensor = hf_tensor.flatten().detach().cpu().numpy()\n", - " ff_tensor1 = np.loadtxt(ff_tensor1_filepath, delimiter=',')\n", - " ff_tensor2 = np.loadtxt(ff_tensor2_filepath, delimiter=',')\n", - "\n", - " len_hf_tensor = hf_tensor.shape[0]\n", - " ff_tensor1 = ff_tensor1[:len_hf_tensor]\n", - " ff_tensor2 = ff_tensor2[:len_hf_tensor]\n", - " ff_tensor = ff_tensor1 - ff_tensor2\n", - " \n", - " mismatches = []\n", - " if not np.allclose(ff_tensor, hf_tensor, atol=tolerance):\n", - " print(f\"mismatch between {hf_tensor_filepath} and {ff_tensor1_filepath} - {ff_tensor2_filepath}\")\n", - " print(f\"HF: {hf_tensor}\\nFF:{ff_tensor}\")\n", - " print(np.isclose(ff_tensor, hf_tensor, atol=tolerance))\n", - " mismatches = np.where(~np.isclose(ff_tensor, hf_tensor, atol=tolerance))[0]\n", - " print(mismatches)\n", - " #print(np.nonzero(hf_tensor)[0])\n", - " # print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0])\n", - " # print(ff_tensor[36], hf_tensor[36])\n", - " #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\n", - " assert(len(mismatches) <= .05*len_hf_tensor)\n", - " print(\"Ok!\")\n", - "def compare_hf_tensors(tensor1_fp, tensor2_fp):\n", - " assert(os.path.exists(tensor1_fp) and os.path.exists(tensor2_fp))\n", - " hf_tensor1 = torch.load(tensor1_fp)\n", - " hf_tensor2 = torch.load(tensor2_fp)\n", - " if type(hf_tensor1) == tuple or type(hf_tensor1) == list:\n", - " assert(len(hf_tensor1) == 1)\n", - " hf_tensor1 = hf_tensor1[0]\n", - " if type(hf_tensor2) == tuple or type(hf_tensor2) == list:\n", - " assert(len(hf_tensor2) == 1)\n", - " hf_tensor2 = hf_tensor2[0]\n", - " assert(torch.squeeze(hf_tensor1).shape == torch.squeeze(hf_tensor2).shape)\n", - " hf_tensor1 = torch.nan_to_num(hf_tensor1)\n", - " hf_tensor2 = torch.nan_to_num(hf_tensor2)\n", - " if not (np.allclose(hf_tensor1.detach().cpu().numpy(), hf_tensor2.detach().cpu().numpy())):\n", - " print(f\"mismatch between {tensor1_fp} and {tensor2_fp}\")\n", - " print(hf_tensor1)\n", - " print(hf_tensor2)\n", - " print(np.isclose(hf_tensor1.detach().cpu().numpy(), hf_tensor2.detach().cpu().numpy()))\n", - " mismatches = np.where(~np.isclose(hf_tensor1.detach().cpu().numpy(), hf_tensor2.detach().cpu().numpy()))[0]\n", - " print(mismatches)\n", - " assert(False)\n", - " print(\"Ok!\")\n", - "\n", - "def check_hf_sum_tensors(tensor_sum_fp, tensor1_fp, tensor2_fp):\n", - " assert(os.path.exists(tensor_sum_fp) and os.path.exists(tensor1_fp) and os.path.exists(tensor2_fp))\n", - " hf_tensor_sum = torch.load(tensor_sum_fp)\n", - " hf_tensor1 = torch.load(tensor1_fp)\n", - " hf_tensor2 = torch.load(tensor2_fp)\n", - " if type(hf_tensor_sum) == tuple or type(hf_tensor_sum) == list:\n", - " assert(len(hf_tensor_sum) == 1)\n", - " hf_tensor_sum = hf_tensor_sum[0]\n", - " if type(hf_tensor1) == tuple or type(hf_tensor1) == list:\n", - " assert(len(hf_tensor1) == 1)\n", - " hf_tensor1 
= hf_tensor1[0]\n", - " if type(hf_tensor2) == tuple or type(hf_tensor2) == list:\n", - " assert(len(hf_tensor2) == 1)\n", - " hf_tensor2 = hf_tensor2[0]\n", - " assert(torch.squeeze(hf_tensor_sum).shape == torch.squeeze(hf_tensor1).shape)\n", - " assert(torch.squeeze(hf_tensor1).shape == torch.squeeze(hf_tensor2).shape)\n", - " hf_tensor1 = torch.nan_to_num(hf_tensor1)\n", - " hf_tensor2 = torch.nan_to_num(hf_tensor2)\n", - " hf_tensor_sum = torch.nan_to_num(hf_tensor_sum)\n", - " sum_check_tensor = hf_tensor1 + hf_tensor2\n", - " if not (np.allclose(sum_check_tensor.detach().cpu().numpy(), hf_tensor_sum.detach().cpu().numpy())):\n", - " print(f\"mismatch between {sum_check_tensor} and {tensor1_fp} + {tensor2_fp}\")\n", - " print(tensor_sum_fp)\n", - " print(sum_check_tensor)\n", - " print(hf_tensor1)\n", - " print(hf_tensor2)\n", - " print(np.isclose(sum_check_tensor.detach().cpu().numpy(), hf_tensor_sum.detach().cpu().numpy()))\n", - " mismatches = np.where(~np.isclose(sum_check_tensor.detach().cpu().numpy(), hf_tensor_sum.detach().cpu().numpy()))[0]\n", - " print(mismatches)\n", - " assert(False)\n", - " print(\"Ok!\")\n", - "def check_hf_zero_tensor(hf_tensor_fp):\n", - " assert(os.path.exists(hf_tensor_fp))\n", - " hf_tensor1 = torch.load(hf_tensor_fp)\n", - " if type(hf_tensor1) == tuple or type(hf_tensor1) == list:\n", - " assert(len(hf_tensor1) == 1)\n", - " hf_tensor1 = hf_tensor1[0]\n", - " assert(torch.count_nonzero(torch.nan_to_num(hf_tensor1)).sum() == 0)\n", - "def print_tensors(hf_tensor_filepath, ff_tensor_filepath, txt=\"\"):\n", - " assert(os.path.exists(hf_tensor_filepath) and os.path.exists(ff_tensor_filepath))\n", - " hf_tensor = torch.load(hf_tensor_filepath)\n", - " if type(hf_tensor) == tuple or type(hf_tensor) == list:\n", - " assert(len(hf_tensor) == 1)\n", - " hf_tensor = hf_tensor[0]\n", - " hf_tensor = torch.nan_to_num(hf_tensor)\n", - " hf_tensor = hf_tensor.flatten().detach().cpu().numpy()\n", - " ff_tensor = np.loadtxt(ff_tensor_filepath, delimiter=',')\n", - "\n", - " len_hf_tensor = hf_tensor.shape[0]\n", - " ff_tensor = ff_tensor[:len_hf_tensor]\n", - "\n", - " print(f\"{txt} - HF tensor:\")\n", - " print(hf_tensor)\n", - " print(f\"{txt} - FF tensor: \")\n", - " print(ff_tensor)\n", - "def compare_flexflow_tensors(ff_tensor1_fp, ff_tensor2_fp, tolerance=1e-5, max_len=-1):\n", - " assert(os.path.exists(ff_tensor1_fp) and os.path.exists(ff_tensor2_fp))\n", - " ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=',')\n", - " ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=',')\n", - "\n", - " if (ff_tensor1.shape != ff_tensor2.shape):\n", - " print(ff_tensor1.shape, ff_tensor2.shape)\n", - " assert(ff_tensor1.shape == ff_tensor2.shape)\n", - "\n", - " if max_len > -1:\n", - " ff_tensor1 = ff_tensor1[:max_len]\n", - " ff_tensor2 = ff_tensor2[:max_len]\n", - " \n", - " mismatches = []\n", - " if not np.allclose(ff_tensor1, ff_tensor2, atol=tolerance):\n", - " print(f\"mismatch between {ff_tensor1_fp} and {ff_tensor2_fp}\")\n", - " print(f\"Tensor1: {ff_tensor1}\\nTensor2:{ff_tensor2}\")\n", - " print(np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))\n", - " mismatches = np.where(~np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))[0]\n", - " print(mismatches)\n", - " #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\n", - " assert(len(mismatches) <= .05*len(ff_tensor1))\n", - " print(\"Ok!\")\n", - "def compare_flexflow_tensors_shortest(ff_tensor1_fp, ff_tensor2_fp, tolerance=1e-5):\n", - " assert(os.path.exists(ff_tensor1_fp) and 
os.path.exists(ff_tensor2_fp))\n", - " ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=',')\n", - " ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=',')\n", - " minlen = min(ff_tensor1.shape[0], ff_tensor2.shape[0])\n", - " ff_tensor1 = ff_tensor1[:minlen]\n", - " ff_tensor2 = ff_tensor2[:minlen]\n", - " mismatches = []\n", - " if not np.allclose(ff_tensor1, ff_tensor2, atol=tolerance):\n", - " print(f\"mismatch between {ff_tensor1_fp} and {ff_tensor2_fp}\")\n", - " print(f\"Tensor1: {ff_tensor1}\\nTensor2:{ff_tensor2}\")\n", - " print(np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))\n", - " mismatches = np.where(~np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))[0]\n", - " print(mismatches)\n", - " #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\n", - " assert(len(mismatches) <= .05*len(ff_tensor1))\n", - " print(\"Ok!\")\n", - "def check_flexflow_tensors_sum(ff_tensor_sum_fp, ff_tensor1_fp, ff_tensor2_fp, tolerance=1e-5):\n", - " assert(os.path.exists(ff_tensor1_fp) and os.path.exists(ff_tensor2_fp))\n", - " ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=',')\n", - " ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=',')\n", - " ff_tensor_sum = np.loadtxt(ff_tensor_sum_fp, delimiter=',')\n", - " \n", - " ff_sum = ff_tensor1 + ff_tensor2\n", - " assert(ff_tensor1.shape == ff_tensor2.shape)\n", - " \n", - " mismatches = []\n", - " if not np.allclose(ff_tensor_sum, ff_sum, atol=tolerance):\n", - " print(f\"mismatch between {ff_tensor_sum_fp} and sum of {ff_tensor1_fp} + {ff_tensor2_fp}\")\n", - " print(f\"Tensor1: {ff_tensor1}\\nTensor2:{ff_tensor2}\")\n", - " print(f\"Sum Tensor: {ff_tensor_sum}\\nActual sum:{ff_sum}\")\n", - " print(np.isclose(ff_tensor_sum, ff_sum, atol=tolerance))\n", - " mismatches = np.where(~np.isclose(ff_tensor_sum, ff_sum, atol=tolerance))[0]\n", - " print(mismatches)\n", - " #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\n", - " assert(len(mismatches) <= .05*len(ff_tensor1))\n", - " print(\"Ok!\")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - 
"Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n" - ] - } - ], - "source": [ - "tot_num_layers = 12\n", - "for layer_num in range(tot_num_layers):\n", - " hf_input_ln_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.input_layernorm.output_0\"\n", - " ff_input_ln_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_RMSNorm_shard-id_0_output_0\"\n", - " if layer_num > 0:\n", - " ff_input_ln_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_norm_shard-id_0_output_1\"\n", - " compare_tensors(hf_input_ln_out, ff_input_ln_out)\n", - " hf_attn_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.self_attn.o_proj.output_0\"\n", - " ff_attn_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_output_0\"\n", - " compare_tensors(hf_attn_out, ff_attn_out)\n", - " hf_ffn_norm_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.post_attention_layernorm.output_0\"\n", - " ff_ffn_norm_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_ffn_norm_shard-id_0_output_1\"\n", - " compare_tensors(hf_ffn_norm_out, ff_ffn_norm_out)\n", - " # w1\n", - " hf_gate_proj_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.gate_proj.output_0\"\n", - " ff_gate_proj_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_output_0\"\n", - " compare_tensors(hf_gate_proj_out, ff_gate_proj_out)\n", - " # w3\n", - " hf_up_proj_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.up_proj.output_0\" \n", - " ff_up_proj_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w3_shard-id_0_output_0\"\n", - " compare_tensors(hf_up_proj_out, ff_up_proj_out)\n", - " # w2\n", - " hf_down_proj_in = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.input_0\"\n", - " hf_down_proj_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.output_0\"\n", - " ff_down_proj_in = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_input_0\"\n", - " ff_down_proj_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_output_0\"\n", - " compare_tensors(hf_down_proj_in, ff_down_proj_in)\n", - " # compare_tensors(hf_down_proj_out, ff_down_proj_out)\n", - " # LORA input\n", - " hf_lora_A_in = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_A.default.input_0\"\n", - " ff_lora_A_in = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_input_0\"\n", - " compare_hf_tensors(hf_down_proj_in, hf_lora_A_in)\n", - " compare_tensors(hf_lora_A_in, ff_lora_A_in)\n", - " # LORA weights\n", - " hf_lora_A_weight_fp = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_A.default.weight\"\n", - " ff_lora_A_weight_fp = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_A\"\n", - " 
compare_tensors(hf_lora_A_weight_fp, ff_lora_A_weight_fp)\n", - " hf_lora_B_weight_fp = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_B.default.weight\"\n", - " ff_lora_B_weight_fp = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_B\"\n", - " compare_tensors(hf_lora_B_weight_fp, ff_lora_B_weight_fp)\n", - " # LORA intermediate hf\n", - " hf_lora_A_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_A.default.output_0\"\n", - " hf_lora_B_in = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_B.default.input_0\"\n", - " compare_hf_tensors(hf_lora_A_out, hf_lora_B_in)\n", - " # LORA output\n", - " hf_lora_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_B.default.output_0\"\n", - " ff_lora_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_output_0\"\n", - " # compare_tensors(hf_lora_out, ff_lora_out)\n", - " # compare_flexflow_tensors(ff_down_proj_out, ff_lora_out)\n", - " # compare_tensors(hf_down_proj_out, ff_lora_out)\n", - " compare_tensors_difference(hf_lora_out, ff_lora_out, ff_down_proj_out)\n", - " \n", - "\n", - "# After last layer only\n", - "hf_norm_out = f\"{hf_weight_base_path}/fwd_step_0_norm.output_0\"\n", - "ff_norm_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{tot_num_layers-1}_layer-name_norm_shard-id_0_output_1\"\n", - "compare_tensors(hf_norm_out, ff_norm_out)\n", - "hf_lm_head_out = f\"{hf_weight_base_path}/fwd_step_0_base_model.model.lm_head.output_0\"\n", - "ff_lm_head_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{tot_num_layers-1}_layer-name_output_shard-id_0_output_0\"\n", - "compare_tensors(hf_lm_head_out, ff_lm_head_out)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n" - ] - } - ], - "source": [ - "tot_num_layers = 12\n", - "\n", - "ff_BWD_softmax_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_100_layer-name_Softmax_shard-id_0_input_0\"\n", - "\n", - "hf_BWD_lm_head_out = f\"{hf_weight_base_path}/bwd_step_0_base_model.model.lm_head.go_0\"\n", - "ff_BWD_lm_head_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_output_shard-id_0_output_0\"\n", - "compare_tensors(hf_BWD_lm_head_out, ff_BWD_lm_head_out, tolerance=1e-5)\n", - "# compare weights\n", - "hf_lm_head_weight = f\"{hf_weight_base_path}/base_model.model.lm_head.weight\"\n", - "ff_lm_head_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{tot_num_layers-1}_layer-name_output_shard-id_0_weight_0\"\n", - "compare_tensors(hf_lm_head_weight, ff_lm_head_weight, tolerance=1e-5)\n", - "hf_BWD_lm_head_in = f\"{hf_weight_base_path}/bwd_step_0_base_model.model.lm_head.gi_0\"\n", - "ff_BWD_lm_head_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_output_shard-id_0_input_0\"\n", - "compare_tensors(hf_BWD_lm_head_in, ff_BWD_lm_head_in, tolerance=1e-5)\n", - "# # Manually check the matmul\n", - "# ff_tensor_out = np.loadtxt(ff_BWD_lm_head_out, delimiter=',')\n", - "# ff_weight = np.loadtxt(ff_lm_head_weight, delimiter=',').reshape((4096,32000), order='F')\n", - "# ff_tensor_out = 
ff_tensor_out[:32000*24].reshape((32000,24), order='F')\n", - "# print(ff_tensor_out.shape)\n", - "# print(ff_weight.shape)\n", - "# print(np.matmul(ff_weight, ff_tensor_out))\n", - "# compare_tensors(hf_BWD_lm_head_in, ff_BWD_lm_head_in)\n", - "# ff_tensor = np.loadtxt(ff_tensor_filepath, delimiter=',')\n", - "\n", - "hf_BWD_norm_out = f\"{hf_weight_base_path}/bwd_step_0_norm.go_0\"\n", - "ff_BWD_norm_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_norm_shard-id_0_output_0\"\n", - "compare_hf_tensors(hf_BWD_lm_head_in, hf_BWD_norm_out)\n", - "compare_tensors(hf_BWD_norm_out, ff_BWD_norm_out)\n", - "ff_BWD_norm_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{tot_num_layers-1}_layer-name_norm_shard-id_0_weight_0\"\n", - "hf_FWD_norm_weight = f\"{hf_weight_base_path}/base_model.model.model.norm.weight\"\n", - "compare_tensors(hf_FWD_norm_weight, ff_BWD_norm_weight, tolerance=1e-5)\n", - "hf_BWD_norm_in = f\"{hf_weight_base_path}/bwd_step_0_norm.gi_0\"\n", - "ff_BWD_norm_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_norm_shard-id_0_input_1\"\n", - "compare_tensors(hf_BWD_norm_in, ff_BWD_norm_in, tolerance=1e-5)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "from torch import nn\n", - "class LlamaRotaryEmbedding(nn.Module):\n", - " def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):\n", - " super().__init__()\n", - "\n", - " self.dim = dim\n", - " self.max_position_embeddings = max_position_embeddings\n", - " self.base = base\n", - " inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))\n", - " self.register_buffer(\"inv_freq\", inv_freq, persistent=False)\n", - "\n", - " # Build here to make `torch.jit.trace` work.\n", - " self._set_cos_sin_cache(\n", - " seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()\n", - " )\n", - "\n", - " def _set_cos_sin_cache(self, seq_len, device, dtype):\n", - " self.max_seq_len_cached = seq_len\n", - " t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)\n", - "\n", - " freqs = torch.einsum(\"i,j->ij\", t, self.inv_freq)\n", - " # Different from paper, but it uses a different permutation in order to obtain the same calculation\n", - " emb = torch.cat((freqs, freqs), dim=-1)\n", - " self.register_buffer(\"cos_cached\", emb.cos().to(dtype), persistent=False)\n", - " self.register_buffer(\"sin_cached\", emb.sin().to(dtype), persistent=False)\n", - "\n", - " def forward(self, x, seq_len=None):\n", - " # x: [bs, num_attention_heads, seq_len, head_size]\n", - " if seq_len > self.max_seq_len_cached:\n", - " self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)\n", - "\n", - " return (\n", - " self.cos_cached[:seq_len].to(dtype=x.dtype),\n", - " self.sin_cached[:seq_len].to(dtype=x.dtype),\n", - " )\n", - "def rotate_half(x):\n", - " \"\"\"Rotates half the hidden dims of the input.\"\"\"\n", - " x1 = x[..., : x.shape[-1] // 2] # first half\n", - " x2 = x[..., x.shape[-1] // 2 :] # second half\n", - " return torch.cat((x2, -x1), dim=-1)\n", - "def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):\n", - " \"\"\"Applies Rotary Position Embedding to the query and key tensors.\n", - "\n", - " Args:\n", - " q (`torch.Tensor`): The query tensor.\n", - " k (`torch.Tensor`): The key tensor.\n", - " cos (`torch.Tensor`): The cosine 
part of the rotary embedding.\n", - " sin (`torch.Tensor`): The sine part of the rotary embedding.\n", - " position_ids (`torch.Tensor`):\n", - " The position indices of the tokens corresponding to the query and key tensors. For example, this can be\n", - " used to pass offsetted position ids when working with a KV-cache.\n", - " unsqueeze_dim (`int`, *optional*, defaults to 1):\n", - " The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and\n", - " sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note\n", - " that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and\n", - " k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes\n", - " cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have\n", - " the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.\n", - " Returns:\n", - " `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.\n", - " \"\"\"\n", - " cos = cos[position_ids].unsqueeze(unsqueeze_dim)\n", - " sin = sin[position_ids].unsqueeze(unsqueeze_dim)\n", - " q_embed = (q * cos) + (rotate_half(q) * sin)\n", - " k_embed = (k * cos) + (rotate_half(k) * sin)\n", - " return q_embed, k_embed\n", - "head_dim = 64\n", - "max_position_embeddings = 2048\n", - "rope_theta=10_000\n", - "kv_seq_len = 24\n", - "rotary_emb = LlamaRotaryEmbedding(\n", - " head_dim,\n", - " max_position_embeddings=max_position_embeddings,\n", - " base=rope_theta,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Huggingface checks:\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "\n", - "FlexFlow checks:\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "\n", - "Huggingface-FlexFlow checks:\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_bwd-step_0_layer-num_11_layer-name_SigmoidSiluMulti_shard-id_0_output_0\n", - "HF: [ 6.4350547e+03 -6.4898600e+05 1.1761116e+05 ... 2.1410337e+01\n", - " 1.2096541e+01 3.6424692e+00]\n", - "FF:[ 6.43506250e+03 -6.48986000e+05 1.17611156e+05 ... 2.14103374e+01\n", - " 1.20965424e+01 3.64246750e+00]\n", - "[ True True True ... True True True]\n", - "[2394]\n", - "Ok!\n", - "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_bwd-step_0_layer-num_11_layer-name_layers_11_feed_forward_w2_shard-id_0_input_0\n", - "HF: [ 6.4350547e+03 -6.4898600e+05 1.1761116e+05 ... 2.1410337e+01\n", - " 1.2096541e+01 3.6424692e+00]\n", - "FF:[ 6.43506250e+03 -6.48986000e+05 1.17611156e+05 ... 2.14103374e+01\n", - " 1.20965424e+01 3.64246750e+00]\n", - "[ True True True ... 
True True True]\n", - "[2394]\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "4.383680555555555% mismatch in QK prods softmax out grad\n", - "3.9116753472222223% mismatch between HF and FF for kproj (before applying ROPE)\n", - "3.9008246527777777% mismatch between HF and FF for kproj (after applying ROPE)\n", - "4.817708333333334% mismatch in attention input grads\n" - ] - }, - { - "ename": "AssertionError", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[11], line 353\u001b[0m\n\u001b[1;32m 349\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpct_mismatch\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m100\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m% mismatch in attention input grads\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 350\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(pct_mismatch \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0.05\u001b[39m)\n\u001b[0;32m--> 353\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n", - "\u001b[0;31mAssertionError\u001b[0m: " - ] - } - ], - "source": [ - "tot_num_layers = 12\n", - "for layer_num in range(tot_num_layers-1, -1, -1):\n", - " # HuggingFace filepaths\n", - " hf_BWD_norm_in = f\"{hf_weight_base_path}/bwd_step_0_norm.gi_0\"\n", - " hf_BWD_loraB_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.lora_B.default.go_0\"\n", - " hf_BWD_loraB_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.lora_B.default.gi_0\"\n", - " hf_BWD_loraA_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.lora_A.default.go_0\"\n", - " hf_BWD_loraA_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.lora_A.default.gi_0\"\n", - " hf_loraA_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_A.default.weight\"\n", - " hf_loraB_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_B.default.weight\"\n", - " hf_BWD_lora_dropout_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.lora_dropout.default.go_0\"\n", - " hf_BWD_lora_dropout_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.lora_dropout.default.gi_0\"\n", - " hf_BWD_w2_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.go_0\"\n", - " hf_BWD_w2_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.down_proj.gi_0\"\n", - " hf_w2_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.weight\"\n", - " hf_BWD_w3_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.up_proj.go_0\"\n", - " hf_BWD_w3_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.up_proj.gi_0\"\n", - " hf_BWD_w1_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.gate_proj.go_0\"\n", - " hf_BWD_w1_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.gate_proj.gi_0\"\n", - " hf_BWD_act_fn_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.act_fn.gi_0\"\n", - " hf_BWD_act_fn_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.mlp.act_fn.go_0\"\n", - " 
hf_BWD_ffn_norm_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.post_attention_layernorm.go_0\"\n", - " hf_BWD_ffn_norm_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.post_attention_layernorm.gi_0\"\n", - " hf_BWD_attn_out_out = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.o_proj.go_0\"\n", - " hf_BWD_attn_q_in = f\"{hf_weight_base_path}/bwd_step_0_layers.11.self_attn.q_proj.gi_0\"\n", - " hf_FWD_w1_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.gate_proj.output_0\"\n", - " hf_FWD_w3_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.up_proj.output_0\"\n", - " hf_FWD_act_fn_out = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.mlp.act_fn.output_0\"\n", - " hf_BWD_attn_oproj_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.o_proj.gi_0\"\n", - " hf_attn_qproj_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.self_attn.q_proj.weight\"\n", - " hf_attn_kproj_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.self_attn.k_proj.weight\"\n", - " hf_attn_vproj_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.self_attn.v_proj.weight\"\n", - " hf_attn_oproj_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.self_attn.o_proj.weight\"\n", - " # hf_BWD_attn_vproj_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.v_proj.gi_0\"\n", - " # FlexFlow filepaths\n", - " ff_BWD_w2_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_output_0\"\n", - " ff_BWD_w2_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_input_0\"\n", - " ff_BWD_w2_in_pre = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_pre_input_0\"\n", - " ff_w2_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_weight_0\"\n", - " ff_BWD_ssm_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_SigmoidSiluMulti_shard-id_0_output_0\"\n", - " ff_BWD_ssm_in1 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_SigmoidSiluMulti_shard-id_0_input_0\"\n", - " ff_BWD_ssm_in2 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_SigmoidSiluMulti_shard-id_0_input_1\"\n", - " ff_BWD_w3_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w3_shard-id_0_output_0\"\n", - " ff_BWD_w3_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w3_shard-id_0_input_0\"\n", - " ff_BWD_lora_A_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_input_0\"\n", - " ff_BWD_lora_B_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_output_0\"\n", - " ff_lora_A_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_A\"\n", - " ff_lora_B_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_B\"\n", - " 
ff_BWD_w1_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_output_0\"\n", - " ff_BWD_w1_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_input_0\"\n", - " ff_BWD_w1_in_pre = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_pre_input_0\"\n", - " ff_w1_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_weight_0\"\n", - " ff_BWD_ffn_norm_in1 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_ffn_norm_shard-id_0_input_0\"\n", - " ff_BWD_ffn_norm_in2 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_ffn_norm_shard-id_0_input_1\"\n", - " ff_BWD_ffn_norm_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_ffn_norm_shard-id_0_output_0\"\n", - " ff_BWD_attn_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_output_0\"\n", - " ff_BWD_attn_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_input_0\"\n", - " ff_BWD_ssm_cached_w1_input = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_SigmoidSiluMulti_shard-id_0_cached_w1_output\"\n", - " ff_BWD_ssm_cached_w3_input = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_SigmoidSiluMulti_shard-id_0_cached_w3_output\"\n", - " ff_FWD_w1_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_output_0\"\n", - " ff_FWD_w3_out = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w3_shard-id_0_output_0\"\n", - " ff_FWD_act_fnc_out = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_SigmoidSiluMulti_shard-id_0_act_fn_output\"\n", - " ff_BWD_attn_o_proj_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_o_proj_in_grad\"\n", - " # ff_BWD_attn_v_proj_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_v_proj_in_grad\"\n", - " ff_attn_oproj_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_11_layer-name_layers_11_attention_shard-id_0_weight_0\"\n", - " # ff_attn_qk_prods_softmax = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax\"\n", - "\n", - " # xxx = torch.load(hf_BWD_attn_out_out)\n", - " # xxx.detach().cpu().numpy().tofile(f\"{hf_BWD_attn_out_out}.flexflow\")\n", - " # print(f\"{hf_BWD_attn_out_out}.flexflow\")\n", - " \n", - " # HuggingFace checks\n", - " print(\"\\nHuggingface checks:\")\n", - " if layer_num == tot_num_layers-1:\n", - " compare_hf_tensors(hf_BWD_norm_in, hf_BWD_loraB_out)\n", - " compare_hf_tensors(hf_BWD_norm_in, hf_BWD_w2_out)\n", - " compare_hf_tensors(hf_BWD_loraB_out, hf_BWD_w2_out)\n", - " compare_hf_tensors(hf_BWD_loraB_in, hf_BWD_loraA_out)\n", - " # compare_hf_tensors(hf_BWD_w3_out, hf_BWD_w2_out)\n", - " compare_hf_tensors(hf_BWD_act_fn_in, hf_BWD_w1_out)\n", - " 
check_hf_sum_tensors(hf_BWD_ffn_norm_out, hf_BWD_w1_in, hf_BWD_w3_in)\n", - " check_hf_sum_tensors(hf_BWD_attn_out_out, hf_BWD_ffn_norm_in, hf_BWD_norm_in)\n", - "\n", - " # FlexFlow checks\n", - " print(\"\\nFlexFlow checks:\")\n", - " compare_flexflow_tensors(ff_BWD_w2_out, ff_BWD_lora_B_out)\n", - " compare_flexflow_tensors(ff_BWD_w2_in_pre, ff_BWD_lora_A_in)\n", - " compare_flexflow_tensors(ff_BWD_w2_in, ff_BWD_ssm_out)\n", - " compare_flexflow_tensors(ff_BWD_ssm_in2, ff_BWD_w3_out)\n", - " compare_flexflow_tensors(ff_BWD_ssm_in1, ff_BWD_w1_out)\n", - " compare_flexflow_tensors(ff_BWD_w1_in, ff_BWD_ffn_norm_out)\n", - " compare_flexflow_tensors(ff_BWD_w1_in_pre, ff_BWD_w3_in)\n", - " compare_flexflow_tensors(ff_BWD_ffn_norm_in1, ff_BWD_ffn_norm_in2, max_len=24*768)\n", - " #compare_flexflow_tensors(ff_BWD_ffn_norm_in2, ff_BWD_attn_out, max_len=24*768) # should fail\n", - "\n", - " # HF-FlexFlow checks\n", - " print(\"\\nHuggingface-FlexFlow checks:\")\n", - " compare_tensors(hf_BWD_w2_out, ff_BWD_w2_out, tolerance=1e-5)\n", - " compare_tensors(hf_w2_weight, ff_w2_weight, tolerance=1e-5)\n", - " #print(torch.load(hf_w2_weight).shape)\n", - " compare_tensors(hf_loraA_weight, ff_lora_A_weight, tolerance=1e-5)\n", - " compare_tensors(hf_loraB_weight, ff_lora_B_weight, tolerance=1e-5)\n", - "\n", - " compare_tensors(hf_BWD_loraB_out, ff_BWD_lora_B_out)\n", - " compare_tensors(hf_BWD_loraA_in, ff_BWD_lora_A_in)\n", - "\n", - " compare_tensors(hf_BWD_w2_in, ff_BWD_ssm_out)\n", - " compare_tensors(hf_BWD_w2_in, ff_BWD_w2_in)\n", - " compare_tensors(hf_BWD_w1_out, ff_BWD_w1_out)\n", - " compare_tensors_difference(hf_BWD_w1_in, ff_BWD_w1_in, ff_BWD_w1_in_pre)\n", - "\n", - " compare_tensors(hf_FWD_w1_out, ff_FWD_w1_out)\n", - " compare_tensors(hf_FWD_w3_out, ff_FWD_w3_out)\n", - " compare_tensors(hf_BWD_w3_out, ff_BWD_w3_out)\n", - " compare_tensors(hf_BWD_w3_in, ff_BWD_w3_in)\n", - " compare_tensors(hf_BWD_w1_out, ff_BWD_w1_out)\n", - " # compare_tensors(hf_BWD_ffn_norm_out, ff_BWD_ffn_norm_out)\n", - " # compare_tensors(hf_BWD_ffn_norm_in, ff_BWD_ffn_norm_in2)\n", - " # compare_tensors(hf_BWD_attn_out_out, ff_BWD_ffn_norm_in2)\n", - " compare_tensors(hf_BWD_attn_out_out, ff_BWD_attn_out)\n", - "\n", - " # compare attn weight tensors\n", - " hidden_size = 768\n", - " qProjSize = 64\n", - " num_heads = 12\n", - " num_new_tokens = num_tokens = 24\n", - " ff_attn_weight_tensor = np.loadtxt(ff_attn_oproj_weight, delimiter=',')\n", - " ff_attn_qproj_weight_tensor = ff_attn_weight_tensor[:hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F')\n", - " ff_attn_kproj_weight_tensor = ff_attn_weight_tensor[hidden_size*qProjSize*num_heads:2*hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F')\n", - " ff_attn_vproj_weight_tensor = ff_attn_weight_tensor[2*hidden_size*qProjSize*num_heads:3*hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F')\n", - " ff_attn_oproj_weight_tensor = ff_attn_weight_tensor[3*hidden_size*qProjSize*num_heads:].reshape((qProjSize*num_heads,hidden_size), order='F')\n", - " \n", - " hf_attn_qproj_weight_tensor = torch.load(hf_attn_qproj_weight).T.detach().cpu().numpy()\n", - " hf_attn_kproj_weight_tensor = torch.load(hf_attn_kproj_weight).T.detach().cpu().numpy()\n", - " hf_attn_vproj_weight_tensor = torch.load(hf_attn_vproj_weight).T.detach().cpu().numpy()\n", - " hf_attn_oproj_weight_tensor = torch.load(hf_attn_oproj_weight).T.detach().cpu().numpy()\n", - " \n", - " 
assert(np.allclose(ff_attn_qproj_weight_tensor, hf_attn_qproj_weight_tensor, atol=1e-5))\n", - " assert(np.allclose(ff_attn_kproj_weight_tensor, hf_attn_kproj_weight_tensor, atol=1e-5))\n", - " assert(np.allclose(ff_attn_vproj_weight_tensor, hf_attn_vproj_weight_tensor, atol=1e-5))\n", - " assert(np.allclose(ff_attn_oproj_weight_tensor, hf_attn_oproj_weight_tensor, atol=1e-5))\n", - " \n", - " # Compare attn outproj grad in tensors\n", - " compare_tensors(hf_BWD_attn_oproj_in, ff_BWD_attn_o_proj_in)\n", - " \n", - " ########### Compare value projs grads ######################\n", - " # 1. compare qk prods softmax\n", - " hf_qk_prods_softmax = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.self_attn.qk_prods_softmax\"\n", - " ff_attn_qk_prods_softmax = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax\"\n", - " \n", - " hf_qk_prods_softmax = torch.load(hf_qk_prods_softmax)\n", - " ff_qk_prods_softmax = np.loadtxt(ff_attn_qk_prods_softmax, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", - "\n", - " for head_idx in range(num_heads):\n", - " hf_qkps = hf_qk_prods_softmax.squeeze()[head_idx, :, :].detach().cpu().numpy()\n", - " ff_qkps = ff_qk_prods_softmax[:,:,head_idx]\n", - " assert(np.allclose(ff_qkps, hf_qkps, atol=1e-5))\n", - " \n", - " # 2. compare attn heads grads\n", - " hf_attn_heads_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.o_proj.gi_0\"\n", - " ff_attn_heads_grads = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_o_proj_in_grad\"\n", - "\n", - " hf_attn_heads_grads = torch.load(hf_attn_heads_grads).T.squeeze().detach().cpu().numpy()\n", - " ff_attn_heads_grads = np.loadtxt(ff_attn_heads_grads, delimiter=',').reshape((qProjSize*num_heads, num_new_tokens), order = 'F')\n", - " assert(np.allclose(ff_attn_heads_grads, hf_attn_heads_grads, atol=1e-2))\n", - "\n", - " # 3. 
vproj grads\n", - " hf_vproj_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.v_proj.go_0\"\n", - " ff_vproj_grads = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_v_proj_in_grad\"\n", - "\n", - " hf_vproj_grads = torch.load(hf_vproj_grads).squeeze().detach().cpu().numpy()\n", - " ff_vproj_grads = np.loadtxt(ff_vproj_grads, delimiter=',').reshape((num_tokens, qProjSize*num_heads), order='F')\n", - " assert(np.allclose(hf_vproj_grads, ff_vproj_grads, atol=1e-2))\n", - "\n", - " \n", - " \n", - " \n", - " ##############################\n", - " hf_value_states = f\"{hf_weight_base_path}/fwd_step_0_layers.11.self_attn.value_states\"\n", - " hf_value_states = torch.load(hf_value_states).squeeze().permute(2,0,1).detach().cpu().numpy()\n", - " # print(hf_value_states.shape)\n", - " ff_value_states = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_vcache\"\n", - " ff_value_states = np.loadtxt(ff_value_states, delimiter=',').reshape((qProjSize, num_heads, num_tokens), order='F')\n", - " # print(ff_value_states.shape)\n", - " assert(np.allclose(hf_value_states, ff_value_states, atol=1e-2))\n", - " \n", - " \n", - " \n", - " ########## Compare key and query projs grads ##################\n", - " ff_devQKVPRojArray = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_devQKVPRojArray\"\n", - " ff_devQKVPRojArray = np.loadtxt(ff_devQKVPRojArray, delimiter=',').reshape((num_tokens, qProjSize*num_heads, 3), order = 'F')\n", - " ff_qProjGrads = ff_devQKVPRojArray[:,:,0]\n", - " ff_kProjGrads = ff_devQKVPRojArray[:,:,1]\n", - " ff_vProjGrads = ff_devQKVPRojArray[:,:,2]\n", - " assert(np.allclose(ff_vProjGrads, ff_vproj_grads, atol=1e-5))\n", - "\n", - " # simulate qk_prods_softmax\n", - " ff_attn_heads_grads = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_o_proj_in_grad\"\n", - " ff_attn_heads_grads = np.loadtxt(ff_attn_heads_grads, delimiter=',').reshape((qProjSize,num_heads, num_new_tokens), order = 'F')\n", - " ff_attn_heads_grads = torch.from_numpy(ff_attn_heads_grads)\n", - " ff_attn_heads_grads = ff_attn_heads_grads.permute(1,2,0)\n", - " ff_value_states = torch.from_numpy(ff_value_states)\n", - " ff_value_states = ff_value_states.permute(1,0,2)\n", - " # print(ff_attn_heads_grads.shape)\n", - " # print(ff_value_states.shape)\n", - " simulated_qk_prods_softmax_grads = torch.matmul(ff_attn_heads_grads, ff_value_states)\n", - " #simulated_qk_prods_softmax_grads = simulated_qk_prods_softmax_grads\n", - " #print(\"Simulated QK prods grads:\")\n", - " #print(simulated_qk_prods_softmax_grads[0,:,:])\n", - "\n", - " # qk prods softmax right before softmax\n", - " hf_qk_prods_softmax2 = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.softmax_op.go_0\"\n", - " hf_qk_prods_softmax2 = torch.load(hf_qk_prods_softmax2)\n", - " ff_qk_prods_softmax2 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax_grad\"\n", - " ff_qk_prods_softmax2 = np.loadtxt(ff_qk_prods_softmax2, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", - " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.squeeze().permute(1,2,0)\n", - " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.detach().cpu().numpy()\n", - " # 
assert(np.allclose(ff_qk_prods_softmax2, hf_qk_prods_softmax2, atol=1e-2))\n", - " mismatches = np.where(~np.isclose(ff_qk_prods_softmax2, hf_qk_prods_softmax2))\n", - " mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", - " pct_mismatch = len(mismatches) / (hf_qk_prods_softmax2.shape[0] * hf_qk_prods_softmax2.shape[1] * hf_qk_prods_softmax2.shape[2])\n", - " print(f\"{pct_mismatch*100}% mismatch in QK prods softmax out grad\")\n", - " assert(pct_mismatch <= 0.05)\n", - "\n", - " # qk prods softmax right after softmax\n", - " hf_qk_prods_softmax2 = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.softmax_op.gi_0\"\n", - " hf_qk_prods_softmax2 = torch.load(hf_qk_prods_softmax2)\n", - " ff_qk_prods_softmax2 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax_grad_in\"\n", - " ff_qk_prods_softmax2 = np.loadtxt(ff_qk_prods_softmax2, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", - " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.squeeze().permute(1,2,0)\n", - " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.detach().cpu().numpy()\n", - " assert(np.allclose(ff_qk_prods_softmax2, hf_qk_prods_softmax2, atol=1e-2))\n", - " \n", - " # qk prods softmax after mask\n", - " hf_qk_prods_softmax2 = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.matmul_op.go_0\"\n", - " hf_qk_prods_softmax2 = torch.load(hf_qk_prods_softmax2)\n", - " ff_qk_prods_softmax2 = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax_grad_in_masked\"\n", - " ff_qk_prods_softmax2 = np.loadtxt(ff_qk_prods_softmax2, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", - " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.squeeze().permute(1,2,0)\n", - " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.detach().cpu().numpy()\n", - " assert(np.allclose(ff_qk_prods_softmax2, hf_qk_prods_softmax2, atol=1e-2))\n", - "\n", - " # Compare query activation\n", - " hf_query_activation = f\"{hf_weight_base_path}/fwd_step_0_layers.{layer_num}.self_attn.query_activation\"\n", - " hf_query_activation = torch.load(hf_query_activation)\n", - " ff_query_activation = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_query_activation\"\n", - " ff_query_activation = np.loadtxt(ff_query_activation, delimiter=',').reshape((qProjSize, num_heads, num_new_tokens), order = 'F')\n", - " hf_query_activation = hf_query_activation.squeeze().permute(2,0,1).detach().cpu().numpy()\n", - " assert(np.allclose(ff_query_activation, hf_query_activation, atol=1e-2))\n", - " \n", - " ########################################## ROPE and Kproj ##########################################\n", - "\n", - " # Compare FF kproj with intermediate kproj data from HF\n", - " hf_kproj_grads_post_rotary = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.identity_kv_post_rotary.go_0\"\n", - " hf_kproj_grads_post_rotary = torch.load(hf_kproj_grads_post_rotary)\n", - " hf_kproj_grads_post_rotary_copy = hf_kproj_grads_post_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", - " # print(\"hf_kproj_grads_post_rotary: \", hf_kproj_grads_post_rotary_copy.shape)\n", - " # print(hf_kproj_grads_post_rotary_copy[:,:,0])\n", - " # Check hf ROPE \n", - " cos, sin = rotary_emb(hf_kproj_grads_post_rotary, seq_len=24)\n", 
- " cos = cos.cuda()\n", - " sin = sin.cuda()\n", - " # query_states: torch.Size([1, 12, 24, 64])\n", - " # key_states: torch.Size([1, 12, 24, 64])\n", - " # position_ids: torch.Size([1, 24])\n", - " # tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", - " # 18, 19, 20, 21, 22, 23]], device='cuda:0')\n", - " query_states = torch.zeros([1, 12, 24, 64]).cuda()\n", - " position_ids = torch.arange(24).unsqueeze(0).cuda()\n", - " query_states, hf_kproj_grads_post_rotary = apply_rotary_pos_emb(query_states, hf_kproj_grads_post_rotary, cos, sin, position_ids)\n", - " hf_kproj_grads_post_rotary = hf_kproj_grads_post_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", - " # print(\"hf_kproj_grads_post_rotary: \", hf_kproj_grads_post_rotary.shape)\n", - " # print(hf_kproj_grads_post_rotary[:,:,0])\n", - " \n", - " hf_kproj_grads_before_rotary = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.identity_kv_before_rotary.go_0\"\n", - " hf_kproj_grads_before_rotary = torch.load(hf_kproj_grads_before_rotary)\n", - " hf_kproj_grads_before_rotary = hf_kproj_grads_before_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", - " # print(\"hf_kproj_grads_before_rotary: \", hf_kproj_grads_before_rotary.shape)\n", - " # print(hf_kproj_grads_before_rotary[:,:,0])\n", - " # Compare HF rope with manual ROPE\n", - " assert(np.allclose(hf_kproj_grads_post_rotary, hf_kproj_grads_before_rotary, atol=1e-5))\n", - " # Compare HF Kproj with FF Kproj (before ROPE) \n", - " ff_kproj_pre = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_devkproj_pre\"\n", - " ff_kproj_pre = np.loadtxt(ff_kproj_pre, delimiter=',').reshape((num_tokens, qProjSize, num_heads), order = 'F')\n", - " # print(\"ff_kproj_pre: \", ff_kproj_pre.shape)\n", - " #print(ff_kproj_pre[:,:,0])\n", - " mismatches = np.where(~np.isclose(ff_kproj_pre, hf_kproj_grads_post_rotary_copy, atol=1e-5))\n", - " mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", - " pct_mismatch = len(mismatches) / (ff_kproj_pre.shape[0] * ff_kproj_pre.shape[1] * ff_kproj_pre.shape[2])\n", - " print(f\"{pct_mismatch*100}% mismatch between HF and FF for kproj (before applying ROPE)\")\n", - " assert(pct_mismatch <= 0.05)\n", - " #assert(np.allclose(ff_kproj_pre, hf_kproj_grads_post_rotary_copy, atol=1e-5))\n", - " \n", - " ff_kproj = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_devkproj\"\n", - " ff_kproj = np.loadtxt(ff_kproj, delimiter=',').reshape((num_tokens, qProjSize, num_heads), order = 'F')\n", - " # print(\"ff_kproj: \", ff_kproj.shape)\n", - " #print(ff_kproj[:,:,0])\n", - " mismatches = np.where(~np.isclose(ff_kproj, hf_kproj_grads_before_rotary, atol=1e-5))\n", - " mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", - " pct_mismatch = len(mismatches) / (ff_kproj.shape[0] * ff_kproj.shape[1] * ff_kproj.shape[2])\n", - " print(f\"{pct_mismatch*100}% mismatch between HF and FF for kproj (after applying ROPE)\")\n", - " assert(pct_mismatch <= 0.05)\n", - " #assert(np.allclose(ff_kproj, hf_kproj_grads_before_rotary, atol=1e-5))\n", - " \n", - " \n", - " #assert(np.allclose(hf_kproj_grads_post_rotary, hf_kproj_grads_before_rotary, atol=1e-2))\n", - " hf_kproj_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.k_proj.go_0\"\n", - " hf_kproj_grads = 
torch.load(hf_kproj_grads).squeeze()\n", - " #print(\"hf_kproj_grads: \", hf_kproj_grads.shape)\n", - " #print(hf_kproj_grads[:,:64])\n", - " reshaped_tensor = hf_kproj_grads.view(24, 12, 64).transpose(1, 2).contiguous().detach().cpu().numpy()\n", - " #print(reshaped_tensor.shape)\n", - " assert(np.allclose(ff_kproj, reshaped_tensor, atol=1e-2))\n", - "\n", - " ########################################## Qproj (with ROPE) ##########################################\n", - "\n", - " # Compare QProj\n", - " hf_qproj_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.q_proj.go_0\"\n", - " hf_qproj_grads = torch.load(hf_qproj_grads).squeeze()\n", - " # print(\"HF Qproj:\")\n", - " # print(hf_qproj_grads.shape)\n", - " reshaped_tensor = hf_qproj_grads.view(24, 12, 64).transpose(1, 2).contiguous().detach().cpu().numpy()\n", - " # print(\"\\t reshaped: \", reshaped_tensor.shape)\n", - " # print(reshaped_tensor[:,:,0])\n", - " ff_qproj = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_devQKVPRojArray\"\n", - " ff_qproj = np.loadtxt(ff_qproj, delimiter=',').reshape((num_tokens, qProjSize, num_heads, 3), order = 'F')[:,:,:,0]\n", - " # print(\"FF Qproj:\")\n", - " # print(ff_qproj.shape)\n", - " # print(ff_qproj[:,:,0])\n", - " assert(np.allclose(ff_qproj, reshaped_tensor, atol=1e-2))\n", - "\n", - " hf_attn_in = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.input_layernorm.go_0\"\n", - " hf_attn_in = torch.load(hf_attn_in)\n", - " # print(\"hf_attn_in: \", hf_attn_in.shape)\n", - " hf_attn_in = hf_attn_in.squeeze().T\n", - " hf_attn_in = hf_attn_in.detach().cpu().numpy()\n", - " # print(\"hf_attn_in: \", hf_attn_in.shape)\n", - " # print(hf_attn_in)\n", - "\n", - " ff_attn_in = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_attn_final_grad_in\"\n", - " ff_attn_in = np.loadtxt(ff_attn_in, delimiter=',').reshape((768,num_tokens), order = 'F')\n", - " # print(\"ff_attn_in: \", ff_attn_in.shape)\n", - " # print(ff_attn_in)\n", - " #assert(np.allclose(ff_attn_in, hf_attn_in, atol=1e-2))\n", - "\n", - " mismatches = np.where(~np.isclose(ff_attn_in, hf_attn_in))\n", - " mismatches = [(mismatches[0][i], mismatches[1][i]) for i in range(len(mismatches[0]))]\n", - " pct_mismatch = len(mismatches) / (hf_attn_in.shape[0] * hf_attn_in.shape[1])\n", - " print(f\"{pct_mismatch*100}% mismatch in attention input grads\")\n", - " assert(pct_mismatch <= 0.05)\n", - " \n", - "\n", - " assert False" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "torch.Size([12, 24, 64])\n", - "tensor([[-1.5730e-02, -4.1161e-02, 3.0593e-02, ..., 3.8630e-01,\n", - " 3.2884e-01, 3.6067e-01],\n", - " [-2.8613e+01, -5.5872e+00, 2.9385e+01, ..., 3.8782e+01,\n", - " 9.6901e+01, 9.8470e+01],\n", - " [ 3.3027e+00, 1.8276e-01, -1.8497e+00, ..., -4.4052e+01,\n", - " -2.0010e+01, -2.9788e+01],\n", - " ...,\n", - " [-7.6471e-02, -1.8892e-01, 3.6430e-01, ..., -2.7493e-01,\n", - " 5.7017e-01, -1.5986e-01],\n", - " [ 2.5780e+00, -1.8153e+00, 2.5088e+00, ..., -1.0776e+01,\n", - " 6.2167e-01, 8.3755e-01],\n", - " [-6.8324e-02, 1.7568e-01, -3.2311e-01, ..., 3.1202e+00,\n", - " -2.6652e-01, -1.1917e+00]])\n", - "(24, 64, 12)\n", - "[[-1.5729919e-02 -4.1160699e-02 3.0592799e-02 ... 
3.8629669e-01\n", - " 3.2884139e-01 3.6066702e-01]\n", - " [-2.8613457e+01 -5.5871558e+00 2.9384506e+01 ... 3.8781765e+01\n", - " 9.6900581e+01 9.8469597e+01]\n", - " [ 3.3027239e+00 1.8275940e-01 -1.8496730e+00 ... -4.4052174e+01\n", - " -2.0009745e+01 -2.9787930e+01]\n", - " ...\n", - " [-7.6470733e-02 -1.8891659e-01 3.6430117e-01 ... -2.7492592e-01\n", - " 5.7017130e-01 -1.5985624e-01]\n", - " [ 2.5780225e+00 -1.8152566e+00 2.5087588e+00 ... -1.0776262e+01\n", - " 6.2166649e-01 8.3755457e-01]\n", - " [-6.8324409e-02 1.7568478e-01 -3.2310838e-01 ... 3.1202292e+00\n", - " -2.6652411e-01 -1.1917179e+00]]\n" - ] - } - ], - "source": [ - "# value states: torch.Size([1, 12, 24, 64])\n", - "value_states=torch.from_numpy(hf_kproj_grads_post_rotary).permute(2,0,1).unsqueeze(0)\n", - "key_states = value_states\n", - "cos, sin = rotary_emb(value_states, seq_len=kv_seq_len)\n", - "# query_states: torch.Size([1, 12, 24, 64])\n", - "# key_states: torch.Size([1, 12, 24, 64])\n", - "# position_ids: torch.Size([1, 24])\n", - "# tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", - "# 18, 19, 20, 21, 22, 23]], device='cuda:0')\n", - "query_states = torch.zeros([1, 12, 24, 64])\n", - "position_ids = torch.arange(24).unsqueeze(0)\n", - "query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)\n", - "key_states = key_states.squeeze()\n", - "print(key_states.shape)\n", - "print(key_states[0,:,:])\n", - "print(hf_kproj_grads_before_rotary.shape)\n", - "print(hf_kproj_grads_before_rotary[:,:,0])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", - " 18, 19, 20, 21, 22, 23]], device='cuda:0')" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "torch.arange(24).unsqueeze(0).cuda()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "torch.Size([1, 12, 24, 24])\n" - ] - }, - { - "ename": "AssertionError", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/alignment_tests.ipynb Cell 6\u001b[0m line \u001b[0;36m1\n\u001b[1;32m 17\u001b[0m ff_qkps \u001b[39m=\u001b[39m ff_qk_prods_softmax[:,:,head_idx]\n\u001b[1;32m 18\u001b[0m \u001b[39massert\u001b[39;00m(np\u001b[39m.\u001b[39mallclose(ff_qkps, hf_qkps, atol\u001b[39m=\u001b[39m\u001b[39m1e-5\u001b[39m))\n\u001b[0;32m---> 19\u001b[0m \u001b[39massert\u001b[39;00m(\u001b[39mFalse\u001b[39;00m)\n\u001b[1;32m 21\u001b[0m hf_value_states \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39mload(hf_value_states)\u001b[39m#.squeeze().T.detach().cpu().numpy()\u001b[39;00m\n\u001b[1;32m 22\u001b[0m \u001b[39mprint\u001b[39m(hf_value_states\u001b[39m.\u001b[39mshape)\n", - "\u001b[0;31mAssertionError\u001b[0m: " - ] - } - ], - "source": [ - "layer_num = 11\n", - "hf_qk_prods_softmax = f\"{hf_weight_base_path}/fwd_step_0_layers.11.self_attn.qk_prods_softmax\"\n", - "ff_qk_prods_softmax = f\"{ff_weight_base_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax\"\n", - "\n", - 
"hf_value_states = f\"{hf_weight_base_path}/fwd_step_0_layers.11.self_attn.value_states\"\n", - "\n", - "hf_qk_prods_softmax = torch.load(hf_qk_prods_softmax)#.squeeze().T.detach().cpu().numpy()\n", - "ff_qk_prods_softmax = np.loadtxt(ff_qk_prods_softmax, delimiter=',').reshape((24, 24, 12), order = 'F')\n", - "print(hf_qk_prods_softmax.shape)\n", - "#print(ff_qk_prods_softmax.shape)\n", - "#print(hf_qk_prods_softmax[:,:,0])\n", - "#print()\n", - "#print(ff_qk_prods_softmax[:,:,0])\n", - "\n", - "for head_idx in range(12):\n", - " hf_qkps = hf_qk_prods_softmax.squeeze()[head_idx, :, :].detach().cpu().numpy()\n", - " ff_qkps = ff_qk_prods_softmax[:,:,head_idx]\n", - " assert(np.allclose(ff_qkps, hf_qkps, atol=1e-5))\n", - "\n", - "\n", - "hf_value_states = torch.load(hf_value_states)#.squeeze().T.detach().cpu().numpy()\n", - "print(hf_value_states.shape)\n", - "attn_output = torch.matmul(hf_qk_prods_softmax, hf_value_states)\n", - "print()\n", - "print(attn_output.shape)\n", - "print(attn_output.transpose(1, 2).contiguous().shape)\n", - "print(\"Hf attn heads\")\n", - "print(torch.load(\"/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_layers.11.self_attn.o_proj.input_0\").shape)\n", - "\n", - "print(\"Attn heads grads:\")\n", - "hf_attn_heads_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.o_proj.gi_0\"\n", - "print(torch.load(hf_attn_heads_grads).shape)\n", - "print(\"HF value grads:\")\n", - "vproj_grads = f\"{hf_weight_base_path}/bwd_step_0_layers.{layer_num}.self_attn.v_proj.gi_0\"\n", - "print(torch.load(vproj_grads).shape)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "torch.Size([2, 3, 4])\n", - "torch.Size([4, 3, 2])\n" - ] - } - ], - "source": [ - "a = torch.randn(2,3,4)\n", - "print(a.shape)\n", - "print(a.T.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tensor([[[ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000,\n", - " 0.0000],\n", - " [ 27.8890, -21.5089, 45.8214, ..., 5.4010, -10.8787,\n", - " 39.7619],\n", - " [ 19.2197, 27.4681, -68.7141, ..., 102.3280, 66.7925,\n", - " -160.8711],\n", - " ...,\n", - " [ 63.9532, 17.4273, -29.4416, ..., 101.6105, 67.5937,\n", - " -198.4432],\n", - " [ 31.2799, 13.0724, -44.7179, ..., 132.4898, 42.3135,\n", - " -194.4037],\n", - " [ 42.3453, -16.2693, -55.7386, ..., 90.5921, 52.2032,\n", - " -124.1802]]], device='cuda:0')\n", - "tensor([[[-1.1845e+06, -6.7460e+05, 7.4494e+05, ..., -9.1441e+05,\n", - " -1.4912e+05, 3.5769e+06],\n", - " [-7.3920e+01, -7.9389e+01, 1.1027e+02, ..., -7.3020e+01,\n", - " -2.3540e+01, 3.4587e+02],\n", - " [-5.3885e+01, -1.7373e+01, -1.9780e+01, ..., 4.1291e+01,\n", - " 5.5099e+01, 5.5910e+01],\n", - " ...,\n", - " [-2.1948e+01, -3.2109e+01, 2.8364e+01, ..., 3.4321e+01,\n", - " 5.0713e+01, 5.6592e+01],\n", - " [-4.4339e+01, -2.8339e+01, 1.4070e+01, ..., 6.2797e+01,\n", - " 3.0760e+01, 6.1743e+01],\n", - " [-1.6287e+01, -5.0413e+01, -1.9940e+01, ..., 4.3766e+01,\n", - " 4.7833e+01, 4.7295e+01]]], device='cuda:0')\n" - ] - } - ], - "source": [ - "a = \"./hf_peft_tensors/bwd_step_0_layers.11.post_attention_layernorm.gi_0\"\n", - "b = \"./hf_peft_tensors/bwd_step_0_layers.11.self_attn.o_proj.go_0\"\n", - "a = torch.load(a)\n", - "b = torch.load(b)\n", - "print(a)\n", - "print(b)" - ] - }, - { - "cell_type": "code", - "execution_count": 
null, - "metadata": {}, - "outputs": [], - "source": [ - "# # Manual matmul checks\n", - "# ff_w2_grad_out_tensor = np.loadtxt(ff_BWD_w2_out, delimiter=',').reshape((768,128), order='F')\n", - "# ff_w2_weight_tensor = np.loadtxt(ff_w2_weight, delimiter=',').reshape((3072,768), order='F')\n", - "# ff_w2_gradin_tensor = np.matmul(ff_w2_weight_tensor, ff_w2_grad_out_tensor).reshape((3072,128), order='F')\n", - "\n", - "# ff_lora_gradout_tensor = np.loadtxt(ff_BWD_lora_B_out, delimiter=',').reshape((768,128), order='F')\n", - "# ff_lora_A_weight_tensor = np.loadtxt(ff_lora_A_weight, delimiter=',').reshape((3072,16), order='F')\n", - "# ff_lora_B_weight_tensor = np.loadtxt(ff_lora_B_weight, delimiter=',').reshape((16,768), order='F')\n", - "# ff_lora_int_grad_tensor = np.matmul(ff_lora_B_weight_tensor, ff_lora_gradout_tensor)\n", - "# ff_lora_gradint_tensor = np.matmul(ff_lora_A_weight_tensor, ff_lora_int_grad_tensor)\n", - "\n", - "# # ff_w2_gradin_tensor = ff_w2_gradin_tensor + ff_lora_gradint_tensor\n", - "# #print(ff_w2_gradin_tensor[:,:24])\n", - "# print(\"calculated LORA grad in\")\n", - "# print(ff_lora_gradint_tensor[:,:24])\n", - "# # ff_BWD_w2_in_pre_tensor = np.loadtxt(ff_BWD_w2_in_pre, delimiter=',').reshape((3072,128), order='F')\n", - "# ff_BWD_lora_A_in_tensor = np.loadtxt(ff_BWD_lora_A_in, delimiter=',').reshape((3072,128), order='F')\n", - "# print(\"FlexFlow LORA grad in\")\n", - "# print(ff_BWD_lora_A_in_tensor[:,:24])\n", - "# # print(ff_BWD_w2_in_pre_tensor[:,:24])\n", - "# print(\"HF lora grad in\")\n", - "# print(torch.load(hf_BWD_loraA_in).squeeze().T.detach().cpu().numpy())\n", - "# compare_tensors(hf_BWD_loraA_in, ff_BWD_lora_A_in)\n", - "\n", - "# simulate act_fn_grad\n", - "# ssm_out_grad_tensor = np.loadtxt(ff_BWD_ssm_out, delimiter=',').reshape((3072,128), order='F')\n", - "# w3_fwd_out_tensor = np.loadtxt(ff_FWD_w3_out, delimiter=',').reshape((3072,128), order='F')\n", - "# #print(ssm_out_grad_tensor.shape, w3_fwd_out_tensor.shape)\n", - "# act_fn_out_check = np.multiply(ssm_out_grad_tensor, w3_fwd_out_tensor)\n", - "# print(\"simulated act fn out - simulated\")\n", - "# print(act_fn_out_check[:,:24])\n", - "# print(\"simulated act fn out - HF\")\n", - "# print(torch.load(hf_BWD_act_fn_out).detach().cpu().numpy().squeeze().T)\n", - "\n", - "# Simulated w3_grad\n", - "# ssm_out_grad_tensor = np.loadtxt(ff_BWD_ssm_out, delimiter=',').reshape((3072,128), order='F')[:,:24]\n", - "# act_fnc_out_tensor = np.loadtxt(ff_FWD_act_fnc_out, delimiter=',').reshape((3072,24), order='F')\n", - "# w3_out_gard_check = np.multiply(ssm_out_grad_tensor, act_fnc_out_tensor)\n", - "# print(\"simulated w3 out - FF\")\n", - "# print(w3_out_gard_check)\n", - "# ff_BWD_w3_out_tensor = np.loadtxt(ff_BWD_w3_out, delimiter=',').reshape((3072,128), order='F')\n", - "# hf_BWD_w3_out_tensor = torch.load(hf_BWD_w3_out).detach().cpu().numpy().squeeze().T\n", - "# print(\"w3 out, FF\")\n", - "# print(ff_BWD_w3_out_tensor[:,:24])\n", - "# print(\"w3 out, HF\")\n", - "# print(hf_BWD_w3_out_tensor)\n", - "\n", - "# print_tensors(hf_BWD_w3_out, ff_BWD_w3_out, \"w3 out\")\n", - "# assert False\n", - "# print()\n", - "# print()\n", - "# print_tensors(hf_BWD_w3_out, ff_BWD_w3_out, \"w3 out\")\n", - "# print_tensors(hf_BWD_w3_in, ff_BWD_w3_in, \"w3 in\")\n", - "# print_tensors(hf_BWD_w1_out, ff_BWD_w1_out, \"w1 out\")\n", - "# print_tensors(hf_BWD_w1_in, ff_BWD_w1_in, \"w1 in\")\n", - "# print_tensors(hf_BWD_ffn_norm_out, ff_BWD_ffn_norm_out, \"ffn norm out\")\n", - "# print_tensors(hf_BWD_ffn_norm_in, 
ff_BWD_ffn_norm_in2, \"ffn norm in\")\n", - "# print()\n", - "# ff_w1_out_tensor = np.loadtxt(ff_BWD_w1_out, delimiter=',').reshape((3072,128), order='F')\n", - "# ff_w1_in_tensor = np.loadtxt(ff_BWD_w1_in, delimiter=',').reshape((768,128), order='F')\n", - "# ff_w1_in_pre_tensor = np.loadtxt(ff_BWD_w1_in_pre, delimiter=',').reshape((768,128), order='F')\n", - "# ff_w1_only_in_tensor = ff_w1_in_tensor - ff_w1_in_pre_tensor\n", - "# ff_w1_weight_tensor = np.loadtxt(ff_w1_weight, delimiter=',').reshape((768,3072), order='F')\n", - "# ff_w1_in_check_tensor = np.matmul(ff_w1_weight_tensor, ff_w1_out_tensor)\n", - "# print(\"W1 in (simulated):\")\n", - "# print(ff_w1_in_check_tensor[:,:24])\n", - "# print(\"W1 in (FF):\")\n", - "# print(ff_w1_only_in_tensor[:,:24])\n", - "# print(\"W1 in (HF):\")\n", - "# print(torch.load(hf_BWD_w1_in).squeeze().T.detach().cpu().numpy())\n", - "\n", - "# compare_tensors_difference(hf_BWD_w2_in, ff_BWD_w2_in, ff_BWD_lora_A_in)\n", - "# compare_tensors(hf_BWD_w3_out, ff_BWD_w3_out)\n", - "#compare_hf_tensors(hf_BWD_ffn_norm_in, hf_BWD_attn_out_out)\n", - "# print(\"\\nw1 out:\")\n", - "\n", - "# print_tensors(hf_BWD_w1_out, ff_BWD_w1_out)\n", - "# print(\"\\nW1 in\\n\")\n", - "# print_tensors(hf_BWD_w1_in, ff_BWD_w1_in)\n", - "# compare_tensors(hf_BWD_w1_in, ff_BWD_w1_in)\n", - "# print(\"\\nffn_norm\")\n", - "# compare_tensors(hf_BWD_ffn_norm_out, ff_BWD_ffn_norm_out)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n" - ] - } - ], - "source": [ - "for layer_num in range(12):\n", - " hf_lora_A_weight_fp = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_A.default.weight\"\n", - " ff_lora_A_weight_fp = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_A\"\n", - " compare_tensors(hf_lora_A_weight_fp, ff_lora_A_weight_fp, tolerance=1e-5)\n", - " hf_lora_B_weight_fp = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_B.default.weight\"\n", - " ff_lora_B_weight_fp = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_B\"\n", - " compare_tensors(hf_lora_B_weight_fp, ff_lora_B_weight_fp, tolerance=1e-5)\n", - " hf_w1_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.gate_proj.weight\"\n", - " ff_w1_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_weight_0\"\n", - " compare_tensors(hf_w1_weight, ff_w1_weight, tolerance=1e-5)\n", - " hf_w3_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.up_proj.weight\"\n", - " ff_w3_weight 
= f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w3_shard-id_0_weight_0\"\n", - " compare_tensors(hf_w3_weight, ff_w3_weight, tolerance=1e-5)\n", - " hf_w2_weight = f\"{hf_weight_base_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.weight\"\n", - " ff_w2_weight = f\"{ff_weight_base_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_weight_0\"\n", - " compare_tensors(hf_w2_weight, ff_w2_weight, tolerance=1e-5)\n", - " " - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.4" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py index 7836633b30..1e0e0bd167 100644 --- a/tests/peft/hf_finetune.py +++ b/tests/peft/hf_finetune.py @@ -60,6 +60,7 @@ def lm_head_pre_backward_hook(module, grad_output): def peft_backward_hook(module, grad_input, grad_output): + assert(type(grad_input) == tuple and type(grad_output) == tuple) if len(grad_input) == 0 or len(grad_output) == 0: return assert module.name is not None and module.bwd_step is not None @@ -95,23 +96,53 @@ def peft_forward_hook(module, input, output): name = module.name.replace("base_model.model.model.", "") print(f"Forward Hook activated for module: {name}, fwd step: {module.fwd_step}") print("Input:") - for i, inp in enumerate(input): - if type(inp) == torch.Tensor: - print(inp.shape) - torch.save( - inp, f"./hf_peft_tensors/fwd_step_{module.fwd_step}_{name}.input_{i}" - ) - else: - print(inp) + if type(input) == torch.Tensor: + print(input.shape) + torch.save( + input, f"./hf_peft_tensors/fwd_step_{module.fwd_step}_{name}.input_0" + ) + elif type(input) == tuple: + for i, inp in enumerate(input): + if type(inp) == torch.Tensor: + print(inp.shape) + torch.save( + inp, f"./hf_peft_tensors/fwd_step_{module.fwd_step}_{name}.input_{i}" + ) + else: + print(inp) + else: + assert False print("Output:") - for i, out in enumerate(output): - if type(out) == torch.Tensor: - print(out.shape) - torch.save( - out, f"./hf_peft_tensors/fwd_step_{module.fwd_step}_{name}.output_{i}" - ) - else: - print(out) + if type(output) == torch.Tensor: + print(output.shape) + torch.save( + output, f"./hf_peft_tensors/fwd_step_{module.fwd_step}_{name}.output_0" + ) + # if "layer_norm" in name: + # torch.save( + # output.grad_fn._saved_result1, f"./hf_peft_tensors/fwd_step_{module.fwd_step}_{name}.saved_result_1" + # ) + # torch.save( + # output.grad_fn._saved_result2, f"./hf_peft_tensors/fwd_step_{module.fwd_step}_{name}.saved_result_2" + # ) + elif type(output) == tuple: + for i, out in enumerate(output): + if type(out) == torch.Tensor: + print(out.shape) + torch.save( + out, f"./hf_peft_tensors/fwd_step_{module.fwd_step}_{name}.output_{i}" + ) + # if "layer_norm" in name: + # torch.save( + # out.grad_fn._saved_result1, f"./hf_peft_tensors/fwd_step_{module.fwd_step}_{name}.saved_result_1" + # ) + # torch.save( + # out.grad_fn._saved_result2, f"./hf_peft_tensors/fwd_step_{module.fwd_step}_{name}.saved_result_2" + # ) + else: + print(out) + else: + assert False # print("Forward Input/Output: ", input[0].shape, output[0].shape) print("===") module.fwd_step += 1 
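
Note on how the tensors saved above are consumed: the files these hooks write under ./hf_peft_tensors/ are the torch.load() inputs of the alignment notebook earlier in this series, which compares them against FlexFlow's comma-separated, column-major dumps. The notebook's compare_tensors helper is defined outside this excerpt; the sketch below is only an illustration of that comparison convention, and the helper name, the ff_shape argument, and the 5% threshold are assumptions inferred from the notebook calls, not the actual implementation.

import numpy as np
import torch

def compare_tensors_sketch(hf_path, ff_path, ff_shape, tolerance=1e-5):
    # Hypothetical helper mirroring the notebook's pattern, not the real compare_tensors.
    # HF side: tensors were saved with torch.save() by the hooks in this file;
    # the notebook typically squeezes and transposes them before comparing.
    hf = torch.load(hf_path).squeeze().T.detach().cpu().numpy()
    # FF side: FlexFlow writes flat comma-separated values in column-major
    # (Fortran) order, hence reshape(..., order='F') as in the notebook.
    # ff_shape must match the transposed HF layout chosen above.
    ff = np.loadtxt(ff_path, delimiter=",").reshape(ff_shape, order="F")
    # Count elementwise mismatches outside the absolute tolerance.
    mismatching = np.where(~np.isclose(hf, ff, atol=tolerance))[0]
    pct_mismatch = len(mismatching) / hf.size
    print(f"{pct_mismatch * 100}% mismatch")
    # The notebook tolerates up to 5% elementwise mismatch for some gradients.
    assert pct_mismatch <= 0.05
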
@@ -221,10 +252,13 @@ def main(): layer.register_full_backward_pre_hook(lm_head_pre_backward_hook) # Save any weights of interest for name, params in model.named_parameters(): + simplified_name = name.replace("base_model.model.model.", "") if "lora" in name: - torch.save(params, f"./hf_peft_tensors/{name}") + torch.save(params, f"./hf_peft_tensors/{simplified_name}") if "lm_head" in name or "norm" in name: - torch.save(params, f"./hf_peft_tensors/{name}") + torch.save(params, f"./hf_peft_tensors/{simplified_name}") + if "down_proj" in name or "self_attn" in name: + torch.save(params, f"./hf_peft_tensors/{simplified_name}") # Load fine-tuning dataset data = load_dataset("Abirate/english_quotes") diff --git a/tests/peft/hf_serve.py b/tests/peft/hf_serve.py index ad1f903cfb..1fde4d5a50 100644 --- a/tests/peft/hf_serve.py +++ b/tests/peft/hf_serve.py @@ -92,14 +92,6 @@ def main(): model = PeftModel.from_pretrained(model, peft_model_id) print(model) - for name, params in model.named_parameters(): - print(name) - if ( - name - == "base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight" - ): - print(params) - assert False # Register hooks to save tensors, if needed if save_peft_tensors: diff --git a/tests/peft/qk_prods_alignment.ipynb b/tests/peft/qk_prods_alignment.ipynb deleted file mode 100644 index c2a3644b3d..0000000000 --- a/tests/peft/qk_prods_alignment.ipynb +++ /dev/null @@ -1,24 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.10.4" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From 32f0a15412eabdfb45bfce48cbd489a3e5ddbac5 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 26 Jan 2024 22:45:50 -0500 Subject: [PATCH 127/198] fix legion aliasing error --- .../ops/add_bias_residual_layer_norm.h | 1 + src/ops/add_bias_residual_layer_norm.cc | 122 ++++++++---------- src/ops/linear.cc | 2 - src/ops/residual_layer_norm.cc | 51 +++----- src/ops/residual_rms_norm.cc | 58 ++++----- 5 files changed, 98 insertions(+), 136 deletions(-) diff --git a/include/flexflow/ops/add_bias_residual_layer_norm.h b/include/flexflow/ops/add_bias_residual_layer_norm.h index 38bb825a4d..550d56c47c 100644 --- a/include/flexflow/ops/add_bias_residual_layer_norm.h +++ b/include/flexflow/ops/add_bias_residual_layer_norm.h @@ -26,6 +26,7 @@ class AddBiasResidualLayerNorm : public Op { float _eps, bool allocate_weights, char const *name); + void map_output_tensors(FFModel &ff) override; void init(FFModel const &) override; void init_inference(FFModel const &, std::vector const &, diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index a2b426ec0d..6b71279971 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -348,10 +348,13 @@ void AddBiasResidualLayerNorm::init_inference( false /*must*/, 0 /*mapper_id*/, machine_view_hash); + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); // attn output + // added: attn_output + attn final bias + residual launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, - READ_ONLY, + READ_WRITE, EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(0, FID_DATA); @@ -362,34 +365,27 @@ void 
AddBiasResidualLayerNorm::init_inference( EXCLUSIVE, batch_inputs[1]->region)); launcher.add_field(1, FID_DATA); - // added: attn_output + attn final bias + residual - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(2, FID_DATA); // layer norm output launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[1]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(2, FID_DATA); // attn final bias launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(3, FID_DATA); if (elementwise_affine) { launcher.add_region_requirement(RegionRequirement(weights[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[1]->region)); - launcher.add_field(5, FID_DATA); + launcher.add_field(4, FID_DATA); if (use_bias) { launcher.add_region_requirement(RegionRequirement(weights[2]->part, @@ -397,7 +393,7 @@ void AddBiasResidualLayerNorm::init_inference( READ_ONLY, EXCLUSIVE, weights[2]->region)); - launcher.add_field(6, FID_DATA); + launcher.add_field(5, FID_DATA); } } FutureMap fm = runtime->execute_index_space(ctx, launcher); @@ -420,10 +416,13 @@ void AddBiasResidualLayerNorm::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); - // attn output + assert(outputs[0]->part == inputs[0]->part); + assert(outputs[0]->region == inputs[0]->region); + // input: attn output + // added: attn_output + attn final bias + residual launcher.add_region_requirement(RegionRequirement(inputs[0]->part, 0 /*projection id*/, - READ_ONLY, + READ_WRITE, EXCLUSIVE, inputs[0]->region)); launcher.add_field(0, FID_DATA); @@ -434,34 +433,27 @@ void AddBiasResidualLayerNorm::init(FFModel const &ff) { EXCLUSIVE, inputs[1]->region)); launcher.add_field(1, FID_DATA); - // added: attn_output + attn final bias + residual - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(2, FID_DATA); // layer norm output launcher.add_region_requirement(RegionRequirement(outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, outputs[1]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(2, FID_DATA); // attn final bias launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(3, FID_DATA); if (elementwise_affine) { launcher.add_region_requirement(RegionRequirement(weights[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[1]->region)); - launcher.add_field(5, FID_DATA); + launcher.add_field(4, FID_DATA); if (use_bias) { launcher.add_region_requirement(RegionRequirement(weights[2]->part, @@ -469,7 +461,7 @@ void AddBiasResidualLayerNorm::init(FFModel const &ff) { READ_ONLY, EXCLUSIVE, weights[2]->region)); - launcher.add_field(6, FID_DATA); + launcher.add_field(5, FID_DATA); } } FutureMap fm = runtime->execute_index_space(ctx, launcher); @@ -478,13 +470,11 @@ void AddBiasResidualLayerNorm::init(FFModel const &ff) { } /* - regions[0](I): attn output - regions[1](I): residual - regions[2](O): added output (attn output + final attn bias + residual) - regions[3](O): layer norm output - regions[4](I): final 
attn bias - regions[5](I): gamma - regions[6](I): beta + regions[0](I/O): attn output AND added output (attn output + final attn bias + + residual) regions[1](I): residual regions[2](O): layer norm output + regions[3](I): final attn bias + regions[4](I): gamma + regions[5](I): beta */ OpMeta *AddBiasResidualLayerNorm::init_task( Task const *task, @@ -545,10 +535,13 @@ FutureMap AddBiasResidualLayerNorm::inference( 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); // input + // added_output: input + attn bias + residual launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, - READ_ONLY, + READ_WRITE, EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(0, FID_DATA); @@ -566,20 +559,13 @@ FutureMap AddBiasResidualLayerNorm::inference( EXCLUSIVE, batch_inputs[1]->region)); launcher.add_field(2, FID_DATA); - // added_output: input + attn bias + residual - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(3, FID_DATA); // output launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[1]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(3, FID_DATA); if (elementwise_affine) { // gamma launcher.add_region_requirement(RegionRequirement(weights[1]->part, @@ -587,7 +573,7 @@ FutureMap AddBiasResidualLayerNorm::inference( READ_ONLY, EXCLUSIVE, weights[1]->region)); - launcher.add_field(5, FID_DATA); + launcher.add_field(4, FID_DATA); if (use_bias) { // beta launcher.add_region_requirement(RegionRequirement(weights[2]->part, @@ -595,20 +581,31 @@ FutureMap AddBiasResidualLayerNorm::inference( READ_ONLY, EXCLUSIVE, weights[2]->region)); - launcher.add_field(6, FID_DATA); + launcher.add_field(5, FID_DATA); } } return runtime->execute_index_space(ctx, launcher); } +void AddBiasResidualLayerNorm::map_output_tensors(FFModel &ff) { + assert(numOutputs == 2); + assert(outputs[0]->get_volume() == inputs[0]->get_volume()); + outputs[0]->parallel_is = inputs[0]->parallel_is; + outputs[0]->region = inputs[0]->region; + outputs[0]->part = inputs[0]->part; + outputs[0]->region_grad = inputs[0]->region_grad; + outputs[0]->part_grad = inputs[0]->part_grad; + // map output 1 to new region + ff.map_tensor(outputs[1], this); +} + /* - regions[0](I): input + regions[0](I): input / added output regions[1](I): attn bias regions[2](I): residual - regions[3](O): added output - regions[4](O): output - regions[5](I): gamma - regions[6](I): beta + regions[3](O): output + regions[4](I): gamma + regions[5](I): beta */ void AddBiasResidualLayerNorm::inference_task( Task const *task, @@ -626,7 +623,7 @@ void AddBiasResidualLayerNorm::inference_task( *((AddBiasResidualLayerNormMeta **)task->local_args); assert(regions.size() == - 5 + (m->elementwise_affine ? (m->use_bias ? 2 : 1) : 0)); + 4 + (m->elementwise_affine ? (m->use_bias ? 
2 : 1) : 0)); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); @@ -635,9 +632,9 @@ void AddBiasResidualLayerNorm::inference_task( GenericTensorAccessorR residual = helperGetGenericTensorAccessorRO( m->input_type[1], regions[2], task->regions[2], FID_DATA, ctx, runtime); GenericTensorAccessorW added_output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[1], regions[4], task->regions[4], FID_DATA, ctx, runtime); + m->output_type[1], regions[3], task->regions[3], FID_DATA, ctx, runtime); GenericTensorAccessorR gamma, beta; @@ -648,9 +645,9 @@ void AddBiasResidualLayerNorm::inference_task( Domain residual_domain = runtime->get_index_space_domain( ctx, task->regions[2].region.get_index_space()); Domain added_out_domain = runtime->get_index_space_domain( - ctx, task->regions[3].region.get_index_space()); + ctx, task->regions[0].region.get_index_space()); Domain out_domain = runtime->get_index_space_domain( - ctx, task->regions[4].region.get_index_space()); + ctx, task->regions[3].region.get_index_space()); Domain gamma_domain, beta_domain; @@ -675,23 +672,23 @@ void AddBiasResidualLayerNorm::inference_task( if (m->elementwise_affine) { gamma = helperGetGenericTensorAccessorRO(m->weight_type[1], - regions[5], - task->regions[5], + regions[4], + task->regions[4], FID_DATA, ctx, runtime); gamma_domain = runtime->get_index_space_domain( - ctx, task->regions[5].region.get_index_space()); + ctx, task->regions[4].region.get_index_space()); if (m->use_bias) { beta = helperGetGenericTensorAccessorRO(m->weight_type[2], - regions[6], - task->regions[6], + regions[5], + task->regions[5], FID_DATA, ctx, runtime); beta_domain = runtime->get_index_space_domain( - ctx, task->regions[6].region.get_index_space()); + ctx, task->regions[5].region.get_index_space()); assert(gamma_domain == beta_domain); } @@ -723,12 +720,7 @@ void AddBiasResidualLayerNorm::inference_task( } } AddBiasResidualLayerNorm::save_inference_tensors_to_file( - m, - shard_id, - bc, - {input, residual}, - weights_accessors, - {added_output, output}); + m, shard_id, bc, {residual}, weights_accessors, {added_output, output}); } } diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 209f514f65..4563673385 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -621,8 +621,6 @@ void Linear::inference_task(Task const *task, ctx, task->regions[0].region.get_index_space()); LinearMeta *m = *((LinearMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - std::string op_name_without_uid = Linear::get_op_name_without_uid(m); - printf("INF %s\n", op_name_without_uid.c_str()); if (bc->num_tokens == 0) { return; } diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index 8563c299ab..dc302ce19c 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -358,11 +358,14 @@ void ResidualLayerNorm::init_inference( false /*must*/, 0 /*mapper_id*/, machine_view_hash); + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); int field_id = 0; // input + // added: input + residual(s) launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, - READ_ONLY, + 
READ_WRITE, EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(field_id++, FID_DATA); @@ -382,13 +385,6 @@ void ResidualLayerNorm::init_inference( batch_inputs[2]->region)); launcher.add_field(field_id++, FID_DATA); } - // added: input + residual(s) - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(field_id++, FID_DATA); // layer norm output launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, @@ -433,11 +429,14 @@ void ResidualLayerNorm::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + assert(outputs[0]->part == inputs[0]->part); + assert(outputs[0]->region == inputs[0]->region); int field_id = 0; // input + // added: input + residual(s) launcher.add_region_requirement(RegionRequirement(inputs[0]->part, 0 /*projection id*/, - READ_ONLY, + READ_WRITE, EXCLUSIVE, inputs[0]->region)); launcher.add_field(field_id++, FID_DATA); @@ -457,13 +456,6 @@ void ResidualLayerNorm::init(FFModel const &ff) { inputs[2]->region)); launcher.add_field(field_id++, FID_DATA); } - // added: input + residual(s) - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(field_id++, FID_DATA); // layer norm output launcher.add_region_requirement(RegionRequirement(outputs[1]->part, 0 /*projection id*/, @@ -884,11 +876,14 @@ FutureMap ResidualLayerNorm::inference( 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); int field_id = 0; // input + // added: input + residual(s) launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, - READ_ONLY, + READ_WRITE, EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(field_id++, FID_DATA); @@ -908,13 +903,6 @@ FutureMap ResidualLayerNorm::inference( batch_inputs[2]->region)); launcher.add_field(field_id++, FID_DATA); } - // added: input + residual(s) - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(field_id++, FID_DATA); // layer norm output launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, @@ -956,7 +944,7 @@ void ResidualLayerNorm::inference_task( } assert(regions.size() == - 4 + m->use_two_residuals + + 3 + m->use_two_residuals + (m->elementwise_affine ? (m->use_bias ? 
2 : 1) : 0)); int region_idx = 0, task_region_idx = 0; @@ -984,13 +972,8 @@ void ResidualLayerNorm::inference_task( ctx, runtime); } - GenericTensorAccessorW added_output = - helperGetGenericTensorAccessorWO(m->output_type[0], - regions[region_idx++], - task->regions[task_region_idx++], - FID_DATA, - ctx, - runtime); + GenericTensorAccessorW added_output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(m->output_type[1], regions[region_idx++], @@ -1029,7 +1012,7 @@ void ResidualLayerNorm::inference_task( assert(residual2_domain == in_domain); } Domain added_out_domain = runtime->get_index_space_domain( - ctx, task->regions[task_region_idx++].region.get_index_space()); + ctx, task->regions[0].region.get_index_space()); Domain out_domain = runtime->get_index_space_domain( ctx, task->regions[task_region_idx++].region.get_index_space()); Domain gamma_domain, beta_domain; @@ -1069,7 +1052,7 @@ void ResidualLayerNorm::inference_task( assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; std::vector input_accessors; - input_accessors.push_back(input); + // input_accessors.push_back(input); input_accessors.push_back(residual1); if (m->use_two_residuals) { input_accessors.push_back(residual2); diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index c2fbe11544..fb0944cece 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -261,6 +261,8 @@ void ResidualRMSNorm::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + assert(outputs[0]->part == inputs[0]->part); + assert(outputs[0]->region == inputs[0]->region); launcher.add_region_requirement(RegionRequirement(inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -273,24 +275,18 @@ void ResidualRMSNorm::init(FFModel const &ff) { EXCLUSIVE, inputs[1]->region)); launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(2, FID_DATA); launcher.add_region_requirement(RegionRequirement(outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, outputs[1]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(2, FID_DATA); launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(3, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap(ff, fm); @@ -318,9 +314,11 @@ void ResidualRMSNorm::init_inference( false /*must*/, 0 /*mapper_id*/, machine_view_hash); + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, - READ_ONLY, + READ_WRITE, EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(0, FID_DATA); @@ -330,24 +328,18 @@ void ResidualRMSNorm::init_inference( EXCLUSIVE, batch_inputs[1]->region)); launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(2, FID_DATA); launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection 
id*/,
                                                     WRITE_ONLY,
                                                     EXCLUSIVE,
                                                     batch_outputs[1]->region));
-  launcher.add_field(3, FID_DATA);
+  launcher.add_field(2, FID_DATA);
   launcher.add_region_requirement(RegionRequirement(weights[0]->part,
                                                     0 /*projection id*/,
                                                     READ_ONLY,
                                                     EXCLUSIVE,
                                                     weights[0]->region));
-  launcher.add_field(4, FID_DATA);
+  launcher.add_field(3, FID_DATA);
   FutureMap fm = runtime->execute_index_space(ctx, launcher);
   fm.wait_all_results();
   set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]);
@@ -398,6 +390,8 @@ FutureMap
                          0 /*mapper_id*/,
                          machine_view_hash);
   launcher.add_future(bc);
+  assert(batch_outputs[0]->part == batch_inputs[0]->part);
+  assert(batch_outputs[0]->region == batch_inputs[0]->region);
   launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part,
                                                     0 /*projection id*/,
                                                     READ_ONLY,
@@ -410,40 +404,33 @@ FutureMap
                                                     EXCLUSIVE,
                                                     batch_inputs[1]->region));
   launcher.add_field(1, FID_DATA);
-  launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part,
-                                                    0 /*projection id*/,
-                                                    WRITE_ONLY,
-                                                    EXCLUSIVE,
-                                                    batch_outputs[0]->region));
-  launcher.add_field(2, FID_DATA);
   launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part,
                                                     0 /*projection id*/,
                                                     WRITE_ONLY,
                                                     EXCLUSIVE,
                                                     batch_outputs[1]->region));
-  launcher.add_field(3, FID_DATA);
+  launcher.add_field(2, FID_DATA);
   launcher.add_region_requirement(RegionRequirement(weights[0]->part,
                                                     0 /*projection id*/,
                                                     READ_WRITE,
                                                     EXCLUSIVE,
                                                     weights[0]->region));
-  launcher.add_field(4, FID_DATA);
+  launcher.add_field(3, FID_DATA);
   return runtime->execute_index_space(ctx, launcher);
 }
 
 /*
-  regions[0](I): input1
+  regions[0](I/O): input1 / residual output
   regions[1](I): input2
-  regions[2](O): residual output
-  regions[3](O): output
-  regions[4](I/O): weight
+  regions[2](O): output
+  regions[3](I/O): weight
 */
 void ResidualRMSNorm::inference_task(Task const *task,
                                      std::vector const &regions,
                                      Context ctx,
                                      Runtime *runtime) {
-  assert(task->regions.size() == 5);
-  assert(regions.size() == 5);
+  assert(task->regions.size() == 4);
+  assert(regions.size() == 4);
   BatchConfig const *bc = BatchConfig::from_future(task->futures[0]);
   if (bc->num_tokens == 0) {
     return;
@@ -453,19 +440,20 @@ void ResidualRMSNorm::inference_task(Task const *task,
       m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime);
   GenericTensorAccessorR input2 = helperGetGenericTensorAccessorRO(
       m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime);
+  // residual_output is mapped to the same region as the input
   GenericTensorAccessorW residual_output = helperGetGenericTensorAccessorWO(
-      m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime);
+      m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime);
   GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(
-      m->output_type[1], regions[3], task->regions[3], FID_DATA, ctx, runtime);
+      m->output_type[1], regions[2], task->regions[2], FID_DATA, ctx, runtime);
   GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO(
-      m->weight_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime);
+      m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime);
   inference_kernel_wrapper(
       m, bc, input1, input2, weight, residual_output, output);
   if (m->inference_debugging) {
     assert(task->index_point.get_dim() == 1);
     int shard_id = task->index_point.point_data[0];
     ResidualRMSNorm::save_inference_tensors_to_file(
-        m, shard_id, bc, {input1, input2}, {weight}, {residual_output, output});
+        m, shard_id, bc, {input2}, {weight}, {residual_output, output});
} From c97f63a368b22363b26667a6a963fee0170aea60 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 26 Jan 2024 23:45:04 -0500 Subject: [PATCH 128/198] fix warnings --- src/ops/lora_linear.cc | 14 ++++++++++++-- src/ops/residual_rms_norm.cc | 4 ++-- src/ops/rms_norm.cc | 2 +- src/ops/sigmoid_silu_multi.cc | 4 ++-- src/runtime/model.cc | 6 +++--- 5 files changed, 20 insertions(+), 10 deletions(-) diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index c02bddc5a6..409c814329 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -272,8 +272,6 @@ void load_peft_from_file(DT *ptr, size_t size, int shard_id, std::string filepath) { - std::cout << "Loading LORA weight " << filepath << ", size: " << size - << ", shard: " << shard_id << std::endl; std::ifstream in(filepath, std::ios::in | std::ios::binary); if (!in.good()) { printf("Could not open file: %s\n", filepath.c_str()); @@ -360,13 +358,25 @@ void LoraLinear::register_model_task(Task const *task, std::string w1_filepath = join_path({weights_folder_filepath, lora_layername_substr + "_B_weight"}); if (dt == DT_FLOAT) { + std::cout << "Loading LORA weight " << lora_layername_substr + "_A_weight" + << ", size: " << w0_num_elements << ", shard: " << shard_id + << std::endl; load_peft_from_file( (float *)weight.w0_ptr, w0_num_elements, shard_id, w0_filepath); + std::cout << "Loading LORA weight " << lora_layername_substr + "_B_weight" + << ", size: " << w1_num_elements << ", shard: " << shard_id + << std::endl; load_peft_from_file( (float *)weight.w1_ptr, w1_num_elements, shard_id, w1_filepath); } else if (dt == DT_HALF) { + std::cout << "Loading LORA weight " << lora_layername_substr + "_A_weight" + << ", size: " << w0_num_elements << ", shard: " << shard_id + << std::endl; load_peft_from_file( (half *)weight.w0_ptr, w0_num_elements, shard_id, w0_filepath); + std::cout << "Loading LORA weight " << lora_layername_substr + "_B_weight" + << ", size: " << w1_num_elements << ", shard: " << shard_id + << std::endl; load_peft_from_file( (half *)weight.w1_ptr, w1_num_elements, shard_id, w1_filepath); } else { diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index fb0944cece..e549e5f6da 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -639,7 +639,7 @@ Legion::FutureMap launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(1, FID_DATA); @@ -647,7 +647,7 @@ Legion::FutureMap launcher.add_region_requirement( RegionRequirement(batch_inputs[1]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[1] ? WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[1]->region_grad)); launcher.add_field(2, FID_DATA); diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index a1749d66af..b9c9206a00 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -548,7 +548,7 @@ Legion::FutureMap launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, + reset_input_grads[0] ? 
WRITE_ONLY : READ_WRITE,
                        EXCLUSIVE,
                        batch_inputs[0]->region_grad));
   launcher.add_field(1, FID_DATA);
diff --git a/src/ops/sigmoid_silu_multi.cc b/src/ops/sigmoid_silu_multi.cc
index c01f47aa21..c9f86c42cb 100644
--- a/src/ops/sigmoid_silu_multi.cc
+++ b/src/ops/sigmoid_silu_multi.cc
@@ -384,7 +384,7 @@ FutureMap
   launcher.add_region_requirement(
       RegionRequirement(batch_inputs[0]->part_grad,
                         0 /*projection id*/,
-                        READ_WRITE,
+                        reset_input_grads[0] ? WRITE_ONLY : READ_WRITE,
                         EXCLUSIVE,
                         batch_inputs[0]->region_grad));
   launcher.add_field(1, FID_DATA);
@@ -392,7 +392,7 @@ FutureMap
   launcher.add_region_requirement(
       RegionRequirement(batch_inputs[1]->part_grad,
                         0 /*projection id*/,
-                        READ_WRITE,
+                        reset_input_grads[1] ? WRITE_ONLY : READ_WRITE,
                         EXCLUSIVE,
                         batch_inputs[1]->region_grad));
   launcher.add_field(2, FID_DATA);
diff --git a/src/runtime/model.cc b/src/runtime/model.cc
index 812a432ef1..9512a0c21a 100644
--- a/src/runtime/model.cc
+++ b/src/runtime/model.cc
@@ -5546,7 +5546,7 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     registrar.set_leaf();
     if (pre_register) {
       Runtime::preregister_task_variant(
-          registrar, "RMS Norm Inference Task");
+          registrar, "Residual RMS Norm Inference Task");
     } else {
       if (enable_control_replication) {
         registrar.global_registration = false;
@@ -5562,7 +5562,7 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     registrar.set_leaf();
     if (pre_register) {
       Runtime::preregister_task_variant(
-          registrar, "RMS Norm Backward Task");
+          registrar, "Residual RMS Norm Backward Task");
     } else {
       if (enable_control_replication) {
         registrar.global_registration = false;
@@ -5577,7 +5577,7 @@ void register_flexflow_internal_tasks(Runtime *runtime,
     registrar.set_leaf();
     if (pre_register) {
       Runtime::preregister_task_variant(
-          registrar, "RMS Norm PEFT Backward Task");
+          registrar, "Residual RMS Norm PEFT Backward Task");
     } else {
       if (enable_control_replication) {
         registrar.global_registration = false;

From 3d5a37c70cfb76485b35a6669b4ee90f97476bb9 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro
Date: Sat, 27 Jan 2024 17:31:45 -0500
Subject: [PATCH 129/198] fix

---
 include/flexflow/ops/kernels/lora_linear_kernels.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h
index cf03e518fa..739b94ed22 100644
--- a/include/flexflow/ops/kernels/lora_linear_kernels.h
+++ b/include/flexflow/ops/kernels/lora_linear_kernels.h
@@ -19,7 +19,6 @@ class LoraLinearMeta : public OpMeta {
 public:
   LoraLinearMeta(FFHandler handle, LoraLinear const *li);
   ~LoraLinearMeta(void);
-  char op_name[MAX_OPNAME];
   // PEFT related fields
   void *low_rank_activation;
   void *input_activation;

From 571f0d375a6fde72267a72ded40878706ab8ab17 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro
Date: Mon, 29 Jan 2024 05:39:00 +0000
Subject: [PATCH 130/198] fix pipeline parallelism

---
 src/runtime/inference_manager.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc
index 4f7d0c9632..e82347c981 100644
--- a/src/runtime/inference_manager.cc
+++ b/src/runtime/inference_manager.cc
@@ -276,9 +276,9 @@ void InferenceManager::init_operators_inference(FFModel *model) {
       assert(op->outputs[i]->parallel_is != IndexSpace::NO_SPACE);
       assert(tensor_buffer[op->outputs[i]].size() > batch_index);
       outputs[i] = tensor_buffer[op->outputs[i]][batch_index];
-      if (i > 0) {
-        assert(outputs[0]->machine_view == outputs[i]->machine_view);
-      }
+      // if (i
> 0) { + // assert(outputs[0]->machine_view == outputs[i]->machine_view); + // } assert(outputs[i]->parallel_is != IndexSpace::NO_SPACE); } if (op->is_parallel_op()) { From f4a10f3316d0d9f41f2b1dcad97a1618840cfc51 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 29 Jan 2024 06:57:15 +0000 Subject: [PATCH 131/198] fix tp issue in combine op --- src/parallel_ops/combine.cc | 3 --- src/runtime/model.cc | 6 +++--- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/parallel_ops/combine.cc b/src/parallel_ops/combine.cc index 3433e2f21b..8411b42602 100644 --- a/src/parallel_ops/combine.cc +++ b/src/parallel_ops/combine.cc @@ -84,9 +84,6 @@ Combine::Combine(FFModel &model, dims[i] = _input->dims[i]; } assert(combine_degree > 0 && "Must use combine_degree > 0"); - std::cout << "combine_dim : " << combine_dim - << ", dims[combine_dim].degree: " << dims[combine_dim].degree - << ", combine_degree: " << combine_degree << std::endl; assert(dims[combine_dim].degree % combine_degree == 0); dims[combine_dim].degree /= combine_degree; ParallelTensorBase::update_parallel_ids(numdim, dims); diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 9512a0c21a..81cf3d966d 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3283,11 +3283,11 @@ void FFModel::create_operators_from_layers() { inputs.push_back(tensors_to_parallel_tensors[l->inputs[i]]); } Op *op = nullptr; - // add a combine before arg_topk + // add a combine before arg_topk / argmax if (config.computationMode == COMP_MODE_INFERENCE && config.tensor_parallelism_degree > 1 && - (l->op_type == OP_ARG_TOPK || l->op_type == OP_SOFTMAX || - l->op_type == OP_ARGMAX)) { + (layer_idx == layers.size() - 1 && + (l->op_type == OP_ARG_TOPK || l->op_type == OP_ARGMAX))) { std::vector partitioned_inputs; assert(inputs.size() == 1); Combine *comb = new Combine(*this, From ca683f7fca21997e9b3c61a9f331ed6ca1c4ec81 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 29 Jan 2024 07:33:22 +0000 Subject: [PATCH 132/198] fix lora weight loading with tensor parallelism --- src/ops/lora_linear.cc | 11 ++++++----- src/runtime/inference_manager.cc | 22 ++++++++++++++++------ 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 409c814329..81dc2292f6 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -270,6 +270,7 @@ void LoraLinear::register_peft_model( template void load_peft_from_file(DT *ptr, size_t size, + bool sharded, int shard_id, std::string filepath) { std::ifstream in(filepath, std::ios::in | std::ios::binary); @@ -279,7 +280,7 @@ void load_peft_from_file(DT *ptr, assert(in.good() && "incorrect weight file path"); std::vector
host_array(size); size_t target_data_size = sizeof(DT) * size; - in.seekg(shard_id * target_data_size, in.beg); + in.seekg(sharded * shard_id * target_data_size, in.beg); in.read((char *)host_array.data(), target_data_size); size_t in_get_size = in.gcount(); @@ -362,23 +363,23 @@ void LoraLinear::register_model_task(Task const *task, << ", size: " << w0_num_elements << ", shard: " << shard_id << std::endl; load_peft_from_file( - (float *)weight.w0_ptr, w0_num_elements, shard_id, w0_filepath); + (float *)weight.w0_ptr, w0_num_elements, true, shard_id, w0_filepath); std::cout << "Loading LORA weight " << lora_layername_substr + "_B_weight" << ", size: " << w1_num_elements << ", shard: " << shard_id << std::endl; load_peft_from_file( - (float *)weight.w1_ptr, w1_num_elements, shard_id, w1_filepath); + (float *)weight.w1_ptr, w1_num_elements, false, shard_id, w1_filepath); } else if (dt == DT_HALF) { std::cout << "Loading LORA weight " << lora_layername_substr + "_A_weight" << ", size: " << w0_num_elements << ", shard: " << shard_id << std::endl; load_peft_from_file( - (half *)weight.w0_ptr, w0_num_elements, shard_id, w0_filepath); + (half *)weight.w0_ptr, w0_num_elements, true, shard_id, w0_filepath); std::cout << "Loading LORA weight " << lora_layername_substr + "_B_weight" << ", size: " << w1_num_elements << ", shard: " << shard_id << std::endl; load_peft_from_file( - (half *)weight.w1_ptr, w1_num_elements, shard_id, w1_filepath); + (half *)weight.w1_ptr, w1_num_elements, false, shard_id, w1_filepath); } else { assert(false && "Data type not supported"); } diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index e82347c981..9fe9066d6c 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -390,12 +390,22 @@ void InferenceManager::peft_bwd(FFModel *model, while (model->operators[last_op]->op_type == OP_WEIGHT && last_op > 0) { last_op -= 1; } - // Assert that the previous operator must be softmax - assert(model->operators[last_op]->op_type == OP_SOFTMAX || - model->operators[last_op]->op_type == OP_FUSED); - if (model->operators[last_op]->op_type == OP_FUSED) { - FusedOp *fused_op = static_cast(model->operators[last_op]); - assert(fused_op->op_op_type[fused_op->numOperators - 1] == OP_SOFTMAX); + if (model->config.tensor_parallelism_degree > 1) { + if (model->operators[last_op]->op_type == OP_FUSED) { + FusedOp *fused_op = static_cast(model->operators[last_op]); + assert(fused_op->op_op_type[fused_op->numOperators - 1] == OP_COMBINE); + assert(fused_op->op_op_type[fused_op->numOperators - 2] == OP_SOFTMAX); + } else { + assert(model->operators[last_op]->op_type == OP_COMBINE) + assert(model->operators[last_op-1]->op_type == OP_SOFTMAX) + } + } else { + // Assert that the previous operator must be softmax + assert(model->operators[last_op]->op_type == OP_SOFTMAX || model->operators[last_op]->op_type == OP_FUSED); + if (model->operators[last_op]->op_type == OP_FUSED) { + FusedOp *fused_op = static_cast(model->operators[last_op]); + assert(fused_op->op_op_type[fused_op->numOperators - 1] == OP_SOFTMAX); + } } for (int o = last_op; o >= 0; o--) { Op *op = model->operators[o]; From 378bdb5ba157f18d528c65aa0c7a7dba2ec26c08 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 29 Jan 2024 07:45:10 +0000 Subject: [PATCH 133/198] fixes, implement Combine::peft_bwd_task --- include/flexflow/model.h | 1 + include/flexflow/parallel_ops/combine.h | 9 +++ src/ops/lora_linear.cc | 7 +-- src/parallel_ops/combine.cc | 76 
+++++++++++++++++++++++++ src/runtime/inference_manager.cc | 7 ++- src/runtime/model.cc | 15 +++++ 6 files changed, 107 insertions(+), 8 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 851fac94d2..73c985f757 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -240,6 +240,7 @@ enum TaskIDs { COMBINE_INIT_TASK_ID, COMBINE_FWD_TASK_ID, COMBINE_BWD_TASK_ID, + COMBINE_PEFT_BWD_TASK_ID, REPLICATE_INIT_TASK_ID, REPLICATE_FWD_TASK_ID, REPLICATE_BWD_TASK_ID, diff --git a/include/flexflow/parallel_ops/combine.h b/include/flexflow/parallel_ops/combine.h index 2e4fdb86a9..cca34de119 100644 --- a/include/flexflow/parallel_ops/combine.h +++ b/include/flexflow/parallel_ops/combine.h @@ -40,6 +40,11 @@ class Combine : public ParallelOp { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &bc, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void backward(FFModel const &) override; bool get_int_parameter(PMParameter, int *) const override; bool append_parallel_op_info( @@ -56,6 +61,10 @@ class Combine : public ParallelOp { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); template static void forward_task_with_type(Legion::Task const *task, diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 81dc2292f6..366eca27b7 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -268,11 +268,8 @@ void LoraLinear::register_peft_model( } template -void load_peft_from_file(DT *ptr, - size_t size, - bool sharded, - int shard_id, - std::string filepath) { +void load_peft_from_file( + DT *ptr, size_t size, bool sharded, int shard_id, std::string filepath) { std::ifstream in(filepath, std::ios::in | std::ios::binary); if (!in.good()) { printf("Could not open file: %s\n", filepath.c_str()); diff --git a/src/parallel_ops/combine.cc b/src/parallel_ops/combine.cc index 8411b42602..7d56d7e46b 100644 --- a/src/parallel_ops/combine.cc +++ b/src/parallel_ops/combine.cc @@ -275,6 +275,47 @@ void Combine::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } +FutureMap Combine::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + assert(batch_inputs[0]->data_type == batch_outputs[0]->data_type); + DataType data_type = batch_inputs[0]->data_type; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(COMBINE_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + void Combine::backward(FFModel const &ff) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; @@ -397,6 +438,41 @@ void Combine::forward_task_with_type(Task const *task, forward_kernel
(input_ptr, output_ptr, output_domain.get_volume()); } +void Combine::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + CombineMeta const *m = *((CombineMeta **)task->local_args); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + DataType data_type = output_grad.data_type; + assert(input_grad.data_type == data_type); + assert(output_grad.domain == input_grad.domain); + if (data_type == DT_FLOAT) { + backward_kernel(output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + output_grad.domain.get_volume()); + } else if (data_type == DT_DOUBLE) { + backward_kernel(output_grad.get_double_ptr(), + input_grad.get_double_ptr(), + output_grad.domain.get_volume()); + } else if (data_type == DT_INT32) { + backward_kernel(output_grad.get_int32_ptr(), + input_grad.get_int32_ptr(), + output_grad.domain.get_volume()); + } else if (data_type == DT_INT64) { + backward_kernel(output_grad.get_int64_ptr(), + input_grad.get_int64_ptr(), + output_grad.domain.get_volume()); + } else { + assert(false && "Unsupported data type in Combine backward"); + } +} + void Combine::backward_task(Task const *task, std::vector const ®ions, Context ctx, diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 9fe9066d6c..ae3b7eaa14 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -396,12 +396,13 @@ void InferenceManager::peft_bwd(FFModel *model, assert(fused_op->op_op_type[fused_op->numOperators - 1] == OP_COMBINE); assert(fused_op->op_op_type[fused_op->numOperators - 2] == OP_SOFTMAX); } else { - assert(model->operators[last_op]->op_type == OP_COMBINE) - assert(model->operators[last_op-1]->op_type == OP_SOFTMAX) + assert(model->operators[last_op]->op_type == OP_COMBINE); + assert(model->operators[last_op - 1]->op_type == OP_SOFTMAX); } } else { // Assert that the previous operator must be softmax - assert(model->operators[last_op]->op_type == OP_SOFTMAX || model->operators[last_op]->op_type == OP_FUSED); + assert(model->operators[last_op]->op_type == OP_SOFTMAX || + model->operators[last_op]->op_type == OP_FUSED); if (model->operators[last_op]->op_type == OP_FUSED) { FusedOp *fused_op = static_cast(model->operators[last_op]); assert(fused_op->op_op_type[fused_op->numOperators - 1] == OP_SOFTMAX); diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 81cf3d966d..42283f570e 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -6726,6 +6726,21 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(COMBINE_PEFT_BWD_TASK_ID, + "Combine PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Combine PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } // Replicate { TaskVariantRegistrar registrar(REPLICATE_INIT_TASK_ID, "Replicate Init"); From afdae452ad1502f4f1d4ad01ca2d19380ad0fc22 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 29 Jan 2024 
07:52:33 +0000
Subject: [PATCH 134/198] fix

---
 src/parallel_ops/combine.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/parallel_ops/combine.cc b/src/parallel_ops/combine.cc
index 7d56d7e46b..7260a2745e 100644
--- a/src/parallel_ops/combine.cc
+++ b/src/parallel_ops/combine.cc
@@ -313,7 +313,7 @@ FutureMap Combine::peft_bwd(FFModel const &ff,
                         EXCLUSIVE,
                         batch_inputs[0]->region_grad));
   launcher.add_field(1, FID_DATA);
-  runtime->execute_index_space(ctx, launcher);
+  return runtime->execute_index_space(ctx, launcher);
 }
 
 void Combine::backward(FFModel const &ff) {

From 5660f55d8e60ccebfb02a71255ede13e4e8fdf83 Mon Sep 17 00:00:00 2001
From: Gabriele Oliaro
Date: Mon, 29 Jan 2024 08:50:55 +0000
Subject: [PATCH 135/198] replicate peft bwd

---
 include/flexflow/model.h                  |  1 +
 include/flexflow/parallel_ops/replicate.h |  9 ++++
 src/parallel_ops/replicate.cc             | 65 +++++++++++++++++++++++
 src/runtime/model.cc                      | 14 +++++
 4 files changed, 89 insertions(+)

diff --git a/include/flexflow/model.h b/include/flexflow/model.h
index 73c985f757..974a079ddb 100644
--- a/include/flexflow/model.h
+++ b/include/flexflow/model.h
@@ -244,6 +244,7 @@ enum TaskIDs {
   REPLICATE_INIT_TASK_ID,
   REPLICATE_FWD_TASK_ID,
   REPLICATE_BWD_TASK_ID,
+  REPLICATE_PEFT_BWD_TASK_ID,
   REDUCTION_INIT_TASK_ID,
   REDUCTION_FWD_TASK_ID,
   REDUCTION_BWD_TASK_ID,
diff --git a/include/flexflow/parallel_ops/replicate.h b/include/flexflow/parallel_ops/replicate.h
index 65d69d8564..c27616634f 100644
--- a/include/flexflow/parallel_ops/replicate.h
+++ b/include/flexflow/parallel_ops/replicate.h
@@ -54,10 +54,19 @@ class Replicate : public ParallelOp {
                            std::vector const &regions,
                            Legion::Context ctx,
                            Legion::Runtime *runtime);
+  Legion::FutureMap peft_bwd(FFModel const &,
+                             BatchConfigFuture const &bc,
+                             std::vector const &,
+                             std::vector const &,
+                             MachineView const *mv = nullptr) override;
   static void backward_task(Legion::Task const *task,
                             std::vector const &regions,
                             Legion::Context ctx,
                             Legion::Runtime *runtime);
+  static void peft_bwd_task(Legion::Task const *task,
+                            std::vector const &regions,
+                            Legion::Context ctx,
+                            Legion::Runtime *runtime);
   static void forward_kernel_wrapper(ReplicateMeta const *m,
                                      GenericTensorAccessorR const &input,
                                      GenericTensorAccessorW const &output,
diff --git a/src/parallel_ops/replicate.cc b/src/parallel_ops/replicate.cc
index 20face74e8..701db40b49 100644
--- a/src/parallel_ops/replicate.cc
+++ b/src/parallel_ops/replicate.cc
@@ -273,6 +273,45 @@ void Replicate::forward(FFModel const &ff) {
   runtime->execute_index_space(ctx, launcher);
 }
 
+FutureMap Replicate::peft_bwd(FFModel const &ff,
+                              BatchConfigFuture const &bc,
+                              std::vector const &batch_inputs,
+                              std::vector const &batch_outputs,
+                              MachineView const *mv) {
+  ArgumentMap argmap;
+  Context ctx = ff.config.lg_ctx;
+  Runtime *runtime = ff.config.lg_hlr;
+  assert(numOutputs == 1);
+  assert(numInputs == 1);
+  assert(batch_inputs[0]->data_type == batch_outputs[0]->data_type);
+  DataType data_type = batch_inputs[0]->data_type;
+  parallel_is = batch_outputs[0]->parallel_is;
+  MachineView const *view = mv ?
mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(REPLICATE_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + void Replicate::backward(FFModel const &ff) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; @@ -378,6 +417,32 @@ void Replicate::forward_task(Task const *task, } } +void Replicate::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + Domain output_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + Domain input_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + // Currently only support the outter most dimension + for (int i = 0; i < output_grad_domain.get_dim() - 1; i++) { + assert(output_grad_domain.lo()[i] == input_grad_domain.lo()[i]); + assert(output_grad_domain.hi()[i] == input_grad_domain.hi()[i]); + } + size_t num_elements = input_grad_domain.get_volume(); + size_t num_replicas = output_grad_domain.get_volume() / num_elements; + float const *output_grad_ptr = helperGetTensorPointerRO( + regions[0], task->regions[0], FID_DATA, ctx, runtime); + float *input_grad_ptr = helperGetTensorPointerRW( + regions[1], task->regions[1], FID_DATA, ctx, runtime); + + backward_kernel( + output_grad_ptr, input_grad_ptr, num_elements, num_replicas); +} + void Replicate::backward_task(Task const *task, std::vector const ®ions, Context ctx, diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 42283f570e..11311053e9 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -6784,6 +6784,20 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(REPLICATE_PEFT_BWD_TASK_ID, "Replicate PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Replicate PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } // Reduction { TaskVariantRegistrar registrar(REDUCTION_INIT_TASK_ID, "Reduction Init"); From a9bacd31ab937a364ec926c9339f970c9e918b6c Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 30 Jan 2024 05:54:43 +0000 Subject: [PATCH 136/198] fixes --- .../flexflow/ops/kernels/softmax_kernels.h | 1 + include/flexflow/parallel_ops/parallel_op.h | 2 +- src/ops/fused.cu | 1 + src/ops/kernels/softmax.cu | 25 ++++++---- src/ops/softmax.cc | 46 +++++++++++++------ src/parallel_ops/combine.cc | 30 +++++++++++- src/parallel_ops/partition.cc | 5 ++ src/parallel_ops/reduction.cc | 7 +++ src/parallel_ops/replicate.cc | 36 +++++++++------ src/runtime/model.cc | 3 +- 10 files changed, 114 
insertions(+), 42 deletions(-) diff --git a/include/flexflow/ops/kernels/softmax_kernels.h b/include/flexflow/ops/kernels/softmax_kernels.h index b3dfe4f430..0b7f1090f6 100644 --- a/include/flexflow/ops/kernels/softmax_kernels.h +++ b/include/flexflow/ops/kernels/softmax_kernels.h @@ -38,6 +38,7 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, void inference_kernel_wrapper(SoftmaxMeta const *m, BatchConfig const *bc, + bool is_last_op, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output, GenericTensorAccessorW const &output_grad); diff --git a/include/flexflow/parallel_ops/parallel_op.h b/include/flexflow/parallel_ops/parallel_op.h index 0bf573996c..39324c2a51 100644 --- a/include/flexflow/parallel_ops/parallel_op.h +++ b/include/flexflow/parallel_ops/parallel_op.h @@ -41,7 +41,7 @@ class ParallelOp : public Op { public: Legion::LogicalPartition input_lp, output_grad_lp; std::unordered_map - inference_input_lps; + inference_input_lps, inference_output_grad_lps; }; }; // namespace FlexFlow diff --git a/src/ops/fused.cu b/src/ops/fused.cu index f6bed71f6a..55892ab7e9 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -642,6 +642,7 @@ __host__ void Kernels::Softmax::inference_kernel_wrapper( m, bc, + (op == fused->numOperators - 1), my_input_accessor[0], my_output_accessor[0], output_accessor[fused->numOutputs]); diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index 271a291b09..c8bc242af0 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -120,6 +120,7 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, void inference_kernel_wrapper(SoftmaxMeta const *m, BatchConfig const *bc, + bool is_last_op, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output, GenericTensorAccessorW const &output_grad) { @@ -139,11 +140,13 @@ void inference_kernel_wrapper(SoftmaxMeta const *m, output.get_float_ptr(), num_classes, stream); - checkCUDA(cudaMemcpyAsync(output_grad.get_float_ptr(), - output.get_float_ptr(), - output.domain.get_volume() * sizeof(float), - cudaMemcpyDeviceToDevice, - stream)); + if (is_last_op) { + checkCUDA(cudaMemcpyAsync(output_grad.get_float_ptr(), + output.get_float_ptr(), + output.domain.get_volume() * sizeof(float), + cudaMemcpyDeviceToDevice, + stream)); + } } else if (m->output_type[0] == DT_HALF) { Internal::inference_kernel(m, bc, @@ -151,11 +154,13 @@ void inference_kernel_wrapper(SoftmaxMeta const *m, output.get_half_ptr(), num_classes, stream); - checkCUDA(cudaMemcpyAsync(output_grad.get_half_ptr(), - output.get_half_ptr(), - output.domain.get_volume() * sizeof(half), - cudaMemcpyDeviceToDevice, - stream)); + if (is_last_op) { + checkCUDA(cudaMemcpyAsync(output_grad.get_half_ptr(), + output.get_half_ptr(), + output.domain.get_volume() * sizeof(half), + cudaMemcpyDeviceToDevice, + stream)); + } } else { assert(false && "Unsupported data type"); } diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 1d062b552b..cfc3cf6e40 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -355,14 +355,25 @@ FutureMap Softmax::inference(FFModel const &ff, EXCLUSIVE, batch_outputs[0]->region)); launcher.add_field(1, FID_DATA); - // we add the region below in order to copy the output to the grad tensor - launcher.add_region_requirement( - RegionRequirement(batch_outputs[0]->part_grad, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region_grad)); - launcher.add_field(2, FID_DATA); + // if this is the last operator, we add the region below in order to copy the + // 
output to the grad tensor + assert(ff.config.computationMode == COMP_MODE_INFERENCE); + int last_op = ff.operators.size() - 1; + assert(ff.operators[last_op]->op_type == OP_ARGMAX || + ff.operators[last_op]->op_type == OP_SAMPLING); + last_op -= 1; + while (ff.operators[last_op]->op_type == OP_WEIGHT && last_op > 0) { + last_op -= 1; + } + if (ff.operators[last_op] == this) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(2, FID_DATA); + } return runtime->execute_index_space(ctx, launcher); } @@ -371,8 +382,8 @@ void Softmax::inference_task(Task const *task, Context ctx, Runtime *runtime) { assert(task->regions.size() == regions.size()); - assert(regions.size() == 3); - assert(task->regions.size() == 3); + assert(regions.size() == 3 || regions.size() == 2); + bool is_last_op = (regions.size() == 3); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_tokens == 0) { return; @@ -384,9 +395,16 @@ void Softmax::inference_task(Task const *task, m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorW output_grad = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - inference_kernel_wrapper(m, bc, input, output, output_grad); + GenericTensorAccessorW output_grad; + if (is_last_op) { + output_grad = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); + } + inference_kernel_wrapper(m, bc, is_last_op, input, output, output_grad); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; @@ -428,7 +446,7 @@ FutureMap Softmax::peft_bwd(FFModel const &ff, launcher.add_region_requirement( RegionRequirement(batch_outputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, + READ_ONLY, EXCLUSIVE, batch_outputs[0]->region_grad)); launcher.add_field(1, FID_DATA); diff --git a/src/parallel_ops/combine.cc b/src/parallel_ops/combine.cc index 7260a2745e..737998b141 100644 --- a/src/parallel_ops/combine.cc +++ b/src/parallel_ops/combine.cc @@ -205,6 +205,11 @@ void Combine::create_input_partition_inference( batch_outputs[0]->parallel_is, batch_inputs[0]->region, inference_input_lps[batch_inputs[0]]); + ff.create_disjoint_partition(batch_inputs[0]->num_dims, + batch_inputs[0]->dims, + batch_inputs[0]->parallel_is, + batch_outputs[0]->region_grad, + inference_output_grad_lps[batch_outputs[0]]); } FutureMap Combine::inference(FFModel const &ff, @@ -244,6 +249,25 @@ FutureMap Combine::inference(FFModel const &ff, EXCLUSIVE, batch_outputs[0]->region)); launcher.add_field(1, FID_DATA); + // if this is the last operator, we add the region below in order to copy the + // output to the grad tensor + assert(ff.config.computationMode == COMP_MODE_INFERENCE); + int last_op = ff.operators.size() - 1; + assert(ff.operators[last_op]->op_type == OP_ARGMAX || + ff.operators[last_op]->op_type == OP_SAMPLING); + last_op -= 1; + while (ff.operators[last_op]->op_type == OP_WEIGHT && last_op > 0) { + last_op -= 1; + } + if (ff.operators[last_op] == this) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + 
batch_outputs[0]->region_grad)); + launcher.add_field(2, FID_DATA); + } return runtime->execute_index_space(ctx, launcher); } @@ -300,7 +324,7 @@ FutureMap Combine::peft_bwd(FFModel const &ff, 0 /*mapper_id*/, machine_view_hash); launcher.add_region_requirement( - RegionRequirement(batch_outputs[0]->part_grad, + RegionRequirement(inference_output_grad_lps[batch_outputs[0]], 0 /*projection id*/, READ_ONLY, EXCLUSIVE, @@ -309,7 +333,7 @@ FutureMap Combine::peft_bwd(FFModel const &ff, launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, + WRITE_ONLY, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(1, FID_DATA); @@ -400,6 +424,7 @@ void Combine::forward_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { + printf("INF combine\n"); assert(regions.size() == 2); assert(task->regions.size() == 2); CombineMeta const *m = *((CombineMeta **)task->local_args); @@ -442,6 +467,7 @@ void Combine::peft_bwd_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { + printf("BWD combine\n"); assert(regions.size() == 2); assert(task->regions.size() == 2); CombineMeta const *m = *((CombineMeta **)task->local_args); diff --git a/src/parallel_ops/partition.cc b/src/parallel_ops/partition.cc index 353b3ce398..df3c56346c 100644 --- a/src/parallel_ops/partition.cc +++ b/src/parallel_ops/partition.cc @@ -197,6 +197,11 @@ void Repartition::create_input_partition_inference( batch_outputs[0]->parallel_is, batch_inputs[0]->region, inference_input_lps[batch_inputs[0]]); + ff.create_disjoint_partition(batch_inputs[0]->num_dims, + batch_inputs[0]->dims, + batch_inputs[0]->parallel_is, + batch_outputs[0]->region_grad, + inference_output_grad_lps[batch_outputs[0]]); } FutureMap diff --git a/src/parallel_ops/reduction.cc b/src/parallel_ops/reduction.cc index 5dca591328..2e7b4b6723 100644 --- a/src/parallel_ops/reduction.cc +++ b/src/parallel_ops/reduction.cc @@ -122,6 +122,13 @@ void Reduction::create_input_partition_inference( batch_outputs[0]->parallel_is, batch_inputs[0]->region, inference_input_lps[batch_inputs[0]]); + // output_grad_lp is an aliased partitioning along the replica dim + ff.create_aliased_partition(batch_inputs[0]->num_dims, + batch_inputs[0]->dims, + reduction_dim, + batch_inputs[0]->parallel_is, + batch_outputs[0]->region_grad, + inference_output_grad_lps[batch_outputs[0]]); } OpMeta *Reduction::init_task(Task const *task, diff --git a/src/parallel_ops/replicate.cc b/src/parallel_ops/replicate.cc index 701db40b49..e4f19faa0a 100644 --- a/src/parallel_ops/replicate.cc +++ b/src/parallel_ops/replicate.cc @@ -122,6 +122,12 @@ void Replicate::create_input_partition_inference( batch_outputs[0]->parallel_is, batch_inputs[0]->region, inference_input_lps[batch_inputs[0]]); + // output_grad_lp is a disjoint partition + ff.create_disjoint_partition(batch_inputs[0]->num_dims, + batch_inputs[0]->dims, + batch_inputs[0]->parallel_is, + batch_outputs[0]->region_grad, + inference_output_grad_lps[batch_outputs[0]]); } OpMeta *Replicate::init_task(Task const *task, @@ -274,10 +280,10 @@ void Replicate::forward(FFModel const &ff) { } FutureMap Replicate::peft_bwd(FFModel const &ff, - BatchConfigFuture const &bc, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime 
*runtime = ff.config.lg_hlr; @@ -297,17 +303,19 @@ FutureMap Replicate::peft_bwd(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part_grad, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_outputs[0]->region_grad)); + launcher.add_region_requirement( + RegionRequirement(inference_output_grad_lps[batch_outputs[0]], + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[0]->region_grad)); launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part_grad, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region_grad)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); launcher.add_field(1, FID_DATA); return runtime->execute_index_space(ctx, launcher); } diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 11311053e9..eca8c31785 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -6785,7 +6785,8 @@ void register_flexflow_internal_tasks(Runtime *runtime, } } { - TaskVariantRegistrar registrar(REPLICATE_PEFT_BWD_TASK_ID, "Replicate PEFT Backward"); + TaskVariantRegistrar registrar(REPLICATE_PEFT_BWD_TASK_ID, + "Replicate PEFT Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { From f3a97ff3832261393b4c7f0c6231fe292b4964c9 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 31 Jan 2024 02:45:49 +0000 Subject: [PATCH 137/198] fix --- src/parallel_ops/combine.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/parallel_ops/combine.cc b/src/parallel_ops/combine.cc index 737998b141..354faa5e1a 100644 --- a/src/parallel_ops/combine.cc +++ b/src/parallel_ops/combine.cc @@ -261,7 +261,7 @@ FutureMap Combine::inference(FFModel const &ff, } if (ff.operators[last_op] == this) { launcher.add_region_requirement( - RegionRequirement(batch_outputs[0]->part_grad, + RegionRequirement(inference_output_grad_lps[batch_outputs[0]], 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, @@ -424,9 +424,8 @@ void Combine::forward_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - printf("INF combine\n"); - assert(regions.size() == 2); - assert(task->regions.size() == 2); + // assert(regions.size() == 2); + // assert(task->regions.size() == 2); CombineMeta const *m = *((CombineMeta **)task->local_args); DataType data_type = m->input_type[0]; if (data_type == DT_HALF) { From e0a58bb73364660be05aa8162e960399b5f9d557 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 31 Jan 2024 23:48:02 +0000 Subject: [PATCH 138/198] fix combine and fwd-bwd pass dependencies --- include/flexflow/model.h | 1 + src/ops/softmax.cc | 1 + src/parallel_ops/allreduce.cc | 2 +- src/parallel_ops/combine.cc | 60 +++++++++++++++----------------- src/runtime/inference_manager.cc | 24 ++++--------- src/runtime/model.cc | 36 ++++++++++++++++--- 6 files changed, 69 insertions(+), 55 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 974a079ddb..b3a6a85808 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -1124,6 +1124,7 @@ class FFModel { Legion::IndexSpace get_task_is(Legion::Domain const &domain) const; Legion::IndexSpace get_task_is(ParallelConfig const &pc) const; Legion::IndexSpace get_task_is(MachineView const &view) const; + bool need_to_add_combine(int 
layer_idx) const; bool is_mlp_block(int layer_idx) const; void create_operators_from_layers(); Op *create_operator_from_layer(Layer *layer, diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index cfc3cf6e40..90f77ab760 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -360,6 +360,7 @@ FutureMap Softmax::inference(FFModel const &ff, assert(ff.config.computationMode == COMP_MODE_INFERENCE); int last_op = ff.operators.size() - 1; assert(ff.operators[last_op]->op_type == OP_ARGMAX || + ff.operators[last_op]->op_type == OP_ARG_TOPK || ff.operators[last_op]->op_type == OP_SAMPLING); last_op -= 1; while (ff.operators[last_op]->op_type == OP_WEIGHT && last_op > 0) { diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc index 4478a2aedc..05c2761e3b 100644 --- a/src/parallel_ops/allreduce.cc +++ b/src/parallel_ops/allreduce.cc @@ -365,7 +365,7 @@ FutureMap AllReduce::peft_bwd(FFModel const &ff, launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, 0 /*projection id*/, - READ_WRITE, + WRITE_ONLY, EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(0, FID_DATA); diff --git a/src/parallel_ops/combine.cc b/src/parallel_ops/combine.cc index 354faa5e1a..a328ec7cac 100644 --- a/src/parallel_ops/combine.cc +++ b/src/parallel_ops/combine.cc @@ -199,12 +199,18 @@ void Combine::create_input_partition_inference( assert(ff.config.computationMode == COMP_MODE_INFERENCE); assert(batch_outputs[0]->part != LogicalPartition::NO_PART); assert(batch_inputs[0]->part != LogicalPartition::NO_PART); - // input_lp is a disjoint partition + // partition batch_inputs[0]->region into inference_input_lps[batch_inputs[0]] + // according to the partitioning of batch_outputs[0] (i.e. make the + // partitioned dimension whole again by combining the partitions) ff.create_disjoint_partition(batch_outputs[0]->num_dims, batch_outputs[0]->dims, batch_outputs[0]->parallel_is, batch_inputs[0]->region, inference_input_lps[batch_inputs[0]]); + // partition batch_outputs[0]->region_grad into + // inference_output_grad_lps[batch_outputs[0]] according to the partitioning + // of batch_inputs[0] (i.e. restore the partition in the dimension that was + // combined in the forward pass) ff.create_disjoint_partition(batch_inputs[0]->num_dims, batch_inputs[0]->dims, batch_inputs[0]->parallel_is, @@ -249,25 +255,6 @@ FutureMap Combine::inference(FFModel const &ff, EXCLUSIVE, batch_outputs[0]->region)); launcher.add_field(1, FID_DATA); - // if this is the last operator, we add the region below in order to copy the - // output to the grad tensor - assert(ff.config.computationMode == COMP_MODE_INFERENCE); - int last_op = ff.operators.size() - 1; - assert(ff.operators[last_op]->op_type == OP_ARGMAX || - ff.operators[last_op]->op_type == OP_SAMPLING); - last_op -= 1; - while (ff.operators[last_op]->op_type == OP_WEIGHT && last_op > 0) { - last_op -= 1; - } - if (ff.operators[last_op] == this) { - launcher.add_region_requirement( - RegionRequirement(inference_output_grad_lps[batch_outputs[0]], - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region_grad)); - launcher.add_field(2, FID_DATA); - } return runtime->execute_index_space(ctx, launcher); } @@ -310,23 +297,28 @@ FutureMap Combine::peft_bwd(FFModel const &ff, assert(numOutputs == 1); assert(numInputs == 1); assert(batch_inputs[0]->data_type == batch_outputs[0]->data_type); - DataType data_type = batch_inputs[0]->data_type; - parallel_is = batch_outputs[0]->parallel_is; - MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + DataType data_type = inputs[0]->data_type; + + // Warning: we need to use batch_inputs[0] here, instead of the usual + // batch_outputs[0] + parallel_is = batch_inputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_inputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); IndexLauncher launcher(COMBINE_PEFT_BWD_TASK_ID, parallel_is, - TaskArgument(NULL, 0), + TaskArgument(&data_type, sizeof(DataType)), argmap, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.add_future(bc); launcher.add_region_requirement( RegionRequirement(inference_output_grad_lps[batch_outputs[0]], 0 /*projection id*/, - READ_ONLY, + READ_WRITE, EXCLUSIVE, batch_outputs[0]->region_grad)); launcher.add_field(0, FID_DATA); @@ -424,8 +416,8 @@ void Combine::forward_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - // assert(regions.size() == 2); - // assert(task->regions.size() == 2); + assert(regions.size() == 2); + assert(task->regions.size() == 2); CombineMeta const *m = *((CombineMeta **)task->local_args); DataType data_type = m->input_type[0]; if (data_type == DT_HALF) { @@ -466,15 +458,19 @@ void Combine::peft_bwd_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - printf("BWD combine\n"); assert(regions.size() == 2); assert(task->regions.size() == 2); - CombineMeta const *m = *((CombineMeta **)task->local_args); + // CombineMeta const *m = *((CombineMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + // TODO: figure out why m->output_type[0] or m->input_type[0] are not working + DataType data_type = *((DataType *)task->args); GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( - m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + data_type, regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( - m->input_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - DataType data_type = output_grad.data_type; + data_type, regions[1], task->regions[1], FID_DATA, ctx, runtime); assert(input_grad.data_type == data_type); assert(output_grad.domain == input_grad.domain); if (data_type == DT_FLOAT) { diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index ae3b7eaa14..066701f65c 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -385,28 +385,18 @@ void InferenceManager::peft_bwd(FFModel *model, int last_op = model->operators.size() - 1; // Assert that the last operator must be argmax or sampling assert(model->operators[last_op]->op_type == OP_ARGMAX || + model->operators[last_op]->op_type == OP_ARG_TOPK || model->operators[last_op]->op_type == OP_SAMPLING); last_op -= 1; while (model->operators[last_op]->op_type == OP_WEIGHT && last_op > 0) { last_op -= 1; } - if (model->config.tensor_parallelism_degree > 1) { - if (model->operators[last_op]->op_type == OP_FUSED) { - FusedOp *fused_op = static_cast(model->operators[last_op]); - assert(fused_op->op_op_type[fused_op->numOperators - 1] == OP_COMBINE); - assert(fused_op->op_op_type[fused_op->numOperators - 2] == OP_SOFTMAX); - } else { - assert(model->operators[last_op]->op_type == OP_COMBINE); - assert(model->operators[last_op - 1]->op_type == OP_SOFTMAX); - } - } else 
{ - // Assert that the previous operator must be softmax - assert(model->operators[last_op]->op_type == OP_SOFTMAX || - model->operators[last_op]->op_type == OP_FUSED); - if (model->operators[last_op]->op_type == OP_FUSED) { - FusedOp *fused_op = static_cast(model->operators[last_op]); - assert(fused_op->op_op_type[fused_op->numOperators - 1] == OP_SOFTMAX); - } + // Assert that the previous operator must be softmax + assert(model->operators[last_op]->op_type == OP_SOFTMAX || + model->operators[last_op]->op_type == OP_FUSED); + if (model->operators[last_op]->op_type == OP_FUSED) { + FusedOp *fused_op = static_cast(model->operators[last_op]); + assert(fused_op->op_op_type[fused_op->numOperators - 1] == OP_SOFTMAX); } for (int o = last_op; o >= 0; o--) { Op *op = model->operators[o]; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index eca8c31785..6d77730e47 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3270,6 +3270,34 @@ bool FFModel::is_mlp_block(int layer_idx) const { return false; } +bool FFModel::need_to_add_combine(int layer_idx) const { + if (config.computationMode != COMP_MODE_INFERENCE || + config.tensor_parallelism_degree == 1 || layers.size() <= 2) { + return false; + } + auto const &l = layers[layer_idx]; + // softmax followed by argmax/arg_topk: add combine before softmax + if (layer_idx == layers.size() - 2) { + auto const &l_next = layers[layer_idx + 1]; + if (l->op_type == OP_SOFTMAX && + (l_next->op_type == OP_ARG_TOPK || l_next->op_type == OP_ARGMAX)) { + return true; + } else { + return false; + } + } + // argmax/arg_topk not precedent by softmax: add combine before + // argmax/arg_topk + if (layer_idx == layers.size() - 1 && + (l->op_type == OP_ARG_TOPK || l->op_type == OP_ARGMAX)) { + auto const &l_prev = layers[layer_idx - 1]; + if (l_prev->op_type == OP_SOFTMAX) { + return false; + } + return true; + } + return false; +} void FFModel::create_operators_from_layers() { std::map tensors_to_parallel_tensors; // for (auto const &l : layers) { @@ -3283,11 +3311,9 @@ void FFModel::create_operators_from_layers() { inputs.push_back(tensors_to_parallel_tensors[l->inputs[i]]); } Op *op = nullptr; - // add a combine before arg_topk / argmax - if (config.computationMode == COMP_MODE_INFERENCE && - config.tensor_parallelism_degree > 1 && - (layer_idx == layers.size() - 1 && - (l->op_type == OP_ARG_TOPK || l->op_type == OP_ARGMAX))) { + // add a combine before last arg_max / arg_topk or before second-to-last + // softmax + if (need_to_add_combine(layer_idx)) { std::vector partitioned_inputs; assert(inputs.size() == 1); Combine *comb = new Combine(*this, From 50fc13d20a8175720e031b785ec21b9a3248722d Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 31 Jan 2024 23:53:23 +0000 Subject: [PATCH 139/198] fix replicate bwd --- src/parallel_ops/replicate.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/parallel_ops/replicate.cc b/src/parallel_ops/replicate.cc index e4f19faa0a..2a3818e212 100644 --- a/src/parallel_ops/replicate.cc +++ b/src/parallel_ops/replicate.cc @@ -291,8 +291,12 @@ FutureMap Replicate::peft_bwd(FFModel const &ff, assert(numInputs == 1); assert(batch_inputs[0]->data_type == batch_outputs[0]->data_type); DataType data_type = batch_inputs[0]->data_type; - parallel_is = batch_outputs[0]->parallel_is; - MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + + // Warning: we need to use batch_inputs[0] here, instead of the usual + // batch_outputs[0] + parallel_is = batch_inputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_inputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); IndexLauncher launcher(REPLICATE_PEFT_BWD_TASK_ID, From f2c9a052ddbf4c469f2755c224d0d2faaa1509c3 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 1 Feb 2024 04:58:32 +0000 Subject: [PATCH 140/198] fix --- src/runtime/inference_manager.cc | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 066701f65c..66c47e6559 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -391,13 +391,6 @@ void InferenceManager::peft_bwd(FFModel *model, while (model->operators[last_op]->op_type == OP_WEIGHT && last_op > 0) { last_op -= 1; } - // Assert that the previous operator must be softmax - assert(model->operators[last_op]->op_type == OP_SOFTMAX || - model->operators[last_op]->op_type == OP_FUSED); - if (model->operators[last_op]->op_type == OP_FUSED) { - FusedOp *fused_op = static_cast(model->operators[last_op]); - assert(fused_op->op_op_type[fused_op->numOperators - 1] == OP_SOFTMAX); - } for (int o = last_op; o >= 0; o--) { Op *op = model->operators[o]; if (op->op_type == OP_WEIGHT) { From cd68f5d0cf6348410b718283517e2cfa947309ee Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 3 Feb 2024 16:35:54 +0000 Subject: [PATCH 141/198] let user control amount of peft memory --- include/flexflow/config.h | 1 + inference/python/incr_decoding.py | 5 ++- inference/python/spec_infer.py | 5 ++- python/flexflow/core/__init__.py | 5 ++- python/flexflow/serve/__init__.py | 32 +++++++++++++++++-- src/runtime/model.cc | 19 +++++++++-- src/runtime/model.cu | 4 +++ .../python_test_configs/generate_configs.py | 5 ++- 8 files changed, 67 insertions(+), 9 deletions(-) diff --git a/include/flexflow/config.h b/include/flexflow/config.h index 2f112d4fc9..9bb230132a 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -154,6 +154,7 @@ class FFConfig { size_t offload_reserve_space_size; DataType quantization_type; // PEFT related fields + bool enable_peft; size_t peft_activation_reserve_space_size; size_t peft_weight_reserve_space_size; // Control parallelizable dimensions diff --git a/inference/python/incr_decoding.py b/inference/python/incr_decoding.py index 4a146ab503..ed57453762 100644 --- a/inference/python/incr_decoding.py +++ b/inference/python/incr_decoding.py @@ -51,9 +51,12 @@ def get_configs(): "tensor_parallelism_degree": 1, "pipeline_parallelism_degree": 4, "offload": False, - "offload_reserve_space_size": 1024**2, + "offload_reserve_space_size": 8 * 1024, # 8GB "use_4bit_quantization": False, "use_8bit_quantization": False, + "enable_peft": False, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "inference_debugging": False, "fusion": True, diff --git a/inference/python/spec_infer.py b/inference/python/spec_infer.py index c9fb5cc7bb..b31ddf4604 100644 --- a/inference/python/spec_infer.py +++ b/inference/python/spec_infer.py @@ -51,9 +51,12 @@ def get_configs(): "tensor_parallelism_degree": 2, "pipeline_parallelism_degree": 2, "offload": False, - "offload_reserve_space_size": 1024**2, + "offload_reserve_space_size": 8 * 1024, # 8GB "use_4bit_quantization": 
False, "use_8bit_quantization": False, + "enable_peft": False, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "inference_debugging": False, "fusion": True, diff --git a/python/flexflow/core/__init__.py b/python/flexflow/core/__init__.py index d7b1a595d2..2614518acf 100644 --- a/python/flexflow/core/__init__.py +++ b/python/flexflow/core/__init__.py @@ -87,7 +87,10 @@ "offload": "-offload", "offload_reserve_space_size": "-offload-reserve-space-size", "use_4bit_quantization": "--4bit-quantization", - "use_8bit_quantization": "--8bit-quantization" + "use_8bit_quantization": "--8bit-quantization", + "enable_peft": "", + "peft_activation_reserve_space_size": "-peft-activation-reserve-space-size", + "peft_weight_reserve_space_size": "-peft-weight-reserve-space-size", } diff --git a/python/flexflow/serve/__init__.py b/python/flexflow/serve/__init__.py index 274b431ad8..5805670ae0 100644 --- a/python/flexflow/serve/__init__.py +++ b/python/flexflow/serve/__init__.py @@ -44,6 +44,9 @@ def init( offload_reserve_space_size: Optional[int] = None, use_4bit_quantization: Optional[bool] = None, use_8bit_quantization: Optional[bool] = None, + enable_peft: Optional[bool] = None, + peft_activation_reserve_space_size: Optional[int] = None, + peft_weight_reserve_space_size: Optional[int] = None, profiling: Optional[bool] = None, inference_debugging: Optional[bool] = None, fusion: Optional[bool] = None, @@ -68,9 +71,12 @@ def init( - tensor_parallelism_degree: the degree of parallelization in the tensor parallel dimension (using the Megatron technique), defaults to 1 - pipeline_parallelism_degree: the degree of parallelization in the pipeline parallel dimension, defaults to 1 - offload: whether to enable offloading of the weights to CPU, defaults to False - - offload_reserve_space_size: the space (in MB) to reserve on CPU for offloading, default to 1024^2 + - offload_reserve_space_size: the space (in MB) to reserve on CPU for offloading, defaults to 8 GB - use_4bit_quantization: whether to use 4-bit quantization, defaults to False - use_8bit_quantization: whether to use 8-bit quantization, defaults to False + - enable_peft: whether to enable the use of PEFT, defaults to False + - peft_activation_reserve_space_size: the space (in MB) to reserve on GPU for PEFT activations, default to 1 GB + - peft_weight_reserve_space_size: the space (in MB) to reserve on GPU for PEFT weights, default to 1 GB - profiling: whether to enable the FlexFlow profiling mode, defaults to False - inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False - fusion: whether to enable the FlexFlow operator fusion optimization, defaults to True @@ -98,12 +104,18 @@ def init( :type pipeline_parallelism_degree: Optional[int], optional :param offload: whether to enable offloading of the weights to CPU, defaults to False :type offload: Optional[bool], optional - :param offload_reserve_space_size: the space (in MB) to reserve on CPU for offloading, default to 1024^2 + :param offload_reserve_space_size: the space (in MB) to reserve on CPU for offloading, defaults to 8 GB :type offload_reserve_space_size: Optional[int], optional :param use_4bit_quantization: whether to use 4-bit quantization, defaults to False :type use_4bit_quantization: Optional[bool], optional :param use_8bit_quantization: whether to use 8-bit quantization, defaults to False :type use_8bit_quantization: Optional[bool], optional + :param 
enable_peft: whether to enable the use of PEFT, defaults to False + :type enable_peft: Optional[bool], optional + :param peft_activation_reserve_space_size: the space (in MB) to reserve on GPU for PEFT activations, default to 1 GB + :type peft_activation_reserve_space_size: Optional[int], optional + :param peft_weight_reserve_space_size: the space (in MB) to reserve on GPU for PEFT weights, default to 1 GB + :type peft_weight_reserve_space_size: Optional[int], optional :param profiling: whether to enable the FlexFlow profiling mode, defaults to False :type profiling: Optional[bool], optional :param inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False @@ -131,6 +143,9 @@ def init( offload_reserve_space_size is not None, use_4bit_quantization is not None, use_8bit_quantization is not None, + enable_peft is not None, + peft_activation_reserve_space_size is not None, + peft_weight_reserve_space_size is not None, profiling is not None, inference_debugging is not None, fusion is not None, @@ -156,6 +171,9 @@ def init( "offload_reserve_space_size": offload_reserve_space_size, "use_4bit_quantization": use_4bit_quantization, "use_8bit_quantization": use_8bit_quantization, + "enable_peft": enable_peft, + "peft_activation_reserve_space_size": peft_activation_reserve_space_size, + "peft_weight_reserve_space_size": peft_weight_reserve_space_size, "profiling": profiling, "inference_debugging": inference_debugging, "fusion": fusion, @@ -176,6 +194,8 @@ def init( "tensor_parallelism_degree", "pipeline_parallelism_degree", "offload_reserve_space_size", + "peft_activation_reserve_space_size", + "peft_weight_reserve_space_size", ] for param in positive_int_params: __check_positive_int(configs_dict, param) @@ -194,11 +214,17 @@ def init( if configs_dict.get("offload", None) is None: configs_dict["offload"] = False if configs_dict.get("offload_reserve_space_size", None) is None: - configs_dict["offload_reserve_space_size"] = 1024**2 + configs_dict["offload_reserve_space_size"] = 8*1024**3 if configs_dict.get("use_4bit_quantization", None) is None: configs_dict["use_4bit_quantization"] = False if configs_dict.get("use_8bit_quantization", None) is None: configs_dict["use_8bit_quantization"] = False + if configs_dict.get("enable_peft", None) is None: + configs_dict["enable_peft"] = False + if configs_dict.get("peft_activation_reserve_space_size", None) is None: + configs_dict["peft_activation_reserve_space_size"] = 8*1024**3 + if configs_dict.get("peft_weight_reserve_space_size", None) is None: + configs_dict["peft_weight_reserve_space_size"] = 1024**3 if configs_dict.get("profiling", None) is None: configs_dict["profiling"] = False if configs_dict.get("inference_debugging", None) is None: diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 6d77730e47..e73415faaf 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1524,8 +1524,9 @@ FFRuntime::FFRuntime(FFConfig &config) { info.offload_reserve_space_size = config.cpu_offload ? config.offload_reserve_space_size : 0; info.peft_activation_reserve_space_size = - config.peft_activation_reserve_space_size; - info.peft_weight_reserve_space_size = config.peft_weight_reserve_space_size; + config.enable_peft ? config.peft_activation_reserve_space_size : 0; + info.peft_weight_reserve_space_size = + config.enable_peft ? 
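The default-filling logic added to serve/__init__.py a few hunks above checks each new key one at a time; a compact, runnable Python sketch of the same pattern for the three PEFT options is shown here, with the default values copied verbatim from the patch (a compact equivalent of the per-key checks, not a copy of the actual init() code):

    def fill_peft_defaults(configs_dict):
        # Values copied from the defaults assigned in serve/__init__.py above.
        defaults = {
            "enable_peft": False,
            "peft_activation_reserve_space_size": 8 * 1024**3,
            "peft_weight_reserve_space_size": 1024**3,
        }
        for key, value in defaults.items():
            if configs_dict.get(key) is None:
                configs_dict[key] = value
        return configs_dict

    print(fill_peft_defaults({"enable_peft": True}))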
config.peft_weight_reserve_space_size : 0; info.quantization_type = config.quantization_type; info.allowTensorOpMathConversion = config.allow_tensor_op_math_conversion; argmap.set_point(*it, TaskArgument(&info, sizeof(FFInitInfo))); @@ -4062,6 +4063,7 @@ struct DefaultConfig { const static size_t offloadReserveSpaceSize = (size_t)8 * 1024 * 1024 * 1024; // 8 GB // PEFT related fields + const static bool enablePeft = false; const static size_t peftActivationReserveSpaceSize = (size_t)1 * 1024 * 1024 * 1024; // 1GB const static size_t peftWeightReserveSpaceSize = @@ -4102,6 +4104,7 @@ FFConfig::FFConfig() { cpu_offload = DefaultConfig::cpuOffload; offload_reserve_space_size = DefaultConfig::offloadReserveSpaceSize; // PEFT related fields + enable_peft = DefaultConfig::enablePeft; peft_activation_reserve_space_size = DefaultConfig::peftActivationReserveSpaceSize; peft_weight_reserve_space_size = DefaultConfig::peftWeightReserveSpaceSize; @@ -4227,6 +4230,18 @@ void FFConfig::parse_args(char **argv, int argc) { quantization_type = DT_INT8; continue; } + if ((!strcmp(argv[i], "-enable-peft"))) { + enable_peft = true; + continue; + } + if (!strcmp(argv[i], "-peft-activation-reserve-space-size")) { + peft_activation_reserve_space_size = atoll(argv[++i]) * 1024 * 1024; + continue; + } + if (!strcmp(argv[i], "-peft-weight-reserve-space-size")) { + peft_weight_reserve_space_size = atoll(argv[++i]) * 1024 * 1024; + continue; + } if ((!strcmp(argv[i], "--only-data-parallel"))) { only_data_parallel = true; continue; diff --git a/src/runtime/model.cu b/src/runtime/model.cu index 754a6b18d7..80f4fdf143 100644 --- a/src/runtime/model.cu +++ b/src/runtime/model.cu @@ -164,6 +164,8 @@ FFHandler handle.peft_activation_allocator = new MemoryAllocator(gpu_mem); handle.peft_activation_allocator->create_legion_instance( workspaceInst, info->peft_activation_reserve_space_size); + } else { + handle.peft_activation_allocator = nullptr; } if (info->peft_weight_reserve_space_size > 0) { @@ -188,6 +190,8 @@ FFHandler void *ptr = workspaceInst.pointer_untyped(0, sizeof(char)); handle.peft_weight_allocator = new PEFTWeightAllocator(ptr, info->peft_weight_reserve_space_size); + } else { + handle.peft_weight_allocator = nullptr; } // checkCUDA(cudaMalloc(&handle.workSpace, handle.workSpaceSize)); #ifdef FF_USE_NCCL diff --git a/tests/inference/python_test_configs/generate_configs.py b/tests/inference/python_test_configs/generate_configs.py index ebaadade32..b5cad16c65 100644 --- a/tests/inference/python_test_configs/generate_configs.py +++ b/tests/inference/python_test_configs/generate_configs.py @@ -14,9 +14,12 @@ "tensor_parallelism_degree": 1, "pipeline_parallelism_degree": 4, "offload": False, - "offload_reserve_space_size": 1024**2, + "offload_reserve_space_size": 8 * 1024, # 8 GB "use_4bit_quantization": False, "use_8bit_quantization": False, + "enable_peft": False, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "inference_debugging": False, "fusion": True, From 64a59d891ae3db48c8234af9bf46fadf48c4bd9b Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 3 Feb 2024 17:17:56 +0000 Subject: [PATCH 142/198] only run peft_bwd if peft is enabled --- src/runtime/request_manager.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index cbb21e03e0..2eebc070d6 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -2206,7 +2206,9 @@ 
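The three new command-line flags above (-enable-peft, -peft-activation-reserve-space-size, -peft-weight-reserve-space-size) take their sizes in MB and are scaled to bytes with * 1024 * 1024 inside parse_args. A small stand-alone Python sketch of that scaling, useful for sanity-checking the values used in the Python configs (1024 means 1 GiB, 8 * 1024 means 8 GiB):

    def mb_to_bytes(size_mb: int) -> int:
        # Mirrors the `atoll(argv[++i]) * 1024 * 1024` scaling in FFConfig::parse_args.
        if size_mb <= 0:
            raise ValueError("reserve space sizes must be positive integers (in MB)")
        return size_mb * 1024 * 1024

    print(mb_to_bytes(1024))      # 1073741824 bytes, i.e. 1 GiB
    print(mb_to_bytes(8 * 1024))  # 8589934592 bytes, i.e. 8 GiB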
GenerationResult RequestManager::generate_incr_decoding( BatchConfigFuture bcf = prepare_next_batch(next_batch.first, next_batch.second); FutureMap fm = im->inference(llm, 0, bcf); - im->peft_bwd(llm, 0, bcf); + if (llm->config.enable_peft) { + im->peft_bwd(llm, 0, bcf); + } assert(fm.get_future_map_domain().get_volume() == 1); InferenceResultFuture irf = fm.get_future(0); batch_pipeline.push(std::make_pair(bcf, irf)); From 32a07165cf1a68e8b15c8f591a66c397888712ec Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 6 Feb 2024 05:46:25 +0000 Subject: [PATCH 143/198] fix rms norm inference region reqs --- src/ops/residual_rms_norm.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index e549e5f6da..264c12f004 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -394,7 +394,7 @@ FutureMap assert(batch_outputs[0]->region == batch_inputs[0]->region); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, - READ_ONLY, + READ_WRITE, EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(0, FID_DATA); @@ -412,7 +412,7 @@ FutureMap launcher.add_field(2, FID_DATA); launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, - READ_WRITE, + READ_ONLY, EXCLUSIVE, weights[0]->region)); launcher.add_field(3, FID_DATA); @@ -423,7 +423,7 @@ FutureMap regions[0](I/O): input1 / residual output regions[1](I): input2 regions[2](O): output - regions[3](I/O): weight + regions[3](I): weight */ void ResidualRMSNorm::inference_task(Task const *task, std::vector const ®ions, From a37b173adebb0f90767a16b4421a9de6a2ba42ee Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 7 Feb 2024 06:37:37 +0000 Subject: [PATCH 144/198] fix in-place fusion (part 1) --- src/ops/fused.cc | 140 +++++++++++++++++++++++++++++++++++++++---- src/runtime/model.cc | 15 ++++- 2 files changed, 143 insertions(+), 12 deletions(-) diff --git a/src/ops/fused.cc b/src/ops/fused.cc index 8afd61aece..5f15e0b1cb 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -14,6 +14,7 @@ */ #include "flexflow/ops/fused.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/model.h" #include "flexflow/ops/batch_matmul.h" #include "flexflow/ops/batch_norm.h" @@ -87,12 +88,32 @@ FusedOp::FusedOp(FFModel &model, Op *op) // weights[i]->owner_idx = i; weight_data_types[i] = op->weights[i]->data_type; } - numOutputs = op->numOutputs; - for (int i = 0; i < numOutputs; i++) { - outputs[i] = op->outputs[i]; - outputs[i]->owner_op = this; - outputs[i]->owner_idx = i; - output_data_types[i] = op->outputs[i]->data_type; + numOutputs = 0; + for (int i = 0; i < op->numOutputs; i++) { + bool found = false; + // Handle in-place outputs + for (int j = 0; j < numInputs; j++) { + if (inputs[j]->region == op->outputs[i]->region) { + // This output is one of the inputs + assert(!found); + assert(inputs[j]->region != LogicalRegion::NO_REGION); + op_output_source[i] = SOURCE_INPUT; + op_input_idx[i] = j; + found = true; + break; + } + } + if (found) { + // do nothing + } else { + outputs[numOutputs] = op->outputs[i]; + output_data_types[numOutputs] = op->outputs[i]->data_type; + op_output_source[i] = SOURCE_OUTPUT; + op_output_idx[i] = numOutputs; + outputs[numOutputs]->owner_op = this; + outputs[numOutputs]->owner_idx = numOutputs; + numOutputs++; + } } numOperators = 1; op_num_inputs[0] = op->numInputs; @@ -109,10 +130,53 @@ FusedOp::FusedOp(FFModel &model, Op *op) 
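The FusedOp constructor change above is the heart of the in-place fusion fix: an output whose region aliases one of the fused operator's inputs is recorded with source SOURCE_INPUT instead of being given a fresh output slot. A minimal Python sketch of that bookkeeping, with plain integer ids standing in for LogicalRegion handles (an illustration of the idea, not the exact FlexFlow fields):

    SOURCE_INPUT, SOURCE_OUTPUT = "input", "output"

    def classify_outputs(input_regions, output_regions):
        fused_outputs = []      # regions that become the fused op's own outputs
        source, index = [], []  # per original output: where it lives, and at which slot
        for out in output_regions:
            if out in input_regions:                 # in-place output, aliases an input
                source.append(SOURCE_INPUT)
                index.append(input_regions.index(out))
            else:                                    # regular output, gets a new slot
                source.append(SOURCE_OUTPUT)
                index.append(len(fused_outputs))
                fused_outputs.append(out)
        return fused_outputs, source, index

    # e.g. a residual-style op whose first output is written in place over input region 7
    print(classify_outputs([7, 8], [7, 9]))   # ([9], ['input', 'output'], [0, 0])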
op_weight_source[i] = SOURCE_WEIGHT; op_weight_idx[i] = i; } - for (int i = 0; i < numOutputs; i++) { - op_output_source[i] = SOURCE_OUTPUT; - op_output_idx[i] = i; - } + // for (int i = 0; i < numOutputs; i++) { + // op_output_source[i] = SOURCE_OUTPUT; + // op_output_idx[i] = i; + // } +#if 0 + int input_offset = 0, weight_offset = 0, output_offset = 0; + printf("\nNew fused op: %s (%s), #input:%i, #output:%i, #weights:%i. Fused: " + "#inputs=%i, #outputs=%i, #weights=%i\n", + op->name, + get_operator_type_name(op->op_type).c_str(), + op->numInputs, + op->numOutputs, + op->numWeights, + numInputs, + numOutputs, + numWeights); + printf("op_input_idx:\t"); + for (int i = 0; i < input_offset + op->numInputs; i++) { + printf("%i\t", op_input_idx[i]); + } + printf("\n"); + printf("op_input_source:\t"); + for (int i = 0; i < input_offset + op->numInputs; i++) { + printf("%i\t", op_input_source[i]); + } + printf("\n"); + printf("op_output_idx:\t"); + for (int i = 0; i < output_offset + op->numOutputs; i++) { + printf("%i\t", op_output_idx[i]); + } + printf("\n"); + printf("op_output_source:\t"); + for (int i = 0; i < output_offset + op->numOutputs; i++) { + printf("%i\t", op_output_source[i]); + } + printf("\n"); + printf("op_weight_idx:\t"); + for (int i = 0; i < weight_offset + op->numWeights; i++) { + printf("%i\t", op_weight_idx[i]); + } + printf("\n"); + printf("op_weight_source:\t"); + for (int i = 0; i < weight_offset + op->numWeights; i++) { + printf("%i\t", op_weight_source[i]); + } + printf("\n"); +#endif } bool FusedOp::add_operator(FFModel &model, Op *op) { @@ -231,6 +295,18 @@ bool FusedOp::add_operator(FFModel &model, Op *op) { found = true; op_output_source[output_offset + i] = SOURCE_OUTPUT; op_output_idx[output_offset + i] = j; + break; + } + } + for (int j = 0; j < numInputs; j++) { + if (inputs[j]->region == op->outputs[i]->region) { + // This input is one of my inputs + assert(!found); + assert(inputs[j]->region != LogicalRegion::NO_REGION); + op_output_source[output_offset + i] = SOURCE_INPUT; + op_output_idx[output_offset + i] = j; + found = true; + break; } } if (found) { @@ -271,6 +347,50 @@ bool FusedOp::add_operator(FFModel &model, Op *op) { "Reach to the #outputs limit during fusion.\n" "Consider increase MAX_NUM_OUTPUTS to allow more fusions.\n"); } + +#if 0 + printf("\nAdd op: %s (%s), #input:%i, #output:%i, #weights:%i. 
Fused: " + "#inputs=%i, #outputs=%i, #weights=%i\n", + op->name, + get_operator_type_name(op->op_type).c_str(), + op->numInputs, + op->numOutputs, + op->numWeights, + numInputs, + numOutputs, + numWeights); + printf("op_input_idx:\t"); + for (int i = 0; i < input_offset + op->numInputs; i++) { + printf("%i\t", op_input_idx[i]); + } + printf("\n"); + printf("op_input_source:\t"); + for (int i = 0; i < input_offset + op->numInputs; i++) { + printf("%i\t", op_input_source[i]); + } + printf("\n"); + printf("op_output_idx:\t"); + for (int i = 0; i < output_offset + op->numOutputs; i++) { + printf("%i\t", op_output_idx[i]); + } + printf("\n"); + printf("op_output_source:\t"); + for (int i = 0; i < output_offset + op->numOutputs; i++) { + printf("%i\t", op_output_source[i]); + } + printf("\n"); + printf("op_weight_idx:\t"); + for (int i = 0; i < weight_offset + op->numWeights; i++) { + printf("%i\t", op_weight_idx[i]); + } + printf("\n"); + printf("op_weight_source:\t"); + for (int i = 0; i < weight_offset + op->numWeights; i++) { + printf("%i\t", op_weight_source[i]); + } + printf("\n"); +#endif + return true; } diff --git a/src/runtime/model.cc b/src/runtime/model.cc index e73415faaf..0a76f84445 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -2967,8 +2967,19 @@ bool FFModel::apply_fusion(std::vector const &operators, found = k; } } - assert(found >= 0); - op->inputs[idx] = fused_op->outputs[found]; + if (found >= 0) { + op->inputs[idx] = fused_op->outputs[found]; + } else { + for (int k = 0; k < fused_op->numInputs; k++) { + if (fused_op->inputs[k]->region == + op->inputs[idx]->region) { + assert(found == -1); + found = k; + } + } + assert(found >= 0); + op->inputs[idx] = fused_op->inputs[found]; + } } } // Insert op From 85f4d400142b29db74b89da749b765117bdf1b28 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 7 Feb 2024 06:51:53 +0000 Subject: [PATCH 145/198] fix inplace fusion (part 2) --- src/runtime/inference_manager.cc | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 66c47e6559..c7f2b6d5a9 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -664,9 +664,19 @@ void FFModel::compile_inference() { } for (int i = 0; i < fused->op_num_outputs[op]; i++) { int my_off = fused->op_output_idx[i + ooff]; - assert(fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT); - assert(fused->outputs[my_off]->region == - old_op->outputs[i]->region); + assert( + fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT || + (fused->op_output_source[i + ooff] == FusedOp::SOURCE_INPUT && + (old_op->op_type == OP_RESIDUAL_LAYERNORM || + old_op->op_type == OP_RESIDUAL_RMS_NORM || + old_op->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM))); + if (fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT) { + assert(fused->outputs[my_off]->region == + old_op->outputs[i]->region); + } else { + assert(fused->inputs[my_off]->region == + old_op->outputs[i]->region); + } } ioff += fused->op_num_inputs[op]; woff += fused->op_num_weights[op]; From bb56a993879b7ab4edffeecae0467179fc0d5595 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 7 Feb 2024 07:17:43 +0000 Subject: [PATCH 146/198] fix --- src/ops/fused.cc | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/ops/fused.cc b/src/ops/fused.cc index 5f15e0b1cb..7d0d829e51 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -482,8 +482,13 @@ void 
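The apply_fusion change above handles the consumer side of the same problem: when a later operator reads a tensor that the fused operator produced in place, the matching region is found among the fused op's inputs after the search over its outputs comes up empty. A short illustrative Python sketch of that resolution order (plain ids again, not FlexFlow types):

    def resolve_region(region, fused_outputs, fused_inputs):
        # Prefer the fused op's outputs, then fall back to its inputs,
        # which is where in-place results live.
        if region in fused_outputs:
            return ("output", fused_outputs.index(region))
        if region in fused_inputs:
            return ("input", fused_inputs.index(region))
        raise AssertionError("region is neither an output nor an input of the fused op")

    print(resolve_region(9, fused_outputs=[9], fused_inputs=[7, 8]))  # ('output', 0)
    print(resolve_region(7, fused_outputs=[9], fused_inputs=[7, 8]))  # ('input', 0)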
FusedOp::init_inference(FFModel const &ff, } for (int i = 0; i < op_num_outputs[op]; i++) { int my_off = op_output_idx[i + ooff]; - assert(op_output_source[i + ooff] == SOURCE_OUTPUT); - my_batch_outputs.push_back(batch_outputs[my_off]); + if (op_output_source[i + ooff] == SOURCE_OUTPUT) { + my_batch_outputs.push_back(batch_outputs[my_off]); + } else if (op_output_source[i + ooff] == SOURCE_INPUT) { + my_batch_outputs.push_back(batch_inputs[my_off]); + } else { + assert(false); + } } ioff += op_num_inputs[op]; ooff += op_num_outputs[op]; From 63f1fcedde381283349a201e6800f3cb6836bfc7 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 7 Feb 2024 21:38:17 +0000 Subject: [PATCH 147/198] disable automatic inplace rms norm for now --- include/flexflow/flexflow_c.h | 1 + include/flexflow/model.h | 1 + .../ops/kernels/residual_rms_norm_kernels.h | 1 + include/flexflow/ops/residual_rms_norm.h | 2 + .../flexflow/ops/residual_rms_norm_params.h | 1 + inference/models/llama.cc | 3 + python/flexflow/core/flexflow_cffi.py | 7 +- src/c/flexflow_c.cc | 11 +- src/ops/fused.cc | 4 - src/ops/kernels/residual_rms_norm_kernels.cu | 1 + src/ops/residual_rms_norm.cc | 205 +++++++++++++----- 11 files changed, 179 insertions(+), 58 deletions(-) diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 01a2818a2b..6ce5876fa1 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -565,6 +565,7 @@ flexflow_tensor_t * const flexflow_tensor_t input2_, float eps, int dim, + bool inplace_residual, char const *name); flexflow_tensor_t flexflow_model_add_arg_top_k(flexflow_model_t handle_, diff --git a/include/flexflow/model.h b/include/flexflow/model.h index b3a6a85808..ecad8034bc 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -617,6 +617,7 @@ class FFModel { Tensor *outputs, float eps, int dim, + bool inplace_residual = false, DataType data_type = DT_NONE, char const *name = NULL); // Add a beam search top k layer diff --git a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h index 691f8ef8c1..6eb5c0ae21 100644 --- a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h @@ -32,6 +32,7 @@ class ResidualRMSNormMeta : public OpMeta { void *rms_ptr; void *norm_ptr; + bool inplace_residual; int in_dim; int batch_size; int num_elements; diff --git a/include/flexflow/ops/residual_rms_norm.h b/include/flexflow/ops/residual_rms_norm.h index 2acc06841c..bf75cd573a 100644 --- a/include/flexflow/ops/residual_rms_norm.h +++ b/include/flexflow/ops/residual_rms_norm.h @@ -20,6 +20,7 @@ class ResidualRMSNorm : public Op { const ParallelTensor _input2, float _eps, int dim, + bool inplace_residual, bool allocate_weights, char const *name); ResidualRMSNorm(FFModel &model, @@ -96,6 +97,7 @@ class ResidualRMSNorm : public Op { float eps; int effective_batch_size; int dim, data_dim; + bool inplace_residual; }; } // namespace FlexFlow #endif // _FLEXFLOW_RESIDUAL_RMS_NORM_H diff --git a/include/flexflow/ops/residual_rms_norm_params.h b/include/flexflow/ops/residual_rms_norm_params.h index a4e4de59ab..8b8f666dc1 100644 --- a/include/flexflow/ops/residual_rms_norm_params.h +++ b/include/flexflow/ops/residual_rms_norm_params.h @@ -11,6 +11,7 @@ struct ResidualRMSNormParams { LayerID layer_guid; float eps; int dim; + bool inplace_residual; char name[MAX_OPNAME]; bool is_valid( std::pair const &input) const; diff --git 
a/inference/models/llama.cc b/inference/models/llama.cc index 9950d5b080..f4afb32e24 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -80,6 +80,7 @@ void LLAMA::create_llama_model(FFModel &ff, token_att_norm, llama_config.rms_norm_eps, llama_config.hidden_size, + false, // inplace_residual DT_NONE, std::string("layers_" + std::to_string(i) + "_attention_norm") .c_str()); @@ -171,6 +172,7 @@ void LLAMA::create_llama_model(FFModel &ff, token_ff_norm, llama_config.rms_norm_eps, llama_config.hidden_size, + false, // inplace_residual DT_NONE, std::string("layers_" + std::to_string(i) + "_ffn_norm").c_str()); token = token_ff_norm[0]; @@ -234,6 +236,7 @@ void LLAMA::create_llama_model(FFModel &ff, final_rms_norm_output, llama_config.rms_norm_eps, llama_config.hidden_size, + false, // inplace_residual DT_NONE, "norm"); diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index de3f7e6929..f39e8f1e7e 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -3320,7 +3320,7 @@ def rms_norm(self, input, eps, dim, name=None): self.add_layer(OpType.RMS_NORM, name) return Tensor(handle, owner_op_type=OpType.RMS_NORM) - def residual_rms_norm(self, input1, input2, eps, dim, name=None): + def residual_rms_norm(self, input1, input2, eps, dim, inplace_residual=False, name=None): """Defines the Residual RMS Norm layer. :param input: the input 1 Tensor. @@ -3338,11 +3338,14 @@ def residual_rms_norm(self, input1, input2, eps, dim, name=None): :param name: the name of the layer. Default is None. :type name: string + :param inplace_residual: whether to compute the residual inplace using the input tensor. Default is False. + :type inplace_residual: bool + :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) handles_array = ffc().flexflow_model_add_residual_rms_norm( - self.handle, input1.handle, input2.handle, eps, dim, c_name + self.handle, input1.handle, input2.handle, eps, dim, inplace_residual, c_name ) self.add_layer(OpType.RESIDUAL_RMS_NORM, name) return Tensor(handles_array[0], owner_op_type=OpType.RESIDUAL_RMS_NORM), Tensor( diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 8f5d197eb3..a7d081bd1a 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1469,13 +1469,20 @@ flexflow_tensor_t * const flexflow_tensor_t input2_, float eps, int dim, + bool inplace_residual, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input1 = FFCObjectWrapper::unwrap(input1_); Tensor input2 = FFCObjectWrapper::unwrap(input2_); Tensor tensor_outputs[2]; - handle->residual_rms_norm( - input1, input2, tensor_outputs, eps, dim, input1->data_type, name); + handle->residual_rms_norm(input1, + input2, + tensor_outputs, + eps, + dim, + inplace_residual, + input1->data_type, + name); assert(tensor_outputs[0] != nullptr); assert(tensor_outputs[1] != nullptr); flexflow_tensor_t *tensor_outputs_wrapped = diff --git a/src/ops/fused.cc b/src/ops/fused.cc index 7d0d829e51..bdb6d4d7a2 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -608,10 +608,6 @@ FutureMap FusedOp::inference(FFModel const &ff, set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; size_t machine_view_hash = view->hash(); - // bc is one of BatchConfig, TreeVerifyBatchConfig, and BeamSearchBatchConfig - // so we transfer the maximum of them - // size_t batch_config_size = - // std::max(sizeof(TreeVerifyBatchConfig), sizeof(BeamSearchBatchConfig)); IndexLauncher launcher(FUSEDOP_INF_TASK_ID, parallel_is, TaskArgument(nullptr, 0), diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index 664c1ed13b..969c6458a4 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -31,6 +31,7 @@ ResidualRMSNormMeta::ResidualRMSNormMeta(FFHandler handler, : OpMeta(handler, rms) { eps = rms->eps; + inplace_residual = rms->inplace_residual; in_dim = rms->data_dim; batch_size = rms->effective_batch_size; num_elements = in_dim * batch_size; diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index 264c12f004..cb511ef547 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -42,7 +42,8 @@ using namespace FlexFlow::Kernels::ResidualRMSNorm; bool operator==(ResidualRMSNormParams const &lhs, ResidualRMSNormParams const &rhs) { - return lhs.layer_guid == rhs.layer_guid && lhs.eps == rhs.eps; + return lhs.layer_guid == rhs.layer_guid && lhs.eps == rhs.eps && + lhs.dim == rhs.dim && lhs.inplace_residual == rhs.inplace_residual; } bool ResidualRMSNormParams::is_valid( @@ -55,6 +56,7 @@ ResidualRMSNormParams ResidualRMSNorm::get_params() const { params.layer_guid = this->layer_guid; params.eps = this->eps; params.dim = this->dim; + params.inplace_residual = this->inplace_residual; if (this->name != nullptr) { strcpy(params.name, this->name); } @@ -66,6 +68,7 @@ void FFModel::residual_rms_norm(const Tensor input1, Tensor *outputs, float eps, int dim, + bool inplace_residual, DataType data_type, char const *name) { if (data_type == DT_NONE) { @@ -106,6 +109,7 @@ void FFModel::residual_rms_norm(const Tensor input1, rm->add_float_property("eps", eps); rm->add_int_property("dim", dim); + rm->add_int_property("inplace_residual", inplace_residual); layers.push_back(rm); outputs[0] = rm->outputs[0]; outputs[1] = rm->outputs[1]; @@ -120,6 +124,8 @@ Op *ResidualRMSNorm::create_operator_from_layer( long long value; layer->get_int_property("dim", value); int dim = value; + layer->get_int_property("inplace_residual", value); + bool inplace_residual = (bool)value; return new ResidualRMSNorm(model, layer->layer_guid, @@ -127,6 +133,7 @@ Op *ResidualRMSNorm::create_operator_from_layer( inputs[1], eps, dim, + inplace_residual, false, layer->name); } @@ -143,6 +150,7 @@ ResidualRMSNorm::ResidualRMSNorm( inputs.second, params.eps, params.dim, + params.inplace_residual, allocate_weights, params.name) {} @@ -157,6 +165,7 @@ ResidualRMSNorm::ResidualRMSNorm( inputs.second, other.eps, other.dim, + other.inplace_residual, allocate_weights, other.name) {} ResidualRMSNorm::ResidualRMSNorm(FFModel &model, @@ -165,6 +174,7 @@ ResidualRMSNorm::ResidualRMSNorm(FFModel &model, const ParallelTensor _input2, float _eps, int dim, + bool _inplace_residual, bool allocate_weights, char const *name) : Op(model, @@ -177,6 +187,7 @@ ResidualRMSNorm::ResidualRMSNorm(FFModel &model, _input1, _input2) { eps = _eps; + inplace_residual = _inplace_residual; inputs[0] = _input1; inputs[1] = _input2; layer_guid = _layer_guid; @@ -237,13 +248,17 @@ ResidualRMSNorm::ResidualRMSNorm(FFModel &model, void ResidualRMSNorm::map_output_tensors(FFModel &ff) { 
assert(numOutputs == 2); assert(outputs[0]->get_volume() == inputs[0]->get_volume()); - outputs[0]->parallel_is = inputs[0]->parallel_is; - outputs[0]->region = inputs[0]->region; - outputs[0]->part = inputs[0]->part; - outputs[0]->region_grad = inputs[0]->region_grad; - outputs[0]->part_grad = inputs[0]->part_grad; - // map output 1 to new region - ff.map_tensor(outputs[1], this); + if (inplace_residual) { + outputs[0]->parallel_is = inputs[0]->parallel_is; + outputs[0]->region = inputs[0]->region; + outputs[0]->part = inputs[0]->part; + outputs[0]->region_grad = inputs[0]->region_grad; + outputs[0]->part_grad = inputs[0]->part_grad; + // map output 1 to new region + ff.map_tensor(outputs[1], this); + } else { + Op::map_output_tensors(ff); + } } void ResidualRMSNorm::init(FFModel const &ff) { @@ -261,32 +276,44 @@ void ResidualRMSNorm::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); - assert(outputs[0]->part == inputs[0]->part); - assert(outputs[0]->region == inputs[0]->region); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); + if (inplace_residual) { + assert(outputs[0]->part == inputs[0]->part); + assert(outputs[0]->region == inputs[0]->region); + } + int fid = 0; + launcher.add_region_requirement( + RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? READ_WRITE : READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(fid++, FID_DATA); launcher.add_region_requirement(RegionRequirement(inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, inputs[1]->region)); - launcher.add_field(1, FID_DATA); + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + } launcher.add_region_requirement(RegionRequirement(outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, outputs[1]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(fid++, FID_DATA); launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(fid++, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap(ff, fm); @@ -314,32 +341,45 @@ void ResidualRMSNorm::init_inference( false /*must*/, 0 /*mapper_id*/, machine_view_hash); - assert(batch_outputs[0]->part == batch_inputs[0]->part); - assert(batch_outputs[0]->region == batch_inputs[0]->region); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); + if (inplace_residual) { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } + int fid = 0; + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? 
READ_WRITE : READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(fid++, FID_DATA); launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, batch_inputs[1]->region)); - launcher.add_field(1, FID_DATA); + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + } launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[1]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(fid++, FID_DATA); launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(fid++, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); @@ -390,32 +430,45 @@ FutureMap 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); - assert(batch_outputs[0]->part == batch_inputs[0]->part); - assert(batch_outputs[0]->region == batch_inputs[0]->region); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); + if (inplace_residual) { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } + int fid = 0; + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? 
READ_WRITE : READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(fid++, FID_DATA); launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, batch_inputs[1]->region)); - launcher.add_field(1, FID_DATA); + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + } launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[1]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(fid++, FID_DATA); launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(fid++, FID_DATA); return runtime->execute_index_space(ctx, launcher); } @@ -440,20 +493,68 @@ void ResidualRMSNorm::inference_task(Task const *task, m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorR input2 = helperGetGenericTensorAccessorRO( m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); - // residual_output is mapped to the same region as the input - GenericTensorAccessorW residual_output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[1], regions[2], task->regions[2], FID_DATA, ctx, runtime); - GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); + + GenericTensorAccessorW residual_output, output; + GenericTensorAccessorR weight; + if (m->inplace_residual) { + // residual_output is mapped to the same region as the input + residual_output = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + output = helperGetGenericTensorAccessorWO(m->output_type[1], + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); + weight = helperGetGenericTensorAccessorRO(m->weight_type[0], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); + } else { + residual_output = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); + output = helperGetGenericTensorAccessorWO(m->output_type[1], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); + weight = helperGetGenericTensorAccessorRO(m->weight_type[0], + regions[4], + task->regions[4], + FID_DATA, + ctx, + runtime); + } + inference_kernel_wrapper( m, bc, input1, input2, weight, residual_output, output); + if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; - ResidualRMSNorm::save_inference_tensors_to_file( - m, shard_id, bc, {input2}, {weight}, {residual_output, output}); + if (m->inplace_residual) { + ResidualRMSNorm::save_inference_tensors_to_file( + m, shard_id, bc, {input2}, {weight}, {residual_output, output}); + } else { + ResidualRMSNorm::save_inference_tensors_to_file( + m, + shard_id, + bc, + {input1, input2}, + {weight}, + {residual_output, output}); + } } } @@ -463,6 +564,7 @@ void ResidualRMSNorm::serialize(Legion::Serializer &sez) const { 
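Because inplace_residual changes which regions the launchers above attach (the residual output either aliases input 0 or gets its own WRITE_ONLY region), the code switches from fixed field ids to a running fid counter; the AddBiasResidualLayerNorm changes later in this series follow the same pattern. A small illustrative Python sketch of the resulting layout for ResidualRMSNorm, with the list index playing the role of fid (names and privileges simplified):

    def residual_rms_norm_regions(inplace_residual: bool):
        regions = [("input0", "READ_WRITE" if inplace_residual else "READ_ONLY"),
                   ("input1", "READ_ONLY")]
        if not inplace_residual:
            regions.append(("residual_output", "WRITE_ONLY"))  # separate region
        regions.append(("norm_output", "WRITE_ONLY"))
        regions.append(("weight", "READ_ONLY"))
        return regions

    for flag in (True, False):
        print(flag, len(residual_rms_norm_regions(flag)), residual_rms_norm_regions(flag))
    # True:  4 regions, residual output written in place over input0
    # False: 5 regions, residual output gets its own region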
sez.serialize(this->layer_guid.model_id); sez.serialize(this->eps); sez.serialize(this->dim); + sez.serialize(this->inplace_residual); sez.serialize(strlen(this->name)); sez.serialize(this->name, strlen(this->name)); } @@ -483,6 +585,8 @@ Node ResidualRMSNorm::deserialize(FFModel &ff, LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); dez.deserialize(eps); dez.deserialize(dim); + int inplace_residual; + dez.deserialize(inplace_residual); size_t name_len; char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); @@ -491,6 +595,7 @@ Node ResidualRMSNorm::deserialize(FFModel &ff, params.layer_guid = layer_guid; params.eps = eps; params.dim = dim; + params.inplace_residual = inplace_residual; strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); } From 0d3aa7ecefea1b2aa2fb6e43b9a7ccf43c3811b4 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 8 Feb 2024 05:18:01 +0000 Subject: [PATCH 148/198] fix inf fusion inplace --- include/flexflow/flexflow_c.h | 2 + include/flexflow/model.h | 2 + .../ops/add_bias_residual_layer_norm.h | 3 + .../ops/add_bias_residual_layer_norm_params.h | 1 + include/flexflow/ops/residual_layer_norm.h | 3 + .../flexflow/ops/residual_layer_norm_params.h | 1 + inference/models/falcon.cc | 2 + inference/models/mpt.cc | 3 + inference/models/opt.cc | 3 + inference/models/starcoder.cc | 3 + python/flexflow/core/flexflow_cffi.py | 8 + src/c/flexflow_c.cc | 10 +- src/ops/add_bias_residual_layer_norm.cc | 238 ++++++++++++------ src/ops/residual_layer_norm.cc | 138 +++++++--- src/ops/residual_layer_norm.cu | 1 + src/ops/residual_rms_norm.cc | 1 + src/runtime/substitution.cc | 1 + 17 files changed, 315 insertions(+), 105 deletions(-) diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 6ce5876fa1..cd98c7f604 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -270,6 +270,7 @@ flexflow_tensor_t * bool elementwise_affine, float eps, bool use_bias, + bool inplace_residual, char const *name); flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( @@ -281,6 +282,7 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( bool elementwise_affine, float eps, bool use_bias, + bool inplace_residual, char const *name); flexflow_tensor_t diff --git a/include/flexflow/model.h b/include/flexflow/model.h index ecad8034bc..33dcb079b2 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -579,6 +579,7 @@ class FFModel { bool elementwise_affine, float eps, bool use_bias = true, + bool inplace_residual = false, DataType data_type = DT_NONE, char const *name = NULL); // Add a add_bias_residual_layer_norm layer @@ -589,6 +590,7 @@ class FFModel { bool elementwise_affine, float eps, bool use_bias = true, + bool inplace_residual = false, DataType data_type = DT_NONE, char const *name = NULL); // Add a sigmoid_silu_multi layer diff --git a/include/flexflow/ops/add_bias_residual_layer_norm.h b/include/flexflow/ops/add_bias_residual_layer_norm.h index 550d56c47c..08b7404e14 100644 --- a/include/flexflow/ops/add_bias_residual_layer_norm.h +++ b/include/flexflow/ops/add_bias_residual_layer_norm.h @@ -24,6 +24,7 @@ class AddBiasResidualLayerNorm : public Op { bool _elementwise_affine, bool _use_bias, float _eps, + bool _inplace_residual, bool allocate_weights, char const *name); void map_output_tensors(FFModel &ff) override; @@ -138,6 +139,7 @@ class AddBiasResidualLayerNorm : public Op { bool elementwise_affine, use_bias; int64_t effective_batch_size, 
effective_num_elements; float eps; + bool inplace_residual; std::vector axes; }; @@ -152,6 +154,7 @@ class AddBiasResidualLayerNormMeta : public OpMeta { bool elementwise_affine, use_bias; int64_t effective_batch_size, effective_num_elements; float eps; + bool inplace_residual; void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; Realm::RegionInstance reserveInst; // PEFT related fields diff --git a/include/flexflow/ops/add_bias_residual_layer_norm_params.h b/include/flexflow/ops/add_bias_residual_layer_norm_params.h index 87fe2fb562..840f521b01 100644 --- a/include/flexflow/ops/add_bias_residual_layer_norm_params.h +++ b/include/flexflow/ops/add_bias_residual_layer_norm_params.h @@ -12,6 +12,7 @@ struct AddBiasResidualLayerNormParams { bool elementwise_affine; float eps; bool use_bias; + bool inplace_residual; char name[MAX_OPNAME]; bool is_valid( std::pair const &) const; diff --git a/include/flexflow/ops/residual_layer_norm.h b/include/flexflow/ops/residual_layer_norm.h index d924132452..a028097905 100644 --- a/include/flexflow/ops/residual_layer_norm.h +++ b/include/flexflow/ops/residual_layer_norm.h @@ -26,6 +26,7 @@ class ResidualLayerNorm : public Op { bool _elementwise_affine, bool _use_bias, float _eps, + bool inplace_residual, bool allocate_weights, char const *name); void map_output_tensors(FFModel &ff) override; @@ -124,6 +125,7 @@ class ResidualLayerNorm : public Op { bool elementwise_affine, use_bias, use_two_residuals; int64_t effective_batch_size, effective_num_elements; float eps; + bool inplace_residual; std::vector axes; }; @@ -138,6 +140,7 @@ class ResidualLayerNormMeta : public OpMeta { bool elementwise_affine, use_bias, use_two_residuals; int64_t effective_batch_size, effective_num_elements; float eps; + bool inplace_residual; void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; Realm::RegionInstance reserveInst; // PEFT related fields diff --git a/include/flexflow/ops/residual_layer_norm_params.h b/include/flexflow/ops/residual_layer_norm_params.h index 949ae0c799..166d4b2b4e 100644 --- a/include/flexflow/ops/residual_layer_norm_params.h +++ b/include/flexflow/ops/residual_layer_norm_params.h @@ -13,6 +13,7 @@ struct ResidualLayerNormParams { float eps; bool use_bias; bool use_two_residuals; + bool inplace_residual; char name[MAX_OPNAME]; bool is_valid(std::tupledata_type, name); assert(tensor_outputs[0] != nullptr); @@ -679,7 +681,7 @@ flexflow_tensor_t * DEBUG_PRINT("[ResidualLayerNorm] input %p, residual1 %p, residual2 " "%p, output0: %p, " "output1: %p, use_two_residuals: %d, elementwise_affine %d, eps " - "%f, use_bias: %d, name %s", + "%f, use_bias: %d, inplace_residual: %d, name %s", input, residual1, residual2, @@ -689,6 +691,7 @@ flexflow_tensor_t * elementwise_affine, eps, use_bias, + inplace_residual, name); flexflow_tensor_t *tensor_outputs_wrapped = (flexflow_tensor_t *)calloc(2, sizeof(flexflow_tensor_t)); @@ -706,6 +709,7 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( bool elementwise_affine, float eps, bool use_bias, + bool inplace_residual, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); const Tensor input = FFCObjectWrapper::unwrap(input_); @@ -722,13 +726,14 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( elementwise_affine, eps, use_bias, + inplace_residual, input->data_type, name); assert(tensor_outputs[0] != nullptr); assert(tensor_outputs[1] != nullptr); DEBUG_PRINT("[AddBiasResidualLayerNorm] input %p, residual %p, output0: %p, " "output1: %p, 
elementwise_affine %d, eps " - "%f, use_bias %d, name %s", + "%f, use_bias %d, inplace_residual: %d, name %s", input, residual, tensor_outputs[0], @@ -736,6 +741,7 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( elementwise_affine, eps, use_bias, + inplace_residual, name); flexflow_tensor_t *tensor_outputs_wrapped = (flexflow_tensor_t *)calloc(2, sizeof(flexflow_tensor_t)); diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index 6b71279971..bdf30a803a 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -43,7 +43,8 @@ bool operator==(AddBiasResidualLayerNormParams const &lhs, AddBiasResidualLayerNormParams const &rhs) { return lhs.layer_guid == rhs.layer_guid && lhs.axes == rhs.axes && lhs.elementwise_affine == rhs.elementwise_affine && - lhs.use_bias == rhs.use_bias; + lhs.use_bias == rhs.use_bias && + lhs.inplace_residual == rhs.inplace_residual; } bool AddBiasResidualLayerNormParams::is_valid( @@ -58,6 +59,7 @@ AddBiasResidualLayerNormParams AddBiasResidualLayerNorm::get_params() const { params.elementwise_affine = this->elementwise_affine; params.eps = this->eps; params.use_bias = this->use_bias; + params.inplace_residual = this->inplace_residual; if (this->name != nullptr) { strcpy(params.name, this->name); } @@ -71,6 +73,7 @@ void FFModel::add_bias_residual_layer_norm(const Tensor input, bool elementwise_affine, float eps, bool use_bias, + bool inplace_residual, DataType data_type, char const *name) { // In PyTorch, axes must be the sizes of the last axes.size() dimensions of @@ -171,6 +174,7 @@ void FFModel::add_bias_residual_layer_norm(const Tensor input, ln->add_int_property("use_bias", use_bias); ln->add_int_vector_property("axes", axes); ln->add_float_property("eps", eps); + ln->add_int_property("inplace_residual", inplace_residual); layers.push_back(ln); outputs[0] = ln->outputs[0]; outputs[1] = ln->outputs[1]; @@ -189,6 +193,8 @@ Op *AddBiasResidualLayerNorm::create_operator_from_layer( layer->get_int_vector_property("axes", axes); float eps; layer->get_float_property("eps", eps); + layer->get_int_property("inplace_residual", value); + bool inplace_residual = (bool)value; return new AddBiasResidualLayerNorm(model, layer->layer_guid, inputs[0], @@ -197,6 +203,7 @@ Op *AddBiasResidualLayerNorm::create_operator_from_layer( elementwise_affine, use_bias, eps, + inplace_residual, false, // allocate_weights layer->name); } @@ -215,6 +222,7 @@ AddBiasResidualLayerNorm::AddBiasResidualLayerNorm( params.elementwise_affine, params.use_bias, params.eps, + params.inplace_residual, allocate_weights, params.name) {} @@ -227,6 +235,7 @@ AddBiasResidualLayerNorm::AddBiasResidualLayerNorm( bool _elementwise_affine, bool _use_bias, float _eps, + bool _inplace_residual, bool allocate_weights, char const *name) : Op(model, @@ -239,7 +248,7 @@ AddBiasResidualLayerNorm::AddBiasResidualLayerNorm( _input, _residual), elementwise_affine(_elementwise_affine), eps(_eps), axes(_axes), - use_bias(_use_bias) { + use_bias(_use_bias), inplace_residual(_inplace_residual) { // overwrite layer_guid layer_guid = _layer_guid; outputs[0] = model.create_parallel_tensor_legion_ordering( @@ -348,44 +357,57 @@ void AddBiasResidualLayerNorm::init_inference( false /*must*/, 0 /*mapper_id*/, machine_view_hash); - assert(batch_outputs[0]->part == batch_inputs[0]->part); - assert(batch_outputs[0]->region == batch_inputs[0]->region); + if (inplace_residual) { + assert(batch_outputs[0]->part == 
batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } // attn output // added: attn_output + attn final bias + residual - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); + int fid = 0; + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? READ_WRITE : READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(fid++, FID_DATA); // residual launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, batch_inputs[1]->region)); - launcher.add_field(1, FID_DATA); + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + } // layer norm output launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[1]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(fid++, FID_DATA); // attn final bias launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(fid++, FID_DATA); if (elementwise_affine) { launcher.add_region_requirement(RegionRequirement(weights[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[1]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(fid++, FID_DATA); if (use_bias) { launcher.add_region_requirement(RegionRequirement(weights[2]->part, @@ -393,7 +415,7 @@ void AddBiasResidualLayerNorm::init_inference( READ_ONLY, EXCLUSIVE, weights[2]->region)); - launcher.add_field(5, FID_DATA); + launcher.add_field(fid++, FID_DATA); } } FutureMap fm = runtime->execute_index_space(ctx, launcher); @@ -416,44 +438,56 @@ void AddBiasResidualLayerNorm::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); - assert(outputs[0]->part == inputs[0]->part); - assert(outputs[0]->region == inputs[0]->region); + if (inplace_residual) { + assert(outputs[0]->part == inputs[0]->part); + assert(outputs[0]->region == inputs[0]->region); + } // input: attn output // added: attn_output + attn final bias + residual - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); + int fid = 0; + launcher.add_region_requirement( + RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? 
READ_WRITE : READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(fid++, FID_DATA); // residual launcher.add_region_requirement(RegionRequirement(inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, inputs[1]->region)); - launcher.add_field(1, FID_DATA); + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + } // layer norm output launcher.add_region_requirement(RegionRequirement(outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, outputs[1]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(fid++, FID_DATA); // attn final bias launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(fid++, FID_DATA); if (elementwise_affine) { launcher.add_region_requirement(RegionRequirement(weights[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[1]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(fid++, FID_DATA); if (use_bias) { launcher.add_region_requirement(RegionRequirement(weights[2]->part, @@ -461,7 +495,7 @@ void AddBiasResidualLayerNorm::init(FFModel const &ff) { READ_ONLY, EXCLUSIVE, weights[2]->region)); - launcher.add_field(5, FID_DATA); + launcher.add_field(fid++, FID_DATA); } } FutureMap fm = runtime->execute_index_space(ctx, launcher); @@ -535,37 +569,50 @@ FutureMap AddBiasResidualLayerNorm::inference( 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); - assert(batch_outputs[0]->part == batch_inputs[0]->part); - assert(batch_outputs[0]->region == batch_inputs[0]->region); + if (inplace_residual) { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } + int fid = 0; // input // added_output: input + attn bias + residual - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? 
READ_WRITE : READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(fid++, FID_DATA); // attn bias launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(1, FID_DATA); + launcher.add_field(fid++, FID_DATA); // residual launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, batch_inputs[1]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + } // output launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[1]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(fid++, FID_DATA); if (elementwise_affine) { // gamma launcher.add_region_requirement(RegionRequirement(weights[1]->part, @@ -573,7 +620,7 @@ FutureMap AddBiasResidualLayerNorm::inference( READ_ONLY, EXCLUSIVE, weights[1]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(fid++, FID_DATA); if (use_bias) { // beta launcher.add_region_requirement(RegionRequirement(weights[2]->part, @@ -581,7 +628,7 @@ FutureMap AddBiasResidualLayerNorm::inference( READ_ONLY, EXCLUSIVE, weights[2]->region)); - launcher.add_field(5, FID_DATA); + launcher.add_field(fid++, FID_DATA); } } return runtime->execute_index_space(ctx, launcher); @@ -590,13 +637,17 @@ FutureMap AddBiasResidualLayerNorm::inference( void AddBiasResidualLayerNorm::map_output_tensors(FFModel &ff) { assert(numOutputs == 2); assert(outputs[0]->get_volume() == inputs[0]->get_volume()); - outputs[0]->parallel_is = inputs[0]->parallel_is; - outputs[0]->region = inputs[0]->region; - outputs[0]->part = inputs[0]->part; - outputs[0]->region_grad = inputs[0]->region_grad; - outputs[0]->part_grad = inputs[0]->part_grad; - // map output 1 to new region - ff.map_tensor(outputs[1], this); + if (inplace_residual) { + outputs[0]->parallel_is = inputs[0]->parallel_is; + outputs[0]->region = inputs[0]->region; + outputs[0]->part = inputs[0]->part; + outputs[0]->region_grad = inputs[0]->region_grad; + outputs[0]->part_grad = inputs[0]->part_grad; + // map output 1 to new region + ff.map_tensor(outputs[1], this); + } else { + Op::map_output_tensors(ff); + } } /* @@ -625,29 +676,69 @@ void AddBiasResidualLayerNorm::inference_task( assert(regions.size() == 4 + (m->elementwise_affine ? (m->use_bias ? 
2 : 1) : 0)); - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR attn_bias = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorR residual = helperGetGenericTensorAccessorRO( - m->input_type[1], regions[2], task->regions[2], FID_DATA, ctx, runtime); - GenericTensorAccessorW added_output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[1], regions[3], task->regions[3], FID_DATA, ctx, runtime); - + int rid = 0, tid = 0, did = 0; + GenericTensorAccessorR input = + helperGetGenericTensorAccessorRO(m->input_type[0], + regions[rid++], + task->regions[tid++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR attn_bias = + helperGetGenericTensorAccessorRO(m->weight_type[0], + regions[rid++], + task->regions[tid++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR residual = + helperGetGenericTensorAccessorRO(m->input_type[1], + regions[rid++], + task->regions[tid++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW added_output; + if (m->inplace_residual) { + added_output = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + } else { + added_output = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[rid++], + task->regions[tid++], + FID_DATA, + ctx, + runtime); + } + GenericTensorAccessorW output = + helperGetGenericTensorAccessorWO(m->output_type[1], + regions[rid++], + task->regions[tid++], + FID_DATA, + ctx, + runtime); GenericTensorAccessorR gamma, beta; Domain in_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); + ctx, task->regions[did++].region.get_index_space()); Domain attn_bias_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); + ctx, task->regions[did++].region.get_index_space()); Domain residual_domain = runtime->get_index_space_domain( - ctx, task->regions[2].region.get_index_space()); - Domain added_out_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); + ctx, task->regions[did++].region.get_index_space()); + Domain added_out_domain; + if (m->inplace_residual) { + added_out_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + } else { + added_out_domain = runtime->get_index_space_domain( + ctx, task->regions[did++].region.get_index_space()); + } Domain out_domain = runtime->get_index_space_domain( - ctx, task->regions[3].region.get_index_space()); + ctx, task->regions[did++].region.get_index_space()); Domain gamma_domain, beta_domain; @@ -672,23 +763,23 @@ void AddBiasResidualLayerNorm::inference_task( if (m->elementwise_affine) { gamma = helperGetGenericTensorAccessorRO(m->weight_type[1], - regions[4], - task->regions[4], + regions[rid++], + task->regions[tid++], FID_DATA, ctx, runtime); gamma_domain = runtime->get_index_space_domain( - ctx, task->regions[4].region.get_index_space()); + ctx, task->regions[did++].region.get_index_space()); if (m->use_bias) { beta = helperGetGenericTensorAccessorRO(m->weight_type[2], - regions[5], - task->regions[5], + regions[rid++], + task->regions[tid++], FID_DATA, ctx, runtime); beta_domain = runtime->get_index_space_domain( - ctx, 
task->regions[5].region.get_index_space()); + ctx, task->regions[did++].region.get_index_space()); assert(gamma_domain == beta_domain); } @@ -1031,6 +1122,7 @@ void AddBiasResidualLayerNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->elementwise_affine); sez.serialize(this->eps); sez.serialize(this->use_bias); + sez.serialize(this->inplace_residual); sez.serialize(strlen(this->name)); sez.serialize(this->name, strlen(this->name)); } @@ -1047,6 +1139,7 @@ Node AddBiasResidualLayerNorm::deserialize(FFModel &ff, bool elementwise_affine; bool use_bias; float eps; + bool inplace_residual; size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); dez.deserialize(transformer_layer_id); @@ -1061,6 +1154,7 @@ Node AddBiasResidualLayerNorm::deserialize(FFModel &ff, dez.deserialize(elementwise_affine); dez.deserialize(eps); dez.deserialize(use_bias); + dez.deserialize(inplace_residual); size_t name_len; char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); @@ -1072,6 +1166,7 @@ Node AddBiasResidualLayerNorm::deserialize(FFModel &ff, params.elementwise_affine = elementwise_affine; params.eps = eps; params.use_bias = use_bias; + params.inplace_residual = inplace_residual; strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); @@ -1092,6 +1187,7 @@ size_t hash::operator()( } hash_combine(key, params.elementwise_affine); hash_combine(key, params.use_bias); + hash_combine(key, params.inplace_residual); return key; } }; // namespace std diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index dc302ce19c..9eea01cd81 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -44,7 +44,8 @@ bool operator==(ResidualLayerNormParams const &lhs, return lhs.layer_guid == rhs.layer_guid && lhs.axes == rhs.axes && lhs.elementwise_affine == rhs.elementwise_affine && lhs.use_bias == rhs.use_bias && - lhs.use_two_residuals == rhs.use_two_residuals; + lhs.use_two_residuals == rhs.use_two_residuals && + lhs.inplace_residual == rhs.inplace_residual; } bool ResidualLayerNormParams::is_valid( @@ -63,6 +64,7 @@ ResidualLayerNormParams ResidualLayerNorm::get_params() const { params.eps = this->eps; params.use_bias = this->use_bias; params.use_two_residuals = this->use_two_residuals; + params.inplace_residual = this->inplace_residual; if (this->name != nullptr) { strcpy(params.name, this->name); } @@ -78,6 +80,7 @@ void FFModel::residual_layer_norm(const Tensor input, bool elementwise_affine, float eps, bool use_bias, + bool inplace_residual, DataType data_type, char const *name) { // In PyTorch, axes must be the sizes of the last axes.size() dimensions of @@ -178,6 +181,7 @@ void FFModel::residual_layer_norm(const Tensor input, ln->add_int_vector_property("axes", axes); ln->add_float_property("eps", eps); ln->add_int_property("use_two_residuals", use_two_residuals); + ln->add_int_property("inplace_residual", inplace_residual); layers.push_back(ln); outputs[0] = ln->outputs[0]; outputs[1] = ln->outputs[1]; @@ -198,6 +202,9 @@ Op *ResidualLayerNorm::create_operator_from_layer( layer->get_float_property("eps", eps); layer->get_int_property("use_two_residuals", value); bool use_two_residuals = (bool)value; + layer->get_int_property("inplace_residual", value); + bool inplace_residual = (bool)value; + return new ResidualLayerNorm(model, layer->layer_guid, inputs[0], @@ -208,6 +215,7 @@ Op *ResidualLayerNorm::create_operator_from_layer( elementwise_affine, use_bias, eps, + inplace_residual, false, // 
allocate_weights layer->name); } @@ -229,6 +237,7 @@ ResidualLayerNorm::ResidualLayerNorm( params.elementwise_affine, params.use_bias, params.eps, + params.inplace_residual, allocate_weights, params.name) {} @@ -242,6 +251,7 @@ ResidualLayerNorm::ResidualLayerNorm(FFModel &model, bool _elementwise_affine, bool _use_bias, float _eps, + bool _inplace_residual, bool allocate_weights, char const *name) : Op(model, @@ -255,7 +265,8 @@ ResidualLayerNorm::ResidualLayerNorm(FFModel &model, _residual1, _use_two_residuals ? _residual2 : nullptr), elementwise_affine(_elementwise_affine), eps(_eps), axes(_axes), - use_bias(_use_bias), use_two_residuals(_use_two_residuals) { + use_bias(_use_bias), use_two_residuals(_use_two_residuals), + inplace_residual(_inplace_residual) { // overwrite layer_guid layer_guid = _layer_guid; outputs[0] = model.create_parallel_tensor_legion_ordering( @@ -328,13 +339,17 @@ ResidualLayerNorm::ResidualLayerNorm(FFModel &model, void ResidualLayerNorm::map_output_tensors(FFModel &ff) { assert(numOutputs == 2); assert(outputs[0]->get_volume() == inputs[0]->get_volume()); - outputs[0]->parallel_is = inputs[0]->parallel_is; - outputs[0]->region = inputs[0]->region; - outputs[0]->part = inputs[0]->part; - outputs[0]->region_grad = inputs[0]->region_grad; - outputs[0]->part_grad = inputs[0]->part_grad; - // map output 1 to new region - ff.map_tensor(outputs[1], this); + if (inplace_residual) { + outputs[0]->parallel_is = inputs[0]->parallel_is; + outputs[0]->region = inputs[0]->region; + outputs[0]->part = inputs[0]->part; + outputs[0]->region_grad = inputs[0]->region_grad; + outputs[0]->part_grad = inputs[0]->part_grad; + // map output 1 to new region + ff.map_tensor(outputs[1], this); + } else { + Op::map_output_tensors(ff); + } } void ResidualLayerNorm::init_inference( @@ -358,16 +373,19 @@ void ResidualLayerNorm::init_inference( false /*must*/, 0 /*mapper_id*/, machine_view_hash); - assert(batch_outputs[0]->part == batch_inputs[0]->part); - assert(batch_outputs[0]->region == batch_inputs[0]->region); + if (inplace_residual) { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } int field_id = 0; // input // added: input + residual(s) - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? 
READ_WRITE : READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); launcher.add_field(field_id++, FID_DATA); // residual1 launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, @@ -385,6 +403,15 @@ void ResidualLayerNorm::init_inference( batch_inputs[2]->region)); launcher.add_field(field_id++, FID_DATA); } + if (!inplace_residual) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(field_id++, FID_DATA); + } // layer norm output launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, @@ -434,11 +461,12 @@ void ResidualLayerNorm::init(FFModel const &ff) { int field_id = 0; // input // added: input + residual(s) - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - inputs[0]->region)); + launcher.add_region_requirement( + RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? READ_WRITE : READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); launcher.add_field(field_id++, FID_DATA); // residual1 launcher.add_region_requirement(RegionRequirement(inputs[1]->part, @@ -456,6 +484,14 @@ void ResidualLayerNorm::init(FFModel const &ff) { inputs[2]->region)); launcher.add_field(field_id++, FID_DATA); } + if (!inplace_residual) { + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(field_id++, FID_DATA); + } // layer norm output launcher.add_region_requirement(RegionRequirement(outputs[1]->part, 0 /*projection id*/, @@ -876,16 +912,19 @@ FutureMap ResidualLayerNorm::inference( 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); - assert(batch_outputs[0]->part == batch_inputs[0]->part); - assert(batch_outputs[0]->region == batch_inputs[0]->region); + if (inplace_residual) { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } int field_id = 0; // input // added: input + residual(s) - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region)); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? 
READ_WRITE : READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); launcher.add_field(field_id++, FID_DATA); // residual1 launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, @@ -903,6 +942,15 @@ FutureMap ResidualLayerNorm::inference( batch_inputs[2]->region)); launcher.add_field(field_id++, FID_DATA); } + if (!inplace_residual) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(field_id++, FID_DATA); + } // layer norm output launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, @@ -972,8 +1020,23 @@ void ResidualLayerNorm::inference_task( ctx, runtime); } - GenericTensorAccessorW added_output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW added_output; + if (m->inplace_residual) { + added_output = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + } else { + added_output = + helperGetGenericTensorAccessorWO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(m->output_type[1], regions[region_idx++], @@ -1011,8 +1074,14 @@ void ResidualLayerNorm::inference_task( assert(in_domain.get_volume() == residual2_domain.get_volume()); assert(residual2_domain == in_domain); } - Domain added_out_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); + Domain added_out_domain; + if (m->inplace_residual) { + added_out_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + } else { + added_out_domain = runtime->get_index_space_domain( + ctx, task->regions[task_region_idx++].region.get_index_space()); + } Domain out_domain = runtime->get_index_space_domain( ctx, task->regions[task_region_idx++].region.get_index_space()); Domain gamma_domain, beta_domain; @@ -1091,6 +1160,7 @@ void ResidualLayerNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->eps); sez.serialize(this->use_bias); sez.serialize(this->use_two_residuals); + sez.serialize(this->inplace_residual); sez.serialize(strlen(this->name)); sez.serialize(this->name, strlen(this->name)); } @@ -1106,6 +1176,7 @@ Node ResidualLayerNorm::deserialize(FFModel &ff, bool elementwise_affine; bool use_bias; bool use_two_residuals; + bool inplace_residual; float eps; size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); @@ -1122,6 +1193,7 @@ Node ResidualLayerNorm::deserialize(FFModel &ff, dez.deserialize(eps); dez.deserialize(use_bias); dez.deserialize(use_two_residuals); + dez.deserialize(inplace_residual); size_t name_len; char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); @@ -1139,6 +1211,7 @@ Node ResidualLayerNorm::deserialize(FFModel &ff, params.eps = eps; params.use_bias = use_bias; params.use_two_residuals = use_two_residuals; + params.inplace_residual = inplace_residual; strcpy(params.name, name); if (use_two_residuals) { return ff.get_or_create_node( @@ -1165,6 +1238,7 @@ size_t hash::operator()( hash_combine(key, params.elementwise_affine); hash_combine(key, params.use_bias); hash_combine(key, params.use_two_residuals); + hash_combine(key, params.inplace_residual); return key; } }; // namespace std diff --git a/src/ops/residual_layer_norm.cu 
b/src/ops/residual_layer_norm.cu index 0ba462cde5..5e736cd6e8 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -36,6 +36,7 @@ ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, profiling = ln->profiling; inference_debugging = ln->inference_debugging; eps = ln->eps; + inplace_residual = ln->inplace_residual; DataType data_type = ln->data_type; size_t totalSize = effective_batch_size * data_type_size(data_type) * 3; gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index cb511ef547..ff6729b925 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -840,6 +840,7 @@ size_t hash::operator()( hash_combine(key, params.eps); hash_combine(key, params.layer_guid.id); hash_combine(key, params.dim); + hash_combine(key, params.inplace_residual); return key; } }; // namespace std diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index e8b986582f..8c08c1cca0 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -3814,6 +3814,7 @@ bool FFModel::convert_graph_to_operators( abr_ln->elementwise_affine, abr_ln->use_bias, abr_ln->eps, + abr_ln->inplace_residual, true, NULL); break; From b658061c9a953d09a99ac24cf479c4966dfe1eef Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 9 Feb 2024 15:25:43 -0500 Subject: [PATCH 149/198] fix rest input grads for peft without inplace residuals --- src/runtime/inference_manager.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index c7f2b6d5a9..229d1785bf 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -613,6 +613,11 @@ void FFModel::compile_inference() { // We should not reset input grads since other operators have already // saved gradients into the region op->reset_input_grads[i] = false; + } else if (i == 0 && (op->op_type == OP_RESIDUAL_LAYERNORM || op->op_type == OP_RESIDUAL_RMS_NORM || op->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM)) { + if (reset_inputs.find(op->outputs[0]->region) != reset_inputs.end()) { + reset_inputs.insert(op->inputs[0]->region); + op->reset_input_grads[0] = false; + } } else { reset_inputs.insert(op->inputs[i]->region); } From 3255fe4c260d72271d1f00d2a391c48f511c75fd Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 9 Feb 2024 22:00:35 +0000 Subject: [PATCH 150/198] fix --- src/ops/residual_rms_norm.cc | 4 ++-- src/runtime/inference_manager.cc | 4 +++- src/runtime/request_manager.cc | 9 +++++++++ 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index ff6729b925..28fafcf224 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -482,13 +482,13 @@ void ResidualRMSNorm::inference_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - assert(task->regions.size() == 4); - assert(regions.size() == 4); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_tokens == 0) { return; } ResidualRMSNormMeta *m = *((ResidualRMSNormMeta **)task->local_args); + assert(task->regions.size() == 5 - m->inplace_residual); + assert(regions.size() == 5 - m->inplace_residual); GenericTensorAccessorR input1 = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorR input2 = helperGetGenericTensorAccessorRO( diff --git 
a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 229d1785bf..15d02edbbb 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -613,7 +613,9 @@ void FFModel::compile_inference() { // We should not reset input grads since other operators have already // saved gradients into the region op->reset_input_grads[i] = false; - } else if (i == 0 && (op->op_type == OP_RESIDUAL_LAYERNORM || op->op_type == OP_RESIDUAL_RMS_NORM || op->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM)) { + } else if (i == 0 && (op->op_type == OP_RESIDUAL_LAYERNORM || + op->op_type == OP_RESIDUAL_RMS_NORM || + op->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM)) { if (reset_inputs.find(op->outputs[0]->region) != reset_inputs.end()) { reset_inputs.insert(op->inputs[0]->region); op->reset_input_grads[0] = false; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 2eebc070d6..20496b7d84 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -2077,6 +2077,15 @@ bool is_peft_operator_type(OperatorType type) { PEFTModelID FFModel::register_peft_model(LoraLinearConfig const mlp_first, LoraLinearConfig const mlp_second) { + if (!(mlp_first == LoraLinearConfig::DefaultConfig && + mlp_second == LoraLinearConfig::DefaultConfig)) { + if (!config.enable_peft) { + fprintf(stderr, + "Error: trying to register PEFT model, but peft mode is not " + "enabled.\n"); + assert(false); + } + } PEFTModelID peft_model_id(peft_model_global_guid++); InferenceManager *im = InferenceManager::get_inference_manager(); std::vector peft_operators; From ec2002e98a40bc7814ba38ba5dbc0ba87c9727e3 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 15 Feb 2024 22:16:52 +0000 Subject: [PATCH 151/198] fix --- src/runtime/inference_manager.cc | 2 +- tests/peft/alignment/align_test_utils.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 15d02edbbb..e480e74baa 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -617,9 +617,9 @@ void FFModel::compile_inference() { op->op_type == OP_RESIDUAL_RMS_NORM || op->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM)) { if (reset_inputs.find(op->outputs[0]->region) != reset_inputs.end()) { - reset_inputs.insert(op->inputs[0]->region); op->reset_input_grads[0] = false; } + reset_inputs.insert(op->inputs[i]->region); } else { reset_inputs.insert(op->inputs[i]->region); } diff --git a/tests/peft/alignment/align_test_utils.py b/tests/peft/alignment/align_test_utils.py index b0cb5fe428..dbe7a0be40 100644 --- a/tests/peft/alignment/align_test_utils.py +++ b/tests/peft/alignment/align_test_utils.py @@ -1,8 +1,8 @@ import os, re, torch import numpy as np abs_dirname = os.path.dirname(os.path.abspath(__file__)) -hf_path = os.path.join(abs_dirname, "hf_peft_tensors") -ff_path = os.path.join(os.path.dirname(os.path.dirname(abs_dirname)), "build", "inference_tensors") +hf_path = os.path.join(os.path.dirname(abs_dirname), "hf_peft_tensors") +ff_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(abs_dirname))), "build", "inference_tensors") def print_unique_files_list(dirname): files_list = os.listdir(dirname) for f in sorted(files_list): From 098e88016fe8557da498ae876701f96df46ae966 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 16 Feb 2024 02:48:11 +0000 Subject: [PATCH 152/198] fix residual rms --- .../ops/kernels/residual_rms_norm_kernels.h | 7 +- src/ops/fused.cu | 3 +- 
src/ops/kernels/residual_rms_norm_kernels.cu | 59 ++++---- src/ops/residual_rms_norm.cc | 138 ++++++++++++------ 4 files changed, 134 insertions(+), 73 deletions(-) diff --git a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h index 6eb5c0ae21..dfc9937cc3 100644 --- a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h @@ -66,9 +66,10 @@ void backward_kernel_wrapper( GenericTensorAccessorW const &weight_grad); void peft_bwd_kernel_wrapper(ResidualRMSNormMeta const *m, BatchConfig const *bc, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorW const &residual_input0_grad, - GenericTensorAccessorW const &residual_input1_grad, + GenericTensorAccessorR const &output_grad_0, + GenericTensorAccessorR const &output_grad_1, + GenericTensorAccessorW const &input_grad_0, + GenericTensorAccessorW const &input_grad_1, GenericTensorAccessorR const &weight); } // namespace ResidualRMSNorm } // namespace Kernels diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 55892ab7e9..c589f6a5be 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -1026,9 +1026,10 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, Kernels::ResidualRMSNorm::peft_bwd_kernel_wrapper( m, bc, - my_output_grad_accessor[1], my_input_grad_accessor[0], my_input_grad_accessor[1], + my_output_grad_accessor[0], + my_output_grad_accessor[1], my_weight_accessor[0]); break; } diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index 969c6458a4..4b92e70787 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -332,6 +332,7 @@ __global__ void ComputeInternalGradientsCUDAKernel( template __global__ void RMSNormBackwardCUDAKernel(int64_t N, + T const *dX1_residual, T const *dY, T const *X, T const *gamma, @@ -351,7 +352,7 @@ __global__ void RMSNormBackwardCUDAKernel(int64_t N, if (reset_input_grad1) { dX1[index] = static_cast(dX_val); } else { - dX1[index] += static_cast(dX_val); + dX1[index] = dX1_residual[index] + static_cast(dX_val); } if (reset_input_grad2) { dX2[index] = static_cast(dX1[index]); @@ -399,6 +400,7 @@ void backward_kernel(ResidualRMSNormMeta const *m, RMSNormBackwardCUDAKernel<<>>( N, + nullptr, output_grad_ptr, residual_output_rms_input_ptr, weight_ptr, @@ -421,9 +423,10 @@ void backward_kernel(ResidualRMSNormMeta const *m, template void peft_bwd_kernel(ResidualRMSNormMeta const *m, BatchConfig const *bc, - T const *output_grad_ptr, - T *residual_input0_grad_ptr, - T *residual_input1_grad_ptr, + T const *output_grad_0_ptr, + T const *output_grad_1_ptr, + T *input_grad_0_ptr, + T *input_grad_1_ptr, T const *weight_ptr, cudaStream_t stream) { for (int i = 0; i < bc->max_requests_per_batch(); i++) { @@ -448,7 +451,7 @@ void peft_bwd_kernel(ResidualRMSNormMeta const *m, ComputeInternalGradientsCUDAKernel <<>>( N, - output_grad_ptr, + output_grad_1_ptr, residual_output_rms_input_ptr, weight_ptr, static_cast(m->rms_ptr), @@ -457,13 +460,14 @@ void peft_bwd_kernel(ResidualRMSNormMeta const *m, RMSNormBackwardCUDAKernel <<>>( N, - output_grad_ptr, + output_grad_0_ptr, + output_grad_1_ptr, residual_output_rms_input_ptr, weight_ptr, static_cast(m->rms_ptr), static_cast(m->norm_ptr), - residual_input0_grad_ptr, - residual_input1_grad_ptr, + input_grad_0_ptr, + input_grad_1_ptr, m->reset_input_grads[0], m->reset_input_grads[1]); } @@ -532,17 +536,12 @@ void 
backward_kernel_wrapper( } } -/* - regions[0](I): RMS output_grad - regions[1](I/O): Residual input 0 grad - regions[2](I/O): Residual input 1 grad - regions[3](I): weight -*/ void peft_bwd_kernel_wrapper(ResidualRMSNormMeta const *m, BatchConfig const *bc, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorW const &residual_input0_grad, - GenericTensorAccessorW const &residual_input1_grad, + GenericTensorAccessorR const &output_grad_0, + GenericTensorAccessorR const &output_grad_1, + GenericTensorAccessorW const &input_grad_0, + GenericTensorAccessorW const &input_grad_1, GenericTensorAccessorR const &weight) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -552,24 +551,28 @@ void peft_bwd_kernel_wrapper(ResidualRMSNormMeta const *m, cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } - assert(output_grad.data_type == residual_input0_grad.data_type); - assert(residual_input0_grad.data_type == residual_input1_grad.data_type); - assert(residual_input1_grad.data_type == weight.data_type); + assert(output_grad_1.data_type == input_grad_0.data_type); + assert(input_grad_0.data_type == input_grad_1.data_type); + assert(input_grad_1.data_type == weight.data_type); - if (output_grad.data_type == DT_HALF) { + if (output_grad_1.data_type == DT_HALF) { peft_bwd_kernel(m, bc, - output_grad.get_half_ptr(), - residual_input0_grad.get_half_ptr(), - residual_input1_grad.get_half_ptr(), + m->reset_input_grads[0] ? nullptr + : output_grad_0.get_half_ptr(), + output_grad_1.get_half_ptr(), + input_grad_0.get_half_ptr(), + input_grad_1.get_half_ptr(), weight.get_half_ptr(), stream); - } else if (output_grad.data_type == DT_FLOAT) { + } else if (output_grad_1.data_type == DT_FLOAT) { peft_bwd_kernel(m, bc, - output_grad.get_float_ptr(), - residual_input0_grad.get_float_ptr(), - residual_input1_grad.get_float_ptr(), + m->reset_input_grads[0] ? nullptr + : output_grad_0.get_float_ptr(), + output_grad_1.get_float_ptr(), + input_grad_0.get_float_ptr(), + input_grad_1.get_float_ptr(), weight.get_float_ptr(), stream); } else { diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index 28fafcf224..c0e517f5c4 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -732,37 +732,47 @@ Legion::FutureMap 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); - // regions[0](I): RMS output_grad - launcher.add_region_requirement( - RegionRequirement(batch_outputs[1]->part_grad, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - batch_outputs[1]->region_grad)); - launcher.add_field(0, FID_DATA); - // regions[2](I/O): residual input grad 0 - launcher.add_region_requirement( - RegionRequirement(batch_inputs[0]->part_grad, - 0 /*projection id*/, - reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, - EXCLUSIVE, - batch_inputs[0]->region_grad)); - launcher.add_field(1, FID_DATA); - // regions[3](I/O): residual input grad 1 + int fid = 0; + // residual input grad 0 + launcher.add_region_requirement(RegionRequirement( + batch_inputs[0]->part_grad, + 0 /*projection id*/, + inplace_residual && !reset_input_grads[0] ? READ_WRITE : WRITE_ONLY, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(fid++, FID_DATA); + // residual input grad 1 launcher.add_region_requirement( RegionRequirement(batch_inputs[1]->part_grad, 0 /*projection id*/, reset_input_grads[1] ? 
WRITE_ONLY : READ_WRITE, EXCLUSIVE, batch_inputs[1]->region_grad)); - launcher.add_field(2, FID_DATA); - // regions[4](I): gamma + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual && !reset_input_grads[0]) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(fid++, FID_DATA); + } + // RMS output_grad + launcher.add_region_requirement( + RegionRequirement(batch_outputs[1]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[1]->region_grad)); + launcher.add_field(fid++, FID_DATA); + // gamma launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(fid++, FID_DATA); return runtime->execute_index_space(ctx, launcher); } @@ -776,45 +786,91 @@ void ResidualRMSNorm::peft_bwd_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - assert(task->regions.size() == 4); - assert(regions.size() == 4); ResidualRMSNormMeta *m = *((ResidualRMSNormMeta **)task->local_args); + int expected_regions = + (m->inplace_residual || m->reset_input_grads[0]) ? 4 : 5; + assert(task->regions.size() == expected_regions); + assert(regions.size() == expected_regions); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_active_peft_tokens() == 0) { return; } - GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( - m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorW residual_input0_grad = + + int rid = 0, t_rid = 0; + GenericTensorAccessorW input_grad_0 = helperGetGenericTensorAccessorRW(m->input_type[0], - regions[1], - task->regions[1], + regions[rid++], + task->regions[t_rid++], FID_DATA, ctx, runtime); - GenericTensorAccessorW residual_input1_grad = + GenericTensorAccessorW input_grad_1 = helperGetGenericTensorAccessorRW(m->input_type[0], - regions[2], - task->regions[2], + regions[rid++], + task->regions[t_rid++], FID_DATA, ctx, runtime); - GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); + + GenericTensorAccessorR output_grad_0; + if (!m->reset_input_grads[0]) { + if (m->inplace_residual) { + // mapped to input 0 + output_grad_0 = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + } else { + output_grad_0 = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[rid++], + task->regions[t_rid++], + FID_DATA, + ctx, + runtime); + } + } + GenericTensorAccessorR output_grad_1 = + helperGetGenericTensorAccessorRO(m->output_type[0], + regions[rid++], + task->regions[t_rid++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR weight = + helperGetGenericTensorAccessorRO(m->weight_type[0], + regions[rid++], + task->regions[t_rid++], + FID_DATA, + ctx, + runtime); + peft_bwd_kernel_wrapper( - m, bc, output_grad, residual_input0_grad, residual_input1_grad, weight); + m, bc, output_grad_0, output_grad_1, input_grad_0, input_grad_1, weight); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; - ResidualRMSNorm::save_inference_tensors_to_file( - m, - shard_id, - bc, - {residual_input0_grad, residual_input1_grad}, - {weight}, - {output_grad}, - false); + if 
(!m->reset_input_grads[0]) { + ResidualRMSNorm::save_inference_tensors_to_file( + m, + shard_id, + bc, + {input_grad_0, input_grad_1}, + {weight}, + {output_grad_0, output_grad_1}, + false); + } else { + ResidualRMSNorm::save_inference_tensors_to_file( + m, + shard_id, + bc, + {input_grad_0, input_grad_1}, + {weight}, + {output_grad_1}, + false); + } } } From 5688e16b374c6cd1b95433879ec68c9b002248d7 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 16 Feb 2024 05:02:10 +0000 Subject: [PATCH 153/198] fix --- src/ops/fused.cc | 2 +- src/ops/fused.cu | 43 ++++++++++++++++++++++++------------------- 2 files changed, 25 insertions(+), 20 deletions(-) diff --git a/src/ops/fused.cc b/src/ops/fused.cc index bdb6d4d7a2..4c934f8612 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -652,7 +652,7 @@ FutureMap FusedOp::inference(FFModel const &ff, offset += numOutputs; // add softmax output grad if (operators[numOperators - 1]->op_type == OP_SOFTMAX) { - printf("operator %i is last SOFTMAX! adding output %i\n", + printf("operator %i is last SOFTMAX! adding grad for output %i\n", numOperators - 1, numOutputs - 1); assert(outputs[numOutputs - 1]->region != LogicalRegion::NO_REGION); diff --git a/src/ops/fused.cu b/src/ops/fused.cu index c589f6a5be..b89b6909cf 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -44,6 +44,7 @@ #include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/parallel_ops/kernels/allreduce_kernels.h" #include "flexflow/utils/cuda_helper.h" +#include "flexflow/ffconst_utils.h" namespace FlexFlow { // declare Legion names @@ -161,6 +162,9 @@ __host__ void int ioff = 0, woff = 0, ooff = 0; for (int op = 0; op < fused->numOperators; op++) { +#if 0 + std::cout << get_operator_type_name(fused->op_op_type[op]) << std::endl; +#endif // Domain my_id[MAX_NUM_INPUTS]; // Domain my_wd[MAX_NUM_WEIGHTS]; // Domain my_od[MAX_NUM_OUTPUTS]; @@ -172,9 +176,15 @@ __host__ void if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { // my_id[i] = input_domain[my_off]; my_input_accessor[i] = input_accessor[my_off]; +#if 0 + printf("\tmy_input_accessor[%i] = input_accessor[%i]\n", i, my_off); +#endif } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { // my_id[i] = output_domain[my_off]; my_input_accessor[i] = output_accessor[my_off]; +#if 0 + printf("\tmy_input_accessor[%i] = output_accessor[%i]\n", i, my_off); +#endif } else { assert(false); } @@ -191,6 +201,9 @@ __host__ void // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; my_output_accessor[i] = output_accessor[my_off]; +#if 0 + printf("\tmy_output_accessor[%i] = output_accessor[%i]\n", i, my_off); +#endif } switch (fused->op_op_type[op]) { case OP_CONCAT: { @@ -439,13 +452,14 @@ __host__ void assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_weights[op] == 1); assert(fused->op_num_outputs[op] == 2); - ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op]; - Kernels::ResidualRMSNorm::forward_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - my_weight_accessor[0], - my_output_accessor[0], - my_output_accessor[1]); + ResidualRMSNormMeta *m = (ResidualRMSNormMeta *)metas->meta[op]; + Kernels::ResidualRMSNorm::inference_kernel_wrapper(m, + bc, + my_input_accessor[0], + my_input_accessor[1], + my_weight_accessor[0], + my_output_accessor[0], + my_output_accessor[1]); break; } case OP_INC_MULTIHEAD_SELF_ATTENTION: { @@ -668,22 +682,13 @@ __host__ void std::vector weight_accessors_to_save; 
std::vector output_accessors_to_save; for (int i = 0; i < fused->op_num_inputs[op]; i++) { - int my_off = fused->op_input_idx[i + ioff]; - if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { - input_accessors_to_save.push_back(input_accessor[my_off]); - } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { - input_accessors_to_save.push_back(output_accessor[my_off]); - } else { - assert(false); - } + input_accessors_to_save.push_back(my_input_accessor[i]); } for (int i = 0; i < fused->op_num_weights[op]; i++) { - assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); - weight_accessors_to_save.push_back( - weight_accessor[fused->op_weight_idx[i + woff]]); + weight_accessors_to_save.push_back(my_weight_accessor[i]); } for (int i = 0; i < fused->op_num_outputs[op]; i++) { - output_accessors_to_save.push_back(output_accessor[i + ooff]); + output_accessors_to_save.push_back(my_output_accessor[i]); } assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; From 9225e0c966cc5156ee6967c25be62c59bb1c2b4b Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 16 Feb 2024 05:39:37 +0000 Subject: [PATCH 154/198] fix --- src/ops/fused.cu | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/src/ops/fused.cu b/src/ops/fused.cu index b89b6909cf..33b0aeca19 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -441,11 +441,12 @@ __host__ void assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 1); assert(fused->op_num_outputs[op] == 1); - RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op]; - Kernels::RMSNorm::forward_kernel_wrapper(m, - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0]); + RMSNormMeta *m = (RMSNormMeta *)metas->meta[op]; + Kernels::RMSNorm::inference_kernel_wrapper(m, + bc, + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0]); break; } case OP_RESIDUAL_RMS_NORM: { @@ -805,6 +806,9 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, } for (int op = fused->numOperators - 1; op >= 0; op--) { +#if 0 + std::cout << get_operator_type_name(fused->op_op_type[op]) << std::endl; +#endif ioff -= fused->op_num_inputs[op]; woff -= fused->op_num_weights[op]; ooff -= fused->op_num_outputs[op]; @@ -813,9 +817,15 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { // my_id[i] = input_domain[my_off]; my_input_grad_accessor[i] = input_grad_accessor[my_off]; +#if 0 + printf("\tmy_input_grad_accessor[%i] = input_grad_accessor[%i]\n", i, my_off); +#endif } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { // my_id[i] = output_domain[my_off]; my_input_grad_accessor[i] = output_grad_accessor[my_off]; +#if 0 + printf("\tmy_input_grad_accessor[%i] = output_grad_accessor[%i]\n", i, my_off); +#endif } else { assert(false); } @@ -832,6 +842,9 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; my_output_grad_accessor[i] = output_grad_accessor[my_off]; +#if 0 + printf("\tmy_output_grad_accessor[%i] = output_grad_accessor[%i]\n", i, my_off); +#endif } switch (fused->op_op_type[op]) { case OP_CONCAT: { From e12bff14f266d4b6ee1d868c3e883c76b916079a Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 19 Feb 2024 02:19:11 +0000 Subject: [PATCH 155/198] enable inf debugging in fusion bwd --- src/ops/fused.cu | 23 +++++++++++++++++++++++ 1 file changed, 23 
insertions(+) diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 33b0aeca19..965e08d6f9 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -1195,6 +1195,29 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, assert(false && "Fusion currently does not support type"); } } + if (metas->meta[op]->inference_debugging) { + std::vector input_accessors_to_save; + std::vector weight_accessors_to_save; + std::vector output_accessors_to_save; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + input_accessors_to_save.push_back(my_input_grad_accessor[i]); + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + weight_accessors_to_save.push_back(my_weight_accessor[i]); + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + output_accessors_to_save.push_back(my_output_grad_accessor[i]); + } + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + FusedOp::save_inference_tensors_to_file(metas->meta[op], + shard_id, + bc, + input_accessors_to_save, + weight_accessors_to_save, + output_accessors_to_save, + false); + } } } From ed9afb7c0e1bff9f4966ff0afbe6c3b55e2e9cf5 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 19 Feb 2024 02:25:47 +0000 Subject: [PATCH 156/198] hack to silence warning in fused bwd --- src/ops/fused.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/ops/fused.cc b/src/ops/fused.cc index 4c934f8612..a81bf716bd 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -652,9 +652,9 @@ FutureMap FusedOp::inference(FFModel const &ff, offset += numOutputs; // add softmax output grad if (operators[numOperators - 1]->op_type == OP_SOFTMAX) { - printf("operator %i is last SOFTMAX! adding grad for output %i\n", - numOperators - 1, - numOutputs - 1); + // printf("operator %i is last SOFTMAX! adding grad for output %i\n", + // numOperators - 1, + // numOutputs - 1); assert(outputs[numOutputs - 1]->region != LogicalRegion::NO_REGION); launcher.add_region_requirement( RegionRequirement(batch_outputs[numOutputs - 1]->part_grad, @@ -700,7 +700,7 @@ FutureMap FusedOp::peft_bwd(FFModel const &ff, launcher.add_region_requirement( RegionRequirement(batch_inputs[i]->part_grad, 0 /*projection id*/, - READ_WRITE, + WRITE_ONLY, EXCLUSIVE, batch_inputs[i]->region_grad)); launcher.add_field(offset + i, FID_DATA); @@ -721,7 +721,7 @@ FutureMap FusedOp::peft_bwd(FFModel const &ff, launcher.add_region_requirement( RegionRequirement(batch_outputs[i]->part_grad, 0 /*projection id*/, - READ_WRITE, + i == numOutputs -1 ? READ_WRITE : WRITE_ONLY, EXCLUSIVE, batch_outputs[i]->region_grad)); launcher.add_field(offset + i, FID_DATA); From 96d0e9b00fc1e33ec34e682f8b231b098f52bffc Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 19 Feb 2024 02:43:25 +0000 Subject: [PATCH 157/198] fix --- src/ops/fused.cc | 2 +- src/ops/fused.cu | 35 ++++++++++++++++++++++------------- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/src/ops/fused.cc b/src/ops/fused.cc index a81bf716bd..d5f1ace86d 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -721,7 +721,7 @@ FutureMap FusedOp::peft_bwd(FFModel const &ff, launcher.add_region_requirement( RegionRequirement(batch_outputs[i]->part_grad, 0 /*projection id*/, - i == numOutputs -1 ? READ_WRITE : WRITE_ONLY, + i == numOutputs - 1 ? 
READ_WRITE : WRITE_ONLY, EXCLUSIVE, batch_outputs[i]->region_grad)); launcher.add_field(offset + i, FID_DATA); diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 965e08d6f9..99d9e3410f 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -14,6 +14,7 @@ */ #include "flexflow/accessor.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/model.h" #include "flexflow/ops/add_bias_residual_layer_norm.h" #include "flexflow/ops/batch_norm.h" @@ -44,7 +45,6 @@ #include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/parallel_ops/kernels/allreduce_kernels.h" #include "flexflow/utils/cuda_helper.h" -#include "flexflow/ffconst_utils.h" namespace FlexFlow { // declare Legion names @@ -444,9 +444,9 @@ __host__ void RMSNormMeta *m = (RMSNormMeta *)metas->meta[op]; Kernels::RMSNorm::inference_kernel_wrapper(m, bc, - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0]); + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0]); break; } case OP_RESIDUAL_RMS_NORM: { @@ -454,13 +454,14 @@ __host__ void assert(fused->op_num_weights[op] == 1); assert(fused->op_num_outputs[op] == 2); ResidualRMSNormMeta *m = (ResidualRMSNormMeta *)metas->meta[op]; - Kernels::ResidualRMSNorm::inference_kernel_wrapper(m, - bc, - my_input_accessor[0], - my_input_accessor[1], - my_weight_accessor[0], - my_output_accessor[0], - my_output_accessor[1]); + Kernels::ResidualRMSNorm::inference_kernel_wrapper( + m, + bc, + my_input_accessor[0], + my_input_accessor[1], + my_weight_accessor[0], + my_output_accessor[0], + my_output_accessor[1]); break; } case OP_INC_MULTIHEAD_SELF_ATTENTION: { @@ -678,7 +679,11 @@ __host__ void assert(false && "Fusion currently does not support type"); } } - if (metas->meta[op]->inference_debugging) { + if (metas->meta[op]->inference_debugging && + !(fused->op_op_type[op] == OP_ALLREDUCE || + fused->op_op_type[op] == OP_REPLICATE || + fused->op_op_type[op] == OP_REPARTITION || + fused->op_op_type[op] == OP_COMBINE)) { std::vector input_accessors_to_save; std::vector weight_accessors_to_save; std::vector output_accessors_to_save; @@ -1195,7 +1200,11 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, assert(false && "Fusion currently does not support type"); } } - if (metas->meta[op]->inference_debugging) { + if (metas->meta[op]->inference_debugging && + !(fused->op_op_type[op] == OP_ALLREDUCE || + fused->op_op_type[op] == OP_REPLICATE || + fused->op_op_type[op] == OP_REPARTITION || + fused->op_op_type[op] == OP_COMBINE)) { std::vector input_accessors_to_save; std::vector weight_accessors_to_save; std::vector output_accessors_to_save; From 2cbc0b717bd5063627595059ece7c357f74cba23 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 19 Feb 2024 04:31:05 +0000 Subject: [PATCH 158/198] fix --- src/runtime/model.cc | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 603e87a937..10ce05ca1e 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3798,9 +3798,16 @@ bool FFModel::check_operators_integrity( } for (int i = 0; i < fused->op_num_outputs[op]; i++) { int my_off = fused->op_output_idx[i + ooff]; - assert(fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT); - assert(FusedOp::use_same_regions( - fused->outputs[my_off], old_op->outputs[i], pt_mapping)); + assert(fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT || + (fused->op_output_source[i + ooff] == FusedOp::SOURCE_INPUT && + (old_op->op_type == OP_RESIDUAL_LAYERNORM || + 
old_op->op_type == OP_RESIDUAL_RMS_NORM || + old_op->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM))); + if (fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT) { + assert(FusedOp::use_same_regions(fused->outputs[my_off], old_op->outputs[i], pt_mapping)); + } else { + assert(FusedOp::use_same_regions(fused->inputs[my_off], old_op->outputs[i], pt_mapping)); + } } ioff += fused->op_num_inputs[op]; woff += fused->op_num_weights[op]; From 36cb2b39d1ff573e2b8f60dcc81deb1b4a4378f0 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 19 Feb 2024 05:39:13 +0000 Subject: [PATCH 159/198] fix build --- inference/incr_decoding/incr_decoding.cc | 2 +- src/c/flexflow_c.cc | 2 +- src/ops/arg_topk.cc | 5 +-- src/ops/inc_multihead_self_attention.cu | 1 + src/ops/sigmoid_silu_multi.cc | 4 --- src/runtime/inference_manager.cc | 4 +-- src/runtime/model.cc | 14 +++++---- src/runtime/request_manager.cc | 39 ++++++++++++++---------- 8 files changed, 39 insertions(+), 32 deletions(-) diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 7f2ea21148..d376c3e39c 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -270,7 +270,7 @@ void FlexFlow::top_level_task(Task const *task, : model.register_peft_model( LoraLinearConfig::DefaultConfig /*mlp_first*/, mlp_second /*mlp_second*/); - + // Start background server rm->start_background_server(&model); diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index a9ba9158ee..58acf3d010 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1616,7 +1616,7 @@ void flexflow_model_generate(flexflow_model_t handle_, text_str.c_str(), max_seq_length); } - + std::vector results = handle->generate(requests); // If the prompt exceeds max seq len, check that we return the prompt with no diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc index 53332791c4..53b259a703 100644 --- a/src/ops/arg_topk.cc +++ b/src/ops/arg_topk.cc @@ -431,9 +431,10 @@ BeamInferenceResult ArgTopK::inference_speculative_task( ArgTopK::forward_kernel_wrapper(m, input, probs, indices, batch_size, &bc); BeamInferenceResult ir; - download_tensor( + copy_tensor_dev_to_host( indices.get_int32_ptr(), ir.token_ids, batch_size * m->k); - download_tensor(probs.get_float_ptr(), ir.probs, batch_size * m->k); + copy_tensor_dev_to_host( + probs.get_float_ptr(), ir.probs, batch_size * m->k); return ir; } diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 92bafaead3..83fdbaf927 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1644,6 +1644,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, // Copy C_softmax to m->softmax_activation_buffer if we need to compute // PEFT backward if (bc->requestsInfo[i].peft_bwd) { + DT *C_softmax = static_cast
(m->qk_prods_softmax); MemoryAllocator *allocator = m->handle.peft_activation_allocator; m->softmax_activation_buffer = allocator->allocate_instance_untyped( sizeof(DT) * total_tokens * num_new_tokens * m->num_q_heads); diff --git a/src/ops/sigmoid_silu_multi.cc b/src/ops/sigmoid_silu_multi.cc index e87bd16699..98cd662efd 100644 --- a/src/ops/sigmoid_silu_multi.cc +++ b/src/ops/sigmoid_silu_multi.cc @@ -570,10 +570,6 @@ Node SigmoidSiluMulti::deserialize(FFModel &ff, dez.deserialize(name_len); dez.deserialize(name, name_len); LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); - size_t name_len; - char name[MAX_OPNAME] = {0}; - dez.deserialize(name_len); - dez.deserialize(name, name_len); SigmoidSiluMultiParams params; params.layer_guid = layer_guid; diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 34c807dee4..91a6dab9b5 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -236,8 +236,8 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { // Check whether we need to reset input grads // We use a parallel tensor's region as the key std::set reset_inputs; - for (int l = operators.size() - 1; l >= 0; l--) { - Op *op = operators[l]; + for (int l = model->operators.size() - 1; l >= 0; l--) { + Op *op = model->operators[l]; for (int i = 0; i < op->numInputs; i++) { assert(op->inputs[i]->region != LogicalRegion::NO_REGION); if (reset_inputs.find(op->inputs[i]->region) != reset_inputs.end()) { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 10ce05ca1e..a64fb8ec9c 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3799,14 +3799,16 @@ bool FFModel::check_operators_integrity( for (int i = 0; i < fused->op_num_outputs[op]; i++) { int my_off = fused->op_output_idx[i + ooff]; assert(fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT || - (fused->op_output_source[i + ooff] == FusedOp::SOURCE_INPUT && - (old_op->op_type == OP_RESIDUAL_LAYERNORM || - old_op->op_type == OP_RESIDUAL_RMS_NORM || - old_op->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM))); + (fused->op_output_source[i + ooff] == FusedOp::SOURCE_INPUT && + (old_op->op_type == OP_RESIDUAL_LAYERNORM || + old_op->op_type == OP_RESIDUAL_RMS_NORM || + old_op->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM))); if (fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT) { - assert(FusedOp::use_same_regions(fused->outputs[my_off], old_op->outputs[i], pt_mapping)); + assert(FusedOp::use_same_regions( + fused->outputs[my_off], old_op->outputs[i], pt_mapping)); } else { - assert(FusedOp::use_same_regions(fused->inputs[my_off], old_op->outputs[i], pt_mapping)); + assert(FusedOp::use_same_regions( + fused->inputs[my_off], old_op->outputs[i], pt_mapping)); } } ioff += fused->op_num_inputs[op]; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 7bc1966abf..41c371d4e2 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -435,12 +435,13 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } } else { int processed_tokens = - old_bc.requestsInfo[i].first_token_depth_in_request + - old_bc.requestsInfo[i].num_tokens_in_batch; + old_bc.requestsInfo[i].first_token_depth_in_request + + old_bc.requestsInfo[i].num_tokens_in_batch; assert(processed_tokens < request.tokens.size()); bool request_completed = false; // printf("model_type = %d\n", this->model_type); - if (request.tokens.size() >= old_bc.requestsInfo[i].max_sequence_length) { + 
if (request.tokens.size() >= + old_bc.requestsInfo[i].max_sequence_length) { request_completed = true; } else if (request.tokens.back() == eos_token_id) { // Encounter EOS token id @@ -469,7 +470,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, log_req_mgr.print("Final output: %s", output.c_str()); num_processed_requests++; ProfileInfo profile_info = profiling_requests[request.guid]; - profile_info.finish_time = Realm::Clock::current_time_in_microseconds(); + profile_info.finish_time = + Realm::Clock::current_time_in_microseconds(); total_request_run_time += profile_info.finish_time - profile_info.start_time; profiling_requests[request.guid] = profile_info; @@ -486,10 +488,10 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, std::ofstream outputFile(output_filepath, std::ios::app); if (outputFile.is_open()) { outputFile << "end-to-end latency: " << std::fixed - << std::setprecision(3) << total_request_run_time - << std::endl; + << std::setprecision(3) << total_request_run_time + << std::endl; outputFile << "num decoding steps: " - << profile_info.llm_decoding_steps << std::endl; + << profile_info.llm_decoding_steps << std::endl; outputFile << "token IDs: "; for (int i = 0; i < request.tokens.size(); i++) { outputFile << request.tokens[i]; @@ -509,11 +511,14 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } else { new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens; - new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; + new_bc.requestsInfo[i].first_token_depth_in_request = + processed_tokens; + new_bc.requestsInfo[i].first_token_offset_in_batch = + new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].peft_model_id = old_bc.requestsInfo[i].peft_model_id; + new_bc.requestsInfo[i].peft_model_id = + old_bc.requestsInfo[i].peft_model_id; new_bc.requestsInfo[i].peft_bwd = old_bc.requestsInfo[i].peft_bwd; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; @@ -527,10 +532,10 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].prompt_phase = false; } else { // Prompt phase - new_bc.requestsInfo[i].num_tokens_in_batch = - std::min(get_max_tokens_per_batch() - new_bc.num_tokens, - (int)request.tokens.size() - - new_bc.requestsInfo[i].first_token_depth_in_request); + new_bc.requestsInfo[i].num_tokens_in_batch = std::min( + get_max_tokens_per_batch() - new_bc.num_tokens, + (int)request.tokens.size() - + new_bc.requestsInfo[i].first_token_depth_in_request); new_bc.requestsInfo[i].prompt_phase = true; } for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { @@ -538,7 +543,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.tokensInfo[new_bc.num_tokens].request_index = i; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; assert(depth < request.tokens.size()); - new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[depth]; + new_bc.tokensInfo[new_bc.num_tokens].token_id = + request.tokens[depth]; new_bc.num_tokens++; } // Update profiling @@ -2399,7 +2405,8 @@ std::vector> return merged_tree; } -std::vector FFModel::generate(std::vector const &requests) { +std::vector + FFModel::generate(std::vector const &requests) { RequestManager *rm = RequestManager::get_request_manager(); std::vector guids; for (int i = 0; i < 
requests.size(); i++) { From 21b77f11c3cacb06c294bdb17a2b3be52e8fdb83 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 19 Feb 2024 18:47:01 +0000 Subject: [PATCH 160/198] fix --- src/ops/noop.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/ops/noop.cc b/src/ops/noop.cc index dabdf835dd..45bd76d59d 100644 --- a/src/ops/noop.cc +++ b/src/ops/noop.cc @@ -90,9 +90,10 @@ OpMeta *NoOp::init_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { + NoOp *no_op = (NoOp *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - // OpMeta *m = new OpMeta(handle); - return nullptr; + OpMeta *m = new OpMeta(handle, no_op); + return m; } void NoOp::init_inference(FFModel const &ff, @@ -167,7 +168,7 @@ void NoOp::init_inference(FFModel const &ff, set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); IndexLauncher launcher(NOOP_INIT_TASK_ID, parallel_is, - TaskArgument(NULL, 0), + TaskArgument(this, sizeof(NoOp)), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -244,7 +245,7 @@ void NoOp::init(FFModel const &ff) { set_argumentmap_for_init(ff, argmap); IndexLauncher launcher(NOOP_INIT_TASK_ID, parallel_is, - TaskArgument(NULL, 0), + TaskArgument(this, sizeof(NoOp)), argmap, Predicate::TRUE_PRED, false /*must*/, From 9075d3fb7ea3ef893c46f554551f681a109d8f90 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 19 Feb 2024 20:29:41 +0000 Subject: [PATCH 161/198] fix --- python/flexflow/core/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/flexflow/core/__init__.py b/python/flexflow/core/__init__.py index 2614518acf..522dbe7e44 100644 --- a/python/flexflow/core/__init__.py +++ b/python/flexflow/core/__init__.py @@ -88,7 +88,7 @@ "offload_reserve_space_size": "-offload-reserve-space-size", "use_4bit_quantization": "--4bit-quantization", "use_8bit_quantization": "--8bit-quantization", - "enable_peft": "", + "enable_peft": "-enable-peft", "peft_activation_reserve_space_size": "-peft-activation-reserve-space-size", "peft_weight_reserve_space_size": "-peft-weight-reserve-space-size", } From 0b35b0c16ee2bec35be2acb4c59e9e7801292b4e Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 22 Mar 2024 20:19:50 +0000 Subject: [PATCH 162/198] add draft peft test --- tests/peft_test.sh | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100755 tests/peft_test.sh diff --git a/tests/peft_test.sh b/tests/peft_test.sh new file mode 100755 index 0000000000..29b3e6520c --- /dev/null +++ b/tests/peft_test.sh @@ -0,0 +1,28 @@ +#! /usr/bin/env bash +set -x +set -e + +# Cd into directory holding this script +cd "${BASH_SOURCE[0]%/*}" + +# Token to access private huggingface models (e.g. 
LLAMA-2) +HUGGINGFACE_TOKEN=${HUGGINGFACE_TOKEN:-none} +if [[ "$HUGGINGFACE_TOKEN" != "none" ]]; then + huggingface-cli login --token "$HUGGINGFACE_TOKEN" +fi + +# Create test prompt file +mkdir -p ../inference/prompt +echo '["Two things are infinite: "]' > ../inference/prompt/peft.json + +# Create output folder +mkdir -p ../inference/output + +# Enable backtrace in case we run into a segfault or assertion failure +export LEGION_BACKTRACE=1 + +# Download test model +python ../inference/utils/download_peft_model.py goliaro/llama-160m-lora-full --base_model_name JackFram/llama-160m +# if first time, add: --refresh-cache + +./inference/incr_decoding/incr_decoding -ll:gpu 1 -ll:cpu 4 -ll:fsize 8192 -ll:zsize 12000 -ll:util 4 -llm-model JackFram/llama-160m -prompt ../inference/prompt/peft.json -peft-model goliaro/llama-160m-lora-full --use-full-precision --inference-debugging --fusion -enable-peft From b6ada2f9b9df6ce00c0c2a2b00d6bf3ac81dea2d Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 27 Mar 2024 14:47:31 -0400 Subject: [PATCH 163/198] Peft python interface (#1306) * update script * less model renaming * fix * fix * fix * backup * . * update * . * fixes * fix * fix build * fix * fix * fix issues for downloading peft model * solved issues for download peft model * added printouts for debugging * fix * fix seg fault * add test, separate peft script in cpp * fix * fixes * fix * update peft python interface * update * update * update * updates * fix * fixes * fix * fixes --------- Co-authored-by: april-yyt --- CMakeLists.txt | 1 + include/flexflow/ffconst.h | 8 +- include/flexflow/flexflow_c.h | 36 +- include/flexflow/model.h | 15 +- include/flexflow/ops/lora_linear.h | 26 +- include/flexflow/ops/lora_linear_params.h | 4 +- include/flexflow/request_manager.h | 4 +- inference/incr_decoding/incr_decoding.cc | 43 +- inference/models/falcon.cc | 34 +- inference/models/llama.cc | 110 +- inference/models/mpt.cc | 35 +- inference/models/opt.cc | 43 +- inference/models/starcoder.cc | 26 +- inference/peft/CMakeLists.txt | 38 + inference/peft/Makefile | 37 + inference/peft/peft.cc | 348 ++ inference/python/ff_peft.py | 148 + inference/utils/download_peft_model.py | 28 +- python/flexflow/core/flexflow_cffi.py | 4819 +++++++++++---------- python/flexflow/serve/__init__.py | 15 +- python/flexflow/serve/models/falcon.py | 35 +- python/flexflow/serve/models/llama.py | 43 +- python/flexflow/serve/models/mpt.py | 37 +- python/flexflow/serve/models/opt.py | 40 +- python/flexflow/serve/models/starcoder.py | 47 +- python/flexflow/serve/serve.py | 481 +- python/flexflow/type.py | 3 + src/c/flexflow_c.cc | 145 +- src/ops/fused.cu | 6 +- src/ops/inc_multihead_self_attention.cu | 3 +- src/ops/lora_linear.cc | 479 +- src/ops/lora_linear_params.cc | 28 +- src/runtime/ffconst_utils.cc | 6 +- src/runtime/file_loader.cc | 90 +- src/runtime/graph.cc | 3 +- src/runtime/inference_manager.cc | 44 + src/runtime/model.cc | 19 +- src/runtime/request_manager.cc | 215 +- tests/peft/hf_serve.py | 70 +- tests/peft_test.sh | 6 +- 40 files changed, 4228 insertions(+), 3390 deletions(-) create mode 100644 inference/peft/CMakeLists.txt create mode 100644 inference/peft/Makefile create mode 100644 inference/peft/peft.cc create mode 100644 inference/python/ff_peft.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 43ce4f7044..22770b6c28 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -558,6 +558,7 @@ if(NOT BUILD_LEGION_ONLY) if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) 
add_subdirectory(inference/spec_infer) add_subdirectory(inference/incr_decoding) + add_subdirectory(inference/peft) endif() diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index fb12adf2d3..016dd7bdd1 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -78,6 +78,11 @@ enum InferenceMode { TREE_VERIFY_MODE = 2003, }; +enum RequestType { + REQ_INFERENCE = 4001, + REQ_FINETUNING = 4002, +}; + // This is consistent with TASO's OpType // https://github.com/jiazhihao/TASO/blob/master/include/taso/ops.h#L75-L138 enum OperatorType { @@ -179,8 +184,7 @@ enum OperatorType { OP_TREE_INC_MULTIHEAD_SELF_ATTENTION, OP_SAMPLING, // PEFT Ops - OP_LORA_MLP_FIRST, - OP_LORA_MLP_SECOND, + OP_LORA, // Parallel Ops OP_REPARTITION, OP_COMBINE, diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index b7b20f2d2f..004523e875 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -55,6 +55,8 @@ FF_NEW_OPAQUE_TYPE(flexflow_inference_manager_t); FF_NEW_OPAQUE_TYPE(flexflow_request_manager_t); FF_NEW_OPAQUE_TYPE(flexflow_file_data_loader_t); FF_NEW_OPAQUE_TYPE(flexflow_generation_result_t); +FF_NEW_OPAQUE_TYPE(flexflow_lora_linear_config_t); +FF_NEW_OPAQUE_TYPE(flexflow_peft_model_id_t); // ----------------------------------------------------------------------- // FFConfig @@ -593,6 +595,9 @@ flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, bool beam_search, char const *name); +flexflow_peft_model_id_t flexflow_model_add_lora_layer( + flexflow_model_t handle_, const flexflow_lora_linear_config_t peft_config_); + void flexflow_model_set_sgd_optimizer(flexflow_model_t handle, flexflow_sgd_optimizer_t optimizer); @@ -616,10 +621,13 @@ void flexflow_model_set_transformer_layer_id(flexflow_model_t handle, int id); void flexflow_model_generate(flexflow_model_t handle_, int num_requests, - char const **input_text, - int max_num_chars, - char **output_text, - int max_seq_length, + enum RequestType *request_types, + char const **input_texts, + char **output_texts, + int *max_seq_lengths, + flexflow_peft_model_id_t *peft_model_ids, + char const **dataset_filepaths, + int *training_steps, int **output_length_and_tokens); void flexflow_model_set_position_offset(flexflow_model_t handle, int offset); @@ -1036,6 +1044,26 @@ void flexflow_file_data_loader_destroy(flexflow_file_data_loader_t handle_); void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_, flexflow_model_t model_handle_); +// ----------------------------------------------------------------------- +// LoraLinearConfig +// ----------------------------------------------------------------------- + +flexflow_lora_linear_config_t + flexflow_lora_linear_config_create(char const *cache_folder_, + char const *peft_model_id_); + +void flexflow_lora_linear_config_destroy(flexflow_lora_linear_config_t handle_); + +// ----------------------------------------------------------------------- +// PEFTModelID +// ----------------------------------------------------------------------- + +flexflow_peft_model_id_t flexflow_peft_model_id_create(); + +flexflow_peft_model_id_t flexflow_peft_model_id_create_id(unsigned long id); + +void flexflow_peft_model_id_destroy(flexflow_peft_model_id_t handle_); + #ifdef __cplusplus } #endif diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 34ace0c5dc..099e2209e4 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -837,19 +837,12 @@ class FFModel { // 
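For reference, the new C entry points declared above compose in the same way as the C++ add_lora_layer path introduced later in this series. A minimal, hypothetical sketch (not part of the patch) follows; the ffmodel handle is assumed to come from the existing flexflow_model_create binding, and the cache folder and adapter name are illustrative values taken from the test script elsewhere in this series:

    /* Hypothetical usage sketch of the new C PEFT API; `ffmodel` is assumed to be
     * a flexflow_model_t obtained through the existing model-creation bindings. */
    flexflow_lora_linear_config_t peft_config = flexflow_lora_linear_config_create(
        "~/.cache/flexflow", "goliaro/llama-160m-lora-full");
    flexflow_peft_model_id_t peft_model_id =
        flexflow_model_add_lora_layer(ffmodel, peft_config);
    /* peft_model_id can then be attached to inference or finetuning requests;
     * flexflow_lora_linear_config_destroy and flexflow_peft_model_id_destroy are
     * the matching cleanup calls. */
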
========================================
   // PEFT Layers
   // ========================================
-  void lora_linear(Tensor const input,
-                   Tensor const output,
-                   OperatorType _type,
-                   char const *name = nullptr);
+  PEFTModelID *add_lora_layer(LoraLinearConfig const peft_config);
   // ========================================
   // Inference APIs
   // ========================================
   std::vector<GenerationResult>
       generate(std::vector<Request> const &requests);
-  PEFTModelID register_peft_model(
-      LoraLinearConfig const mlp_first = LoraLinearConfig::DefaultConfig,
-      LoraLinearConfig const mlp_second = LoraLinearConfig::DefaultConfig);
-
   Tensor create_tensor_legion_ordering(int num_dim,
                                        int const dims[],
                                        DataType data_type,
@@ -1174,6 +1167,12 @@ class FFModel {
   std::vector<Layer *> layers;
   std::vector<Op *> operators;
   std::vector<ParallelParameter> parameters;
+  // PEFT related
+  std::unordered_map<Layer *, Layer *> base_layer_to_peft_layer;
+  std::unordered_map<Layer *, std::vector<PEFTModelID>> peft_layer_to_peft_id;
+  std::unordered_map<PEFTModelID, LoraLinearConfig> peft_configs;
+  // std::vector<Op *> peft_operators;
+
   FFHandler handlers[MAX_NUM_WORKERS];
   Legion::Future current_metrics;
   // Cached operators: key: operator hash, value: operator pointer
diff --git a/include/flexflow/ops/lora_linear.h b/include/flexflow/ops/lora_linear.h
index b9aabdd1aa..9e83c3f90e 100644
--- a/include/flexflow/ops/lora_linear.h
+++ b/include/flexflow/ops/lora_linear.h
@@ -17,12 +17,14 @@ class LoraLinear : public Op {
   using Params = LoraLinearParams;
   using Input = std::pair<ParallelTensor, ParallelTensor>;
 
-  LoraLinear(FFModel &model,
-             LayerID const &layer_guid,
-             OperatorType type,
-             ParallelTensor const input,
-             ParallelTensor const output,
-             char const *name = nullptr);
+  LoraLinear(
+      FFModel &model,
+      LayerID const &layer_guid,
+      OperatorType type,
+      ParallelTensor const input,
+      ParallelTensor const output,
+      std::unordered_map<PEFTModelID, LoraLinearConfig> const &_peft_configs,
+      char const *name = nullptr);
   LoraLinear(FFModel &model,
              LoraLinear const &other,
              ParallelTensor const input,
@@ -39,11 +41,6 @@ class LoraLinear : public Op {
              MachineView const *mv = nullptr) override;
   void forward(FFModel const &) override;
   void backward(FFModel const &) override;
-  void register_peft_model(FFModel const &ff,
-                           std::vector<ParallelTensor> const &batch_inputs,
-                           std::vector<ParallelTensor> const &batch_outputs,
-                           PEFTModelID const &model_id,
-                           LoraLinearConfig const lora_config);
   Legion::FutureMap inference(FFModel const &,
                               BatchConfigFuture const &,
                               std::vector<ParallelTensor> const &,
@@ -64,11 +61,6 @@ class LoraLinear : public Op {
                         std::vector<Legion::PhysicalRegion> const &regions,
                         Legion::Context ctx,
                         Legion::Runtime *runtime);
-  static void
-      register_model_task(Legion::Task const *task,
-                          std::vector<Legion::PhysicalRegion> const &regions,
-                          Legion::Context ctx,
-                          Legion::Runtime *runtime);
   static void inference_task(Legion::Task const *task,
                              std::vector<Legion::PhysicalRegion> const &regions,
                              Legion::Context ctx,
@@ -98,6 +90,8 @@ class LoraLinear : public Op {
                      int num_inputs) const override;
   // size_t get_params_hash() const override;
   LoraLinearParams get_params() const;
+
+  std::unordered_map<PEFTModelID, LoraLinearConfig> peft_configs;
 };
 
 }; // namespace FlexFlow
diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h
index e82243fd67..ff041334f1 100644
--- a/include/flexflow/ops/lora_linear_params.h
+++ b/include/flexflow/ops/lora_linear_params.h
@@ -12,7 +12,7 @@ namespace FlexFlow {
 
 class LoraLinearConfig {
 public:
-  static const LoraLinearConfig DefaultConfig;
+  static const LoraLinearConfig EmptyConfig;
   LoraLinearConfig();
   LoraLinearConfig(int rank,
                    OptimizerType type = OPTIMIZER_TYPE_SGD,
@@ -33,6 +33,7 @@ class LoraLinearConfig {
   std::string peft_model_id;
   int lora_alpha;
   float lora_dropout;
+  std::vector<std::string>
target_modules; // whether to load weights from file, instead of initializing them randomly bool load_weights_from_file; }; @@ -41,6 +42,7 @@ class LoraLinearParams { public: LayerID layer_guid; OperatorType type; + std::unordered_map peft_configs; char name[MAX_OPNAME]; bool is_valid(std::pair const diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 0e59888888..bf6e475cbb 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -65,7 +65,6 @@ struct Request { COMPLETED = 103, // finished and verified FINISHING = 104, // finishing request, but not yet verified }; - enum RequestType { REQ_INFERENCE = 201, REQ_FINETUNING = 202 }; BatchConfig::RequestGuid guid; PEFTModelID peft_model_id = PEFTModelID::NO_ID; int max_sequence_length = 128; @@ -81,10 +80,11 @@ struct Request { RequestType req_type = REQ_INFERENCE; int completed_training_steps = 0; int max_training_steps = 1; - std::vector> dataset_text; + std::string dataset_filepath; std::vector, std::vector>> dataset; + friend std::ostream &operator<<(std::ostream &os, Request const &req); }; // store the result of beam search diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index d376c3e39c..c3993b1ad4 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -40,7 +40,6 @@ void parse_input_args(char **argv, int argc, FilePaths &paths, std::string &llm_model_name, - std::string &peft_model_name, bool &use_full_precision, bool &verbose, bool &do_sample, @@ -58,13 +57,6 @@ void parse_input_args(char **argv, } continue; } - if (!strcmp(argv[i], "-peft-model")) { - peft_model_name = std::string(argv[++i]); - for (char &c : peft_model_name) { - c = std::tolower(c); - } - continue; - } // cache folder if (!strcmp(argv[i], "-cache-folder")) { paths.cache_folder_path = std::string(argv[++i]); @@ -133,7 +125,7 @@ void FlexFlow::top_level_task(Task const *task, assert(false && "Doesn't support quantization in non-offload mode"); } FilePaths file_paths; - std::string llm_model_name, peft_model_name; + std::string llm_model_name; bool use_full_precision = false; bool verbose = false; bool do_sample = false; @@ -150,7 +142,6 @@ void FlexFlow::top_level_task(Task const *task, argc, file_paths, llm_model_name, - peft_model_name, use_full_precision, verbose, do_sample, @@ -159,6 +150,7 @@ void FlexFlow::top_level_task(Task const *task, max_requests_per_batch, max_tokens_per_batch, max_sequence_length); + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * ffconfig.pipeline_parallelism_degree == ffconfig.numNodes * ffconfig.workersPerNode); @@ -259,19 +251,6 @@ void FlexFlow::top_level_task(Task const *task, assert(false && "unknow model type"); } - // Register PEFT layer - LoraLinearConfig mlp_second = - peft_model_name.empty() - ? LoraLinearConfig::DefaultConfig - : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); - PEFTModelID peft_model_id = - peft_model_name.empty() - ? 
PEFTModelID::NO_ID - : model.register_peft_model( - LoraLinearConfig::DefaultConfig /*mlp_first*/, - mlp_second /*mlp_second*/); - - // Start background server rm->start_background_server(&model); int total_num_requests = 0; @@ -288,20 +267,10 @@ void FlexFlow::top_level_task(Task const *task, for (auto &prompt : prompt_json) { std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); - // Add inference request - // Request inference_req; - // inference_req.prompt = text; - // inference_req.max_sequence_length = 128; - // inference_req.peft_model_id = peft_model_id; - // requests.push_back(inference_req); - // total_num_requests++; - // Add fine-tuning request - Request fine_tuning_req; - fine_tuning_req.req_type = Request::RequestType::REQ_FINETUNING; - fine_tuning_req.max_sequence_length = 128; - fine_tuning_req.peft_model_id = peft_model_id; - fine_tuning_req.dataset_text.push_back(std::make_pair(text, "")); - requests.push_back(fine_tuning_req); + Request inference_req; + inference_req.prompt = text; + inference_req.max_sequence_length = 128; + requests.push_back(inference_req); total_num_requests++; } std::vector result = model.generate(requests); diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index f86130ff2b..195d6ba7e3 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -76,7 +76,7 @@ void FALCON::create_falcon_model(FFModel &ff, falcon_config.layer_norm_epsilon, true, DT_NONE, - std::string("layers_" + std::to_string(i) + "_input_layernorm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); } else { ff.residual_layer_norm( @@ -91,7 +91,7 @@ void FALCON::create_falcon_model(FFModel &ff, true, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_input_layernorm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); token = res_ln_outputs[0]; att_norm = res_ln_outputs[1]; @@ -117,7 +117,7 @@ void FALCON::create_falcon_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attention") .c_str() /*name*/ ); break; @@ -142,7 +142,7 @@ void FALCON::create_falcon_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attention") .c_str() /*name*/ ); break; @@ -167,7 +167,7 @@ void FALCON::create_falcon_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attention") .c_str() /*name*/ ); break; @@ -188,7 +188,7 @@ void FALCON::create_falcon_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_dense_h_to_4h") + std::string("layers." + std::to_string(i) + ".mlp.dense_h_to_4h") .c_str()); dense_h_to_4h = ff.gelu(dense_h_to_4h); @@ -204,7 +204,7 @@ void FALCON::create_falcon_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_dense_4h_to_h") + std::string("layers." 
+ std::to_string(i) + ".mlp.dense_4h_to_h") .c_str()); } // final normalization and linear @@ -254,26 +254,6 @@ void FALCON::create_falcon_model(FFModel &ff, InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); - -#ifdef DEADCODE - // Compile the model - std::cout << "------start compile ----------" << std::endl; - InferenceManager *im = InferenceManager::get_inference_manager(); - im->compile_model_and_allocate_buffer(&ff); - FileDataLoader fileloader("", - weight_file_path, - falcon_config.n_head, - falcon_config.n_head_kv, - falcon_config.hidden_size, - falcon_config.hidden_size / falcon_config.n_head, - ff.config.tensor_parallelism_degree); - std::cout << "------load weights ----------" << std::endl; - fileloader.load_weights(&ff, use_full_precision); - std::cout << "------load weight finished----------" << std::endl; - - // init operators - im->init_operators_inference(&ff); -#endif } }; // namespace FlexFlow diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 0db7796567..4be232e81b 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -58,7 +58,7 @@ void LLAMA::create_llama_model(FFModel &ff, use_full_precision ? DT_FLOAT : DT_HALF, NULL, embed_init, - "tok_embeddings"); + "embed_tokens"); Tensor w2 = nullptr; @@ -75,7 +75,7 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.rms_norm_eps, llama_config.hidden_size, DT_NONE, - std::string("layers_" + std::to_string(i) + "_attention_norm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); } else { ff.residual_rms_norm( @@ -86,7 +86,7 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.hidden_size, false, // inplace_residual DT_NONE, - std::string("layers_" + std::to_string(i) + "_attention_norm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); token = token_att_norm[0]; att_norm = token_att_norm[1]; @@ -112,7 +112,7 @@ void LLAMA::create_llama_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -135,7 +135,7 @@ void LLAMA::create_llama_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -158,7 +158,7 @@ void LLAMA::create_llama_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -178,60 +178,54 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.hidden_size, false, // inplace_residual DT_NONE, - std::string("layers_" + std::to_string(i) + "_ffn_norm").c_str()); + std::string("layers." 
+ std::to_string(i) + ".post_attention_layernorm") + .c_str()); token = token_ff_norm[0]; Tensor ff_norm = token_ff_norm[1]; - Tensor w1 = - ff.dense(ff_norm, - llama_config.intermediate_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - std::string("layers_" + std::to_string(i) + "_feed_forward_w1") - .c_str()); + Tensor w1 = ff.dense( + ff_norm, + llama_config.intermediate_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." + std::to_string(i) + ".mlp.gate_proj").c_str()); - Tensor w3 = - ff.dense(ff_norm, - llama_config.intermediate_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - std::string("layers_" + std::to_string(i) + "_feed_forward_w3") - .c_str()); + Tensor w3 = ff.dense( + ff_norm, + llama_config.intermediate_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." + std::to_string(i) + ".mlp.up_proj").c_str()); Tensor multi = ff.sigmoid_silu_multi(w1, w3); - w2 = - ff.dense(multi, - llama_config.hidden_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - std::string("layers_" + std::to_string(i) + "_feed_forward_w2") - .c_str()); - // Low-Rank Adapter (LoRA) for the second linear layer - ff.lora_linear( + w2 = ff.dense( multi, - w2, - OP_LORA_MLP_SECOND, - std::string("layers_" + std::to_string(i) + "_feed_forward_w2_lora") - .c_str()); + llama_config.hidden_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." + std::to_string(i) + ".mlp.down_proj").c_str()); + // Low-Rank Adapter (LoRA) for the second linear layer + // ff.lora_linear(std::string("down_proj"), std::string("layers." + + // std::to_string(i) + ".mlp.down_proj.lora").c_str()); } // final normalization and linear Tensor final_rms_norm_output[2] = {nullptr, nullptr}; @@ -254,7 +248,7 @@ void LLAMA::create_llama_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - "output"); + "lm_head"); Tensor output; if (mode == BEAM_SEARCH_MODE) { @@ -288,16 +282,6 @@ void LLAMA::create_llama_model(FFModel &ff, InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); -#ifdef DEADCODE - // Compile the model - std::cout << "------start compile ----------" << std::endl; - im->compile_model_and_allocate_buffer(&ff); - fileloader.load_weights(&ff); - std::cout << "------load weight finished----------" << std::endl; - - // init operators - im->init_operators_inference(&ff); -#endif } }; // namespace FlexFlow diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index 95179691a1..e4a7e0056d 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -58,7 +58,7 @@ void MPT::create_mpt_model(FFModel &ff, use_full_precision ? DT_FLOAT : DT_HALF, NULL, embed_init, - "transformer_wte"); + "wte"); Tensor intermediate_output = nullptr, layernorm_output = nullptr; Tensor res_ln_outputs[2] = {nullptr, nullptr}; @@ -74,7 +74,7 @@ void MPT::create_mpt_model(FFModel &ff, 1e-05, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_norm_1").c_str()); + std::string("layers." 
+ std::to_string(i) + ".norm_1").c_str()); } else { ff.residual_layer_norm( intermediate_output, @@ -88,7 +88,7 @@ void MPT::create_mpt_model(FFModel &ff, false, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_norm_1").c_str()); + std::string("layers." + std::to_string(i) + ".norm_1").c_str()); hidden_states = res_ln_outputs[0]; layernorm_output = res_ln_outputs[1]; } @@ -114,7 +114,7 @@ void MPT::create_mpt_model(FFModel &ff, pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), /*qk_prod_scaling*/ false, /*position_bias*/ true, - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn") .c_str() /*name*/ ); break; @@ -138,7 +138,7 @@ void MPT::create_mpt_model(FFModel &ff, pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), /*qk_prod_scaling*/ false, /*position_bias*/ true, - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn") .c_str() /*name*/ ); break; @@ -162,7 +162,7 @@ void MPT::create_mpt_model(FFModel &ff, pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), /*qk_prod_scaling*/ false, /*position_bias*/ true, - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn") .c_str() /*name*/ ); break; @@ -184,7 +184,7 @@ void MPT::create_mpt_model(FFModel &ff, false, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_norm_2").c_str()); + std::string("layers." + std::to_string(i) + ".norm_2").c_str()); hidden_states = res_ln_outputs[0]; layernorm_output = res_ln_outputs[1]; @@ -200,7 +200,7 @@ void MPT::create_mpt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_ffn_up_proj").c_str()); + std::string("layers." + std::to_string(i) + ".ffn.up_proj").c_str()); layernorm_output = ff.gelu(layernorm_output); intermediate_output = ff.dense( layernorm_output, @@ -213,7 +213,7 @@ void MPT::create_mpt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_ffn_down_proj").c_str()); + std::string("layers." + std::to_string(i) + ".ffn.down_proj").c_str()); } // final @@ -228,7 +228,7 @@ void MPT::create_mpt_model(FFModel &ff, false, false, DT_NONE, - "transformer_norm_f"); + "norm_f"); Tensor all_final_norm = res_ln_outputs[1]; Tensor lm_head = ff.dense(all_final_norm, @@ -262,21 +262,6 @@ void MPT::create_mpt_model(FFModel &ff, InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); - -#ifdef DEADCODE - //------------------- compile the model -------------------------------- - InferenceManager *im = InferenceManager::get_inference_manager(); - im->compile_model_and_allocate_buffer(&ff); - FileDataLoader fileloader("", - weight_file_path, - mpt_config.n_heads, - mpt_config.n_heads, - mpt_config.hidden_size, - mpt_config.hidden_size / mpt_config.n_heads, - ff.config.tensor_parallelism_degree); - fileloader.load_weights(&ff, use_full_precision); - im->init_operators_inference(&ff); -#endif } }; // namespace FlexFlow diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 7d2abad829..b3f2ef4e17 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -96,7 +96,7 @@ void OPT::create_opt_model(FFModel &ff, true, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_attention_layer_norm") + std::string("layers." 
+ std::to_string(i) + ".self_attn_layer_norm") .c_str()); Tensor residual = res_ln_outputs[0]; Tensor hidden_states = res_ln_outputs[1]; @@ -122,7 +122,7 @@ void OPT::create_opt_model(FFModel &ff, -0.5), /*scaling factor*/ false, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -146,7 +146,7 @@ void OPT::create_opt_model(FFModel &ff, -0.5), /*scaling factor*/ false, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -170,7 +170,7 @@ void OPT::create_opt_model(FFModel &ff, -0.5), /*scaling factor*/ false, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -189,8 +189,8 @@ void OPT::create_opt_model(FFModel &ff, true, false, DT_NONE, - std::string("layers_" + std::to_string(i) + - "_add_bias_residual_layer_norm") + std::string("layers." + std::to_string(i) + + ".add_bias_residual_layer_norm") .c_str()); added = res_ln_outputs[0]; Tensor final_norm = res_ln_outputs[1]; @@ -207,7 +207,7 @@ void OPT::create_opt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_fc1").c_str()); + std::string("layers." + std::to_string(i) + ".fc1").c_str()); fc2 = ff.dense(fc1, opt_config.hidden_size, AC_MODE_NONE, @@ -218,13 +218,10 @@ void OPT::create_opt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_fc2").c_str()); + std::string("layers." + std::to_string(i) + ".fc2").c_str()); // Low-Rank Adapter (LoRA) for the second linear layer - ff.lora_linear( - fc1, - fc2, - OP_LORA_MLP_SECOND, - std::string("layers_" + std::to_string(i) + "_fc2_lora").c_str()); + // ff.lora_linear(std::string("fc2"), std::string("layers." + + // std::to_string(i) + ".fc2.lora").c_str()); } // final @@ -252,7 +249,7 @@ void OPT::create_opt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - "embed_tokens_weight_lm_head"); + "lm_head"); Tensor output; if (mode == BEAM_SEARCH_MODE) { @@ -276,24 +273,6 @@ void OPT::create_opt_model(FFModel &ff, use_full_precision); InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); - -#ifdef DEADCODE - //------------------- compile the model -------------------------------- - std::cout << "------start compile ----------" << std::endl; - InferenceManager *im = InferenceManager::get_inference_manager(); - im->compile_model_and_allocate_buffer(&ff); - FileDataLoader fileloader("", - weight_file_path, - opt_config.num_attention_heads, - opt_config.num_attention_heads, - opt_config.hidden_size, - opt_config.hidden_size / - opt_config.num_attention_heads, - ff.config.tensor_parallelism_degree); - fileloader.load_weights(&ff, use_full_precision); - std::cout << "------finished loading weights----------" << std::endl; - im->init_operators_inference(&ff); -#endif } }; // namespace FlexFlow diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index fb6269ad75..cd8bf3a9a7 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -66,7 +66,7 @@ void STARCODER::create_starcoder_model( use_full_precision ? 
DT_FLOAT : DT_HALF, NULL, embed_init, - "transformer_wte"); + "wte"); Tensor positional_embedding = ff.embedding(position_input, @@ -76,7 +76,7 @@ void STARCODER::create_starcoder_model( use_full_precision ? DT_FLOAT : DT_HALF, NULL, embed_init, - "transformer_wpe"); + "wpe"); Tensor residual = nullptr, c_proj = nullptr; Tensor res_ln_outputs[2] = {nullptr, nullptr}; @@ -98,7 +98,7 @@ void STARCODER::create_starcoder_model( true, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_ln_1").c_str()); + std::string("layers." + std::to_string(i) + ".ln_1").c_str()); Tensor hidden_states = res_ln_outputs[0]; Tensor ln_1 = res_ln_outputs[1]; @@ -125,7 +125,7 @@ void STARCODER::create_starcoder_model( 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn.c_attn") .c_str() /*name*/ ); break; @@ -147,7 +147,7 @@ void STARCODER::create_starcoder_model( true, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_ln_2").c_str()); + std::string("layers." + std::to_string(i) + ".ln_2").c_str()); residual = res_ln_outputs[0]; Tensor l2_norm = res_ln_outputs[1]; @@ -163,7 +163,7 @@ void STARCODER::create_starcoder_model( nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_c_fc").c_str()); + std::string("layers." + std::to_string(i) + ".mlp.c_fc").c_str()); c_fc = ff.gelu(c_fc); @@ -178,7 +178,7 @@ void STARCODER::create_starcoder_model( nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_c_proj").c_str()); + std::string("layers." + std::to_string(i) + ".mlp.c_proj").c_str()); } // final normalization and linear ff.residual_layer_norm(residual, @@ -192,7 +192,7 @@ void STARCODER::create_starcoder_model( true, false, DT_NONE, - "transformer_ln_f"); + "ln_f"); Tensor ln_f = res_ln_outputs[1]; Tensor lm_head = ff.dense(ln_f, @@ -235,16 +235,6 @@ void STARCODER::create_starcoder_model( ff.config.tensor_parallelism_degree, use_full_precision); im->register_model_weights_loader(&ff, fileloader); -#ifdef DEADCODE - // Compile the model - std::cout << "------start compile ----------" << std::endl; - im->compile_model_and_allocate_buffer(&ff); - fileloader.load_weights(&ff, use_full_precision); - std::cout << "------load weight finished----------" << std::endl; - - // init operators - im->init_operators_inference(&ff); -#endif } }; // namespace FlexFlow diff --git a/inference/peft/CMakeLists.txt b/inference/peft/CMakeLists.txt new file mode 100644 index 0000000000..4547907176 --- /dev/null +++ b/inference/peft/CMakeLists.txt @@ -0,0 +1,38 @@ +cmake_minimum_required(VERSION 3.10) + +project(FlexFlow_Peft) +set(project_target peft) + + +set(CPU_SRC + ${FLEXFLOW_CPP_DRV_SRC} + peft.cc + ../models/llama.cc + ../models/opt.cc + ../models/falcon.cc + ../models/starcoder.cc + ../models/mpt.cc) + +if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + cuda_add_executable(${project_target} ${CPU_SRC}) + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + target_compile_definitions(${project_target} PRIVATE __HIP_PLATFORM_NVIDIA__) + endif() +elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC} PROPERTIES LANGUAGE HIP) + hip_add_executable(${project_target} ${CPU_SRC}) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is empty!") + endif() + set_property(TARGET ${project_target} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + 
target_compile_definitions(${project_target} PRIVATE __HIP_PLATFORM_AMD__) +else() + message(FATAL_ERROR "Compilation of ${project_target} for ${FF_GPU_BACKEND} backend not yet supported") +endif() + +target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/inference) +target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) + +set(BIN_DEST "bin") +install(TARGETS ${project_target} DESTINATION ${BIN_DEST}) diff --git a/inference/peft/Makefile b/inference/peft/Makefile new file mode 100644 index 0000000000..0e4b79f51f --- /dev/null +++ b/inference/peft/Makefile @@ -0,0 +1,37 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Flags for directing the runtime makefile what to include +DEBUG ?= 0 # Include debugging symbols +MAX_DIM ?= 4 # Maximum number of dimensions +OUTPUT_LEVEL ?= LEVEL_DEBUG # Compile time logging level +USE_CUDA ?= 1 # Include CUDA support (requires CUDA) +USE_GASNET ?= 0 # Include GASNet support (requires GASNet) +USE_HDF ?= 1 # Include HDF5 support (requires HDF5) +ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended) + +# Put the binary file name here +OUTFILE ?= llama_pipeline +# List all the application source files here +ifndef CUDA_HOME +CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc | head -1)) +endif + + +ifndef FF_HOME +$(error FF_HOME variable is not defined, aborting build) +endif + +include $(FF_HOME)/FlexFlow.mk diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc new file mode 100644 index 0000000000..eade2eaeeb --- /dev/null +++ b/inference/peft/peft.cc @@ -0,0 +1,348 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/inference.h" +#include "flexflow/request_manager.h" +#include "models/falcon.h" +#include "models/llama.h" +#include "models/mpt.h" +#include "models/opt.h" +#include "models/starcoder.h" +#include + +#include + +using namespace FlexFlow; +using namespace Legion; +using json = nlohmann::json; + +LegionRuntime::Logger::Category log_app("llama"); + +struct FilePaths { + std::string cache_folder_path; + std::string prompt_file_path; + std::string output_file_path; +}; + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + std::string &llm_model_name, + std::string &peft_model_name, + bool &use_full_precision, + bool &verbose, + bool &do_sample, + bool &enable_peft, + float &temperature, + float &topp, + int &max_requests_per_batch, + int &max_tokens_per_batch, + int &max_sequence_length) { + for (int i = 1; i < argc; i++) { + // llm model type + if (!strcmp(argv[i], "-llm-model")) { + llm_model_name = std::string(argv[++i]); + for (char &c : llm_model_name) { + c = std::tolower(c); + } + continue; + } + if (!strcmp(argv[i], "-enable-peft")) { + enable_peft = true; + continue; + } + if (!strcmp(argv[i], "-peft-model")) { + peft_model_name = std::string(argv[++i]); + for (char &c : peft_model_name) { + c = std::tolower(c); + } + continue; + } + // cache folder + if (!strcmp(argv[i], "-cache-folder")) { + paths.cache_folder_path = std::string(argv[++i]); + continue; + } + // prompts + if (!strcmp(argv[i], "-prompt")) { + paths.prompt_file_path = std::string(argv[++i]); + continue; + } + // output file + if (!strcmp(argv[i], "-output-file")) { + paths.output_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--use-full-precision")) { + use_full_precision = true; + continue; + } + // verbose logging to stdout + if (!strcmp(argv[i], "--verbose")) { + verbose = true; + continue; + } + if (!strcmp(argv[i], "--do-sample")) { + do_sample = true; + continue; + } + if (!strcmp(argv[i], "--temperature")) { + temperature = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--topp")) { + topp = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-requests-per-batch")) { + max_requests_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-batch")) { + max_tokens_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-sequence-length")) { + max_sequence_length = std::stoi(argv[++i]); + continue; + } + } + if (paths.cache_folder_path.empty()) { + paths.cache_folder_path = "~/.cache/flexflow"; + } + // Expand ~ to the home directory if needed + wordexp_t p; + wordexp(paths.cache_folder_path.c_str(), &p, 0); + paths.cache_folder_path = p.we_wordv[0]; + wordfree(&p); +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + if (ffconfig.cpu_offload == false && ffconfig.quantization_type != DT_NONE) { + assert(false && "Doesn't support quantization in non-offload mode"); + } + FilePaths file_paths; + std::string llm_model_name, peft_model_name; + bool use_full_precision = false; + bool verbose = false; + bool do_sample = false; + bool enable_peft = false; + float temperature = 0.0f; + float topp = 0.0f; + int max_requests_per_batch = 8; + int max_tokens_per_batch = 128; + int max_sequence_length = 256; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, + argc, + 
file_paths, + llm_model_name, + peft_model_name, + use_full_precision, + verbose, + do_sample, + enable_peft, + temperature, + topp, + max_requests_per_batch, + max_tokens_per_batch, + max_sequence_length); + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * + ffconfig.pipeline_parallelism_degree == + ffconfig.numNodes * ffconfig.workersPerNode); + + std::string config_filepath = join_path( + {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"}); + std::string tokenizer_filepath = + join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name}); + std::string weights_filepath = + join_path({file_paths.cache_folder_path, + "weights", + llm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + std::ifstream config_file_handle(config_filepath); + if (!config_file_handle.good()) { + std::cout << "Model config file " << config_filepath << " not found." + << std::endl; + assert(false); + } + if (enable_peft && peft_model_name.empty()) { + std::cout << "PEFT enabled, but no PEFT model id passed" << std::endl; + assert(false); + } else if (!enable_peft && !peft_model_name.empty()) { + std::cout << "PEFT model id passed, but PEFT is not enabled" << std::endl; + assert(false); + } + + json model_config = json::parse(config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + ModelType model_type = ModelType::UNKNOWN; + auto architectures = model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") { + model_type = ModelType::FALCON; + break; + } else if (str == "GPTBigCodeForCausalLM") { + model_type = ModelType::STARCODER; + break; + } else if (str == "MPTForCausalLM") { + model_type = ModelType::MPT; + break; + } + } + int bos_token_id = model_config.find("bos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("bos_token_id"); + int eos_token_id = model_config.find("eos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("eos_token_id"); + + assert(model_type != ModelType::UNKNOWN && + "Invalid LLM model type passed (or no type was passed)."); + + // load PEFT config + LoraLinearConfig peft_config = + peft_model_name.empty() + ? 
LoraLinearConfig::EmptyConfig + : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); + + GenerationConfig generationConfig(do_sample, temperature, topp); + RequestManager *rm = RequestManager::get_request_manager(); + rm->set_max_requests_per_batch(max_requests_per_batch); + rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_sequence_length(max_sequence_length); + rm->register_tokenizer( + model_type, bos_token_id, eos_token_id, tokenizer_filepath); + rm->register_output_filepath(file_paths.output_file_path); + + FFModel model(ffconfig, ffconfig.cpu_offload); + if (model_type == ModelType::LLAMA) { + LLAMA::create_llama_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::OPT) { + OPT::create_opt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::FALCON) { + FALCON::create_falcon_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::STARCODER) { + STARCODER::create_starcoder_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::MPT) { + MPT::create_mpt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else { + assert(false && "unknow model type"); + } + + // Add PEFT layer + PEFTModelID *peft_model_id = nullptr; + if (!peft_model_name.empty()) { + peft_model_id = model.add_lora_layer(peft_config); + } + + // Start background server + rm->start_background_server(&model); + + int total_num_requests = 0; + { + std::vector requests; + + // Add inference requests + using json = nlohmann::json; + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + // for (auto &prompt : prompt_json) { + // std::string text = prompt.get(); + // printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); + // Request inference_req; + // inference_req.prompt = text; + // inference_req.max_sequence_length = 128; + // inference_req.peft_model_id = peft_model_id; + // requests.push_back(inference_req); + // total_num_requests++; + // } + + // Add fine-tuning request + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.max_sequence_length = 128; + fine_tuning_req.peft_model_id = + (peft_model_id != nullptr) ? 
*peft_model_id : PEFTModelID::NO_ID; + fine_tuning_req.dataset_filepath = file_paths.prompt_file_path; + fine_tuning_req.max_training_steps = 1; + requests.push_back(fine_tuning_req); + total_num_requests++; + + std::vector result = model.generate(requests); + } + + // terminate the request manager by stopping the background thread + rm->terminate_background_server(); + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + if (peft_model_id != nullptr) { + free(peft_model_id); + } + + // float* data + std::cout << "----------inference finished--------------" << std::endl; + + // free tokenizer space in memory +} + +void FlexFlow::register_custom_tasks() {} diff --git a/inference/python/ff_peft.py b/inference/python/ff_peft.py new file mode 100644 index 0000000000..38a25fb614 --- /dev/null +++ b/inference/python/ff_peft.py @@ -0,0 +1,148 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import flexflow.serve as ff +import argparse, json, os +from types import SimpleNamespace + + +def get_configs(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-config-file", + help="The path to a JSON file with the configs. If omitted, a sample model and configs will be used instead.", + type=str, + default="", + ) + args = parser.parse_args() + + # Load configs from JSON file (if specified) + if len(args.config_file) > 0: + if not os.path.isfile(args.config_file): + raise FileNotFoundError(f"Config file {args.config_file} not found.") + try: + with open(args.config_file) as f: + return json.load(f) + except json.JSONDecodeError as e: + print("JSON format error:") + print(e) + else: + # Define sample configs + ff_init_configs = { + # required parameters + "num_gpus": 1, + "memory_per_gpu": 8192, + "zero_copy_memory_per_node": 12000, + # optional parameters + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 1, + "offload": False, + "offload_reserve_space_size": 8 * 1024, # 8GB + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "enable_peft": True, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB + "profiling": False, + "inference_debugging": True, + "fusion": True, + } + model_configs = { + # required parameters + "base_model": "JackFram/llama-160m", + "peft_model_ids": [ + "goliaro/llama-160m-lora-full", + ], + # optional parameters + "cache_path": "", + "refresh_cache": False, + "full_precision": False, + "prompt": "", + "finetuning_dataset": os.path.join( + os.path.dirname(os.path.abspath(__file__)), "../prompt/peft.json" + ), + "output_file": "", + } + # Merge dictionaries + ff_init_configs.update(model_configs) + return ff_init_configs + + +def main(): + configs_dict = get_configs() + configs = SimpleNamespace(**configs_dict) + + # Initialize the FlexFlow runtime. 
ff.init() takes a dictionary or the path to a JSON file with the configs + ff.init(configs_dict) + + # Create the FlexFlow LLM + ff_data_type = ( + ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + ) + llm = ff.LLM( + configs.base_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + output_file=configs.output_file, + ) + for peft_model_id in configs.peft_model_ids: + llm.add_peft(peft_model_id) + + # Compile the LLM for inference and load the weights into memory + generation_config = ff.GenerationConfig( + do_sample=False, temperature=0.9, topp=0.8, topk=1 + ) + llm.compile( + generation_config, + max_requests_per_batch=1, + max_seq_length=256, + max_tokens_per_batch=64, + ) + + llm.start_server() + + requests = [] + # Serving + if len(configs.prompt) > 0: + prompts = [s for s in json.load(open(configs.prompt))] + inference_requests = [ + ff.Request( + ff.RequestType.REQ_INFERENCE, prompt=prompt, max_sequence_length=128 + ) + for prompt in prompts + ] + requests += inference_requests + # Finetuning + if len(configs.finetuning_dataset) > 0: + for peft_model_id in configs.peft_model_ids: + finetuning_request = ff.Request( + ff.RequestType.REQ_FINETUNING, + max_sequence_length=128, + peft_model_id=llm.get_ff_peft_id(peft_model_id), + dataset_filepath=configs.finetuning_dataset, + ) + requests.append(finetuning_request) + + llm.generate(requests) + + llm.stop_server() + + +if __name__ == "__main__": + print("flexflow PEFT example") + main() diff --git a/inference/utils/download_peft_model.py b/inference/utils/download_peft_model.py index 5c7704b6f0..ad79816f84 100644 --- a/inference/utils/download_peft_model.py +++ b/inference/utils/download_peft_model.py @@ -6,7 +6,10 @@ def parse_args(): parser = argparse.ArgumentParser() parser.add_argument( - "peft_model_ids", type=str, nargs="+", help="Name of the model(s) to download" + "--base_model_name", type=str, help="Name of the model to download" + ) + parser.add_argument( + "peft_model_ids", type=str, nargs="+", help="Name of the PEFT model(s) to download" ) parser.add_argument( "--cache-folder", @@ -42,16 +45,19 @@ def main(args): else: data_types = (ff.DataType.DT_FLOAT, ff.DataType.DT_HALF) - for peft_model_id in args.peft_model_ids: - for data_type in data_types: - peft = ff.PEFT( - peft_model_id, - data_type=data_type, - cache_path=args.cache_folder, - refresh_cache=args.refresh_cache, - ) - peft.download_hf_weights_if_needed() - peft.download_hf_config() + + for data_type in data_types: + llm = ff.LLM( + args.base_model_name, + data_type=data_type, + cache_path=args.cache_folder, + refresh_cache=args.refresh_cache, + ) + for peft_model_id in args.peft_model_ids: + llm.add_peft(peft_model_id) + llm.download_hf_weights_if_needed() + llm.download_hf_config() + llm.download_hf_tokenizer_if_needed() if __name__ == "__main__": diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index b92a0a92af..82c3eb059c 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -28,6 +28,7 @@ CompMode, MetricsType, InferenceMode, + RequestType, ModelType, OpType, ParameterSyncType, @@ -36,6 +37,7 @@ ) from flexflow.config import * from .flexflowlib import ffi, flexflow_library +from typing import Union, List def ffc(): @@ -1243,615 +1245,646 @@ def get_weights(self, ffmodel): # ----------------------------------------------------------------------- -# FFModel +# SGDOptimizer # 
----------------------------------------------------------------------- -class FFModel(object): - """ """ +class SGDOptimizer(object): + __slots__ = ["handle", "_handle"] - __slots__ = [ - "handle", - "_handle", - "_layers", - "_nb_layers", - "_ffconfig", - "_tracing_id", - "initializers", - "attr_tensors", - ] + def __init__( + self, ffmodel, lr=0.01, momentum=0.0, nesterov=False, weight_decay=0.0 + ): + self.handle = ffc().flexflow_sgd_optimizer_create( + ffmodel.handle, lr, momentum, nesterov, weight_decay + ) + self._handle = ffi.gc(self.handle, ffc().flexflow_sgd_optimizer_destroy) - def __init__(self, ffconfig): - """Constructor of FFModel. + def set_learning_rate(self, learning_rate): + ffc().flexflow_sgd_optimizer_set_lr(self.handle, learning_rate) - :param ffconfig: configurations of FlexFlow and the created model. - :type ffconfig: FFConfig - :returns: FFModel -- the model. - """ - self.handle = ffc().flexflow_model_create(ffconfig.handle, ffconfig.cpu_offload) - self._handle = ffi.gc(self.handle, ffc().flexflow_model_destroy) - self._layers = dict() - self._nb_layers = 0 - self._ffconfig = ffconfig - global ff_tracing_id - self._tracing_id = ff_tracing_id - ff_tracing_id += 1 - self.initializers = {} - self.attr_tensors = {} +# ----------------------------------------------------------------------- +# AdamOptimizer +# ----------------------------------------------------------------------- - def get_layers(self): - return self._layers - def add_layer(self, op_type, name): - layer_id = self._nb_layers - op_handle = ffc().flexflow_model_get_last_layer(self.handle) - self._layers[self._nb_layers] = convert_op_handle_to_op( - op_type, op_handle, idx=layer_id, name=name +class AdamOptimizer(object): + __slots__ = ["handle", "_handle"] + + def __init__( + self, + ffmodel, + alpha=0.001, + beta1=0.9, + beta2=0.999, + weight_decay=0.0, + epsilon=1e-8, + ): + self.handle = ffc().flexflow_adam_optimizer_create( + ffmodel.handle, alpha, beta1, beta2, weight_decay, epsilon ) - self._nb_layers += 1 + self._handle = ffi.gc(self.handle, ffc().flexflow_adam_optimizer_destroy) - def create_tensor(self, dims, data_type, create_grad=True): - """Instantiate a FlexFlow tensor. + def set_learning_rate(self, learning_rate): + ffc().flexflow_adam_optimizer_set_lr(self.handle, learning_rate) - :param x: a shape tuple/list (integers), including the batch size. - :type x: list of int - :param data_type: the datatype of the created tensor. Options are - DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64, DT_BOOLEAN. - :type data_type: DataType +# ----------------------------------------------------------------------- +# Initializer +# ----------------------------------------------------------------------- +class Initializer(object): + __slots__ = ["handle", "p_handle"] - :param create_grad: weather the tensor creates a gradients vector. - If you don't specify anything, a gradients vector is used. - :type create_grad: bool + def __init__(self, handle, p_handle=0): + self.p_handle = ffi.new("flexflow_initializer_t *") + if handle == None: + self.p_handle.impl = ffi.NULL + else: + self.p_handle.impl = handle.impl + self.handle = self.p_handle[0] + assert ffi.typeof(self.handle) == ffi.typeof( + "flexflow_initializer_t" + ), "Initializer handle is wrong" - :returns: Tensor -- the output tensor. 
- """ - c_dims = ffi.new("int[]", dims) - c_data_type = enum_to_int(DataType, data_type) - num_dims = len(dims) - handle = ffc().flexflow_tensor_create( - self.handle, num_dims, c_dims, c_data_type, create_grad - ) - return Tensor(handle) - def map_tensor(self, tensor, parallel_op=None): - op_handle = self.__get_op_handle(parallel_op) - ffc().flexflow_tensor_map(self.handle, tensor.handle, op_handle) +# ----------------------------------------------------------------------- +# GlorotUniform +# ----------------------------------------------------------------------- - def create_constant(self, dims, value, data_type): - c_dims = ffi.new("int[]", dims) - c_data_type = enum_to_int(DataType, data_type) - num_dims = len(dims) - handle = ffc().flexflow_constant_create( - self.handle, num_dims, c_dims, value, c_data_type - ) - return Tensor(handle) - def exp(self, x, name=None): - """Exponential activation function. +class GlorotUniformInitializer(Initializer): + __slots__ = ["glorot_handle", "_glorot_handle"] - :param x: the input Tensor. - :type x: Tensor + def __init__(self, seed): + self.glorot_handle = ffc().flexflow_glorot_uniform_initializer_create(seed) + self._glorot_handle = ffi.gc( + self.glorot_handle, ffc().flexflow_glorot_uniform_initializer_destroy + ) + super(GlorotUniformInitializer, self).__init__(self.glorot_handle) - :param name: the name of the layer. Default is None. - :type name: string - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_exp(self.handle, x.handle, c_name) - self.add_layer(OpType.EXP, name) - return Tensor(handle, owner_op_type=OpType.EXP) +# ----------------------------------------------------------------------- +# ZeroInitializer +# ----------------------------------------------------------------------- - def sin(self, x, name=None): - """Elementwise sine function. - :param x: the input Tensor. - :type x: Tensor +class ZeroInitializer(Initializer): + __slots__ = ["zero_handle", "_zero_handle"] - :param name: the name of the layer. Default is None. - :type name: string + def __init__(self): + self.zero_handle = ffc().flexflow_zero_initializer_create() + self._zero_handle = ffi.gc( + self.zero_handle, ffc().flexflow_zero_initializer_destroy + ) + super(ZeroInitializer, self).__init__(self.zero_handle) - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_sin(self.handle, x.handle, c_name) - self.add_layer(OpType.SIN, name) - return Tensor(handle, owner_op_type=OpType.SIN) - def cos(self, x, name=None): - """Elementwise cosine function. +# ----------------------------------------------------------------------- +# UniformInitializer +# ----------------------------------------------------------------------- - :param x: the input Tensor. - :type x: Tensor - :param name: the name of the layer. Default is None. - :type name: string +class UniformInitializer(Initializer): + __slots__ = ["uniform_handle", "_uniform_handle"] - :returns: Tensor -- the output tensor. 
- """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_cos(self.handle, x.handle, c_name) - self.add_layer(OpType.COS, name) - return Tensor(handle, owner_op_type=OpType.COS) + def __init__(self, seed, minv, maxv): + self.uniform_handle = ffc().flexflow_uniform_initializer_create( + seed, minv, maxv + ) + self._uniform_handle = ffi.gc( + self.uniform_handle, ffc().flexflow_uniform_initializer_destroy + ) + super(UniformInitializer, self).__init__(self.uniform_handle) - def add(self, x, y, inplace_a=False, name=None): - """Layer that adds two input Tensors, :attr:`output = x + y`. - :param x: the first input Tensor. - :type x: Tensor +# ----------------------------------------------------------------------- +# NormInitializer +# ----------------------------------------------------------------------- - :param y: the second input Tensor. - :type y: Tensor - :param name: the name of the layer. Default is None. - :type name: string +class NormInitializer(Initializer): + __slots__ = ["norm_handle", "_norm_handle"] - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_add( - self.handle, x.handle, y.handle, inplace_a, c_name + def __init__(self, seed, mean, stddev): + self.norm_handle = ffc().flexflow_norm_initializer_create(seed, mean, stddev) + self._norm_handle = ffi.gc( + self.norm_handle, ffc().flexflow_norm_initializer_destroy ) - self.add_layer(OpType.ADD, name) - return Tensor(handle, owner_op_type=OpType.ADD) - - def subtract(self, x, y, inplace_a=False, name=None): - """Layer that subtracts two input Tensors, :attr:`output = x * y`. + super(NormInitializer, self).__init__(self.norm_handle) - :param x: the first input Tensor. - :type x: Tensor - :param y: the second input Tensor. - :type y: Tensor +# ----------------------------------------------------------------------- +# PerfMetrics +# ----------------------------------------------------------------------- - :param name: the name of the layer. Default is None. - :type name: string - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_subtract( - self.handle, x.handle, y.handle, inplace_a, c_name - ) - self.add_layer(OpType.SUBTRACT, name) - return Tensor(handle, owner_op_type=OpType.SUBTRACT) +class PerfMetrics(object): + __slots__ = ["handle", "_handle"] - def multiply(self, x, y, inplace_a=False, name=None): - """Layer that multiplies (element-wise) two input Tensors, :attr:`output = x * y`. + def __init__(self, handle): + self.handle = handle + self._handle = ffi.gc(self.handle, ffc().flexflow_per_metrics_destroy) - :param x: the first input Tensor. - :type x: Tensor + def get_accuracy(self): + return ffc().flexflow_per_metrics_get_accuracy(self.handle) - :param y: the second input Tensor. - :type y: Tensor - :param name: the name of the layer. Default is None. - :type name: string +# ----------------------------------------------------------------------- +# NetConfig +# ----------------------------------------------------------------------- - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_multiply( - self.handle, x.handle, y.handle, inplace_a, c_name - ) - self.add_layer(OpType.MULTIPLY, name) - return Tensor(handle, owner_op_type=OpType.MULTIPLY) - def divide(self, x, y, inplace_a=False, name=None): - """Layer that divides (element-wise) two input Tensors, :attr:`output = x / y`. - - :param x: the first input Tensor. 
- :type x: Tensor - - :param y: the second input Tensor. - :type y: Tensor +class NetConfig(object): + def __init__(self): + self.handle = ffc().flexflow_net_config_create() + self._handle = ffi.gc(self.handle, ffc().flexflow_net_config_destroy) + cpath = ffc().flexflow_net_config_get_dataset_path(self.handle) + self.dataset_path = ffi.string(cpath) - :param name: the name of the layer. Default is None. - :type name: string - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_divide( - self.handle, x.handle, y.handle, inplace_a, c_name - ) - self.add_layer(OpType.DIVIDE, name) - return Tensor(handle, owner_op_type=OpType.DIVIDE) +# ----------------------------------------------------------------------- +# DLRMConfig +# ----------------------------------------------------------------------- - def max(self, x, y, inplace_a=False, name=None): - """Layer that computes the max (element-wise) two input Tensors, :attr:`output = max(x,y)`. - :param x: the first input Tensor. - :type x: Tensor +class DLRMConfig(object): + def __init__(self): + self.handle = ffc().flexflow_dlrm_config_create() + self._handle = ffi.gc(self.handle, ffc().flexflow_dlrm_config_destroy) - :param y: the second input Tensor. - :type y: Tensor + cstr = ffc().flexflow_dlrm_config_get_dataset_path(self.handle) + self.dataset_path = ffi.string(cstr) - :param name: the name of the layer. Default is None. - :type name: string + cstr = ffc().flexflow_dlrm_config_get_arch_interaction_op(self.handle) + self.arch_interaction_op = ffi.string(cstr) - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_max( - self.handle, x.handle, y.handle, inplace_a, c_name + self.sparse_feature_size = ffc().flexflow_dlrm_config_get_sparse_feature_size( + self.handle ) - self.add_layer(OpType.MAX, name) - return Tensor(handle, owner_op_type=OpType.MAX) + self.sigmoid_bot = ffc().flexflow_dlrm_config_get_sigmoid_bot(self.handle) + self.sigmoid_top = ffc().flexflow_dlrm_config_get_sigmoid_top(self.handle) + self.embedding_bag_size = ffc().flexflow_dlrm_config_get_embedding_bag_size( + self.handle + ) + self.loss_threshold = ffc().flexflow_dlrm_config_get_loss_threshold(self.handle) - def min(self, x, y, inplace_a=False, name=None): - """Layer that computes the min (element-wise) two input Tensors, :attr:`output = min(x,y)`. + mlp_bot_c = ffc().flexflow_dlrm_config_get_mlp_bot(self.handle) + self.mlp_bot = [] + for i in range(0, mlp_bot_c[0]): + self.mlp_bot.append(mlp_bot_c[i + 1]) - :param x: the first input Tensor. - :type x: Tensor + mlp_top_c = ffc().flexflow_dlrm_config_get_mlp_top(self.handle) + self.mlp_top = [] + for i in range(0, mlp_top_c[0]): + self.mlp_top.append(mlp_top_c[i + 1]) - :param y: the second input Tensor. - :type y: Tensor + embedding_size_c = ffc().flexflow_dlrm_config_get_embedding_size(self.handle) + self.embedding_size = [] + for i in range(0, embedding_size_c[0]): + self.embedding_size.append(embedding_size_c[i + 1]) - :param name: the name of the layer. Default is None. - :type name: string - :returns: Tensor -- the output tensor. 
- """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_min( - self.handle, x.handle, y.handle, inplace_a, c_name - ) - self.add_layer(OpType.MIN, name) - return Tensor(handle, owner_op_type=OpType.MIN) +# ----------------------------------------------------------------------- +# Single DataLoader +# ----------------------------------------------------------------------- - def reduce_sum(self, input, axes, keepdims=False, name=None): - """Layer that computes the sum of the input Tensor along given axes. - :param input: the input Tensor. - :type input: Tensor +class SingleDataLoader(object): + __slots__ = ["handle", "_handle"] - :param axes: the axes along which reduction is applied - :type axes: List[int] + def __init__(self, ffmodel, input, full_input, num_samples, data_type): + assert type(ffmodel) is FFModel, "SingleDataLoader ffmodel is wrong" + assert type(input) is Tensor, "SingleDataLoader input is wrong" + if type(full_input) is Tensor: + self.init_from_tensor(ffmodel, input, full_input, num_samples, data_type) + else: + self.init_from_ptr(ffmodel, input, full_input, num_samples, data_type) + self._handle = ffi.gc(self.handle, ffc().flexflow_single_dataloader_destroy) - :param name: the name of the layer. Default is None. - :type name: string + def init_from_tensor(self, ffmodel, input, full_input, num_samples, data_type): + assert type(full_input) is Tensor, "SingleDataLoader full_input is wrong" + c_data_type = enum_to_int(DataType, data_type) + self.handle = ffc().flexflow_single_dataloader_create( + ffmodel.handle, input.handle, full_input.handle, num_samples, c_data_type + ) - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - c_axes = ffi.new("int[]", axes) - handle = ffc().flexflow_model_add_reduce_sum( - self.handle, input.handle, c_axes, len(axes), keepdims, c_name + def init_from_ptr(self, ffmodel, input, full_input, num_samples, data_type): + # assert type(full_input) is Tensor, "SingleDataLoader full_input is wrong" + c_data_type = enum_to_int(DataType, data_type) + self.handle = ffc().flexflow_single_dataloader_create2( + ffmodel.handle, input.handle, full_input, num_samples, c_data_type ) - self.add_layer(OpType.REDUCE_SUM, name) - return Tensor(handle, owner_op_type=OpType.REDUCE_SUM) - def rsqrt(self, input, name=None): - """Layer that computes the element-wise reciprocal square-root. + @property + def num_samples(self): + return ffc().flexflow_single_dataloader_get_num_samples(self.handle) - :param input: the input Tensor. - :type input: Tensor + @num_samples.setter + def num_samples(self, samples): + ffc().flexflow_single_dataloader_set_num_samples(self.handle, samples) - :param name: the name of the layer. Default is None. - :type name: string + def next_batch(self, ffmodel): + """Ask the dataloder to load the next batch to the :attr:`batch_tensor`. - :returns: Tensor -- the output tensor. + :returns: None -- no returns. """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_rsqrt(self.handle, input.handle, c_name) - self.add_layer(OpType.RSQRT, name) - return Tensor(handle, owner_op_type=OpType.RSQRT) + ffc().flowflow_single_dataloader_next_batch(self.handle, ffmodel.handle) - def pow(self, input, exponent, name=None): - """Layer that computes the element-wise power. + def reset(self): + """Reset the current position of the dataloder to 0. - :param input: the input Tensor. - :type input: Tensor + :returns: None -- no returns. 
+ """ + ffc().flexflow_single_dataloader_reset(self.handle) - :param exponent: exponent to raise each element in the input tensor. - :type exponent: float - :param name: the name of the layer. Default is None. - :type name: string +class RegionNdarray(object): + __slots__ = ["__array_interface__"] - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_pow( - self.handle, input.handle, exponent, c_name - ) - self.add_layer(OpType.POW, name) - return Tensor(handle, owner_op_type=OpType.POW) + def __init__(self, shape, data_type, base_ptr, strides, read_only): + # See: https://docs.scipy.org/doc/numpy/reference/arrays.interface.html + if data_type == DataType.DT_HALF: + field_type = " PEFTModelID: + if peft_model_id not in self.pefts: + raise ValueError( + f"PEFT {peft_model_id} not registered with LLM {self.model_name}" + ) + peft_dict = self.pefts[peft_model_id] + if "ff_peft_model_id" not in peft_dict: + raise RuntimeError( + f"Attempting to run PEFT {peft_model_id} before compiling LLM {self.model_name}" + ) + return peft_dict["ff_peft_model_id"] + def download_hf_config(self): """Save the HuggingFace model configs to a json file. Useful mainly to run the C++ inference code.""" - self.config_dir = os.path.join( + config_dir = os.path.join( os.path.expanduser(self.cache_path), "configs", self.model_name.lower() ) - self.config_path = os.path.join(self.config_dir, "config.json") - os.makedirs(self.config_dir, exist_ok=True) - print(f"Creating directory {self.config_dir} (if it doesn't exist)...") - print(f"Saving {self.model_name} configs to file {self.config_path}...") - self.hf_config.to_json_file(self.config_path) + config_path = os.path.join(config_dir, "config.json") + os.makedirs(config_dir, exist_ok=True) + print(f"Creating directory {config_dir} (if it doesn't exist)...") + print(f"Saving {self.model_name} configs to file {config_path}...") + self.hf_config.to_json_file(config_path) + + # Save PEFT configs if the LLM has any registered PEFTs + for peft_model_id, peft_dict in self.pefts.items(): + peft_config = peft_dict["peft_config"] + peft_config_dir = os.path.join( + os.path.expanduser(self.cache_path), + "configs", + peft_model_id.lower(), + ) + os.makedirs(peft_config_dir, exist_ok=True) + peft_config_path = os.path.join(peft_config_dir, "config.json") + print(f"Saving {peft_model_id} configs to file {peft_config_path}...") + with open(peft_config_path, "w") as json_file: + + class SetEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, set): + return list(obj) + return super().default(obj) - def __get_revision_hashes(self, model_name: str, weights: bool): + json.dump(peft_config.to_dict(), json_file, indent=2, cls=SetEncoder) + + def __get_revision_hashes(self, model_name: str, folder: str): ff_revision = None - ff_revision_file = ( - os.path.join(self.weights_path, "rev_sha.txt") - if weights - else os.path.join(self.tokenizer_path, "rev_sha.txt") - ) + ff_revision_file = os.path.join(folder, "rev_sha.txt") + if os.path.exists(ff_revision_file): ff_revision = "".join(open(ff_revision_file).read().split()) @@ -180,64 +196,107 @@ def __get_revision_hashes(self, model_name: str, weights: bool): def download_hf_weights_if_needed(self): """Check in the folder specified by the cache_path whether the LLM's model weights are available and up to date. If not, or if the refresh_cache parameter is set to True, download new weights. + + If any PEFT adapter is registered, perform the same operation for PEFT. 
""" - # Use local cache, or download new version - self.weights_path = os.path.join( - os.path.expanduser(self.cache_path), - "weights", - self.model_name.lower(), - "full-precision" - if self.data_type == DataType.DT_FLOAT - else "half-precision", - ) - if self.refresh_cache: - print( - f"Refreshing weights in cache for model {self.model_name} at path {self.weights_path} ..." - ) - if os.path.exists(self.weights_path): - shutil.rmtree(self.weights_path) - os.makedirs(self.weights_path, exist_ok=True) - print(f"Creating directory {self.weights_path} (if it doesn't exist)...") - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - self.model_name, weights=True - ) + def get_weights_path(model_name): + return os.path.join( + os.path.expanduser(self.cache_path), + "weights", + model_name.lower(), + ( + "full-precision" + if self.data_type == DataType.DT_FLOAT + else "half-precision" + ), + ) - # Download if needed - if ff_revision != latest_revision: - if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): - # Local model + def refresh_cache_if_needed(model_name): + weights_path = get_weights_path(model_name) + if self.refresh_cache: print( - f"'{self.model_name}' model weights not found in cache or outdated. Downloading from huggingface.co ..." + f"Refreshing weights in cache for model {model_name} at path {weights_path} ..." ) - else: - # Remote model - print( - f"'{self.model_name}' local model weights were updated! Converting new weights now..." - ) - # Download model from HuggingFace, or load it from the local folder - hf_model = AutoModelForCausalLM.from_pretrained( - self.model_name, + if os.path.exists(weights_path): + shutil.rmtree(weights_path) + os.makedirs(weights_path, exist_ok=True) + + def get_hf_llm(model_name): + return AutoModelForCausalLM.from_pretrained( + model_name, trust_remote_code=True, - torch_dtype=torch.float32 - if self.data_type == DataType.DT_FLOAT - else torch.float16, + torch_dtype=( + torch.float32 + if self.data_type == DataType.DT_FLOAT + else torch.float16 + ), ) - # Print log message to notify user download of model has finished - if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): - print("Done downloading HF weights. Converting them now...") - # Convert the model to FlexFlow format - self.model_class.convert_hf_model(hf_model, self.weights_path) - # Save new revision hash to file - with open(ff_revision_file, "w+") as f: - f.write(latest_revision) - print("Done converting the weights...") - # Deallocate hf model - del hf_model - gc.collect() - torch.cuda.empty_cache() - else: - print(f"Loading '{self.model_name}' model weights from the cache...") + + def download_llm_weights(): + refresh_cache_if_needed(self.model_name) + ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( + self.model_name, self.weights_path + ) + if ff_revision != latest_revision: + print( + f"'{self.model_name}' local model weights need updating! Downloading/converting new weights now..." 
+ ) + hf_model = get_hf_llm(self.model_name) + # Convert the model to FlexFlow format + self.model_class.convert_hf_model(hf_model, self.weights_path) + # Save new revision hash to file + with open(ff_revision_file, "w+") as f: + f.write(latest_revision) + print(f"Done converting the weights for model {self.model_name}") + # Deallocate hf model + del hf_model + gc.collect() + torch.cuda.empty_cache() + + def convert_peft_model(hf_peft_model, peft_type, weights_path): + for name, params in hf_peft_model.named_parameters(): + if peft_type.lower() in name: + name = name.replace("base_model.model.model.", "").replace( + ".default", "" + ) + name = self.model_class.convert_hf_weight_name(name) + params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") + + def download_peft_weights(): + for peft_model_id, peft_dict in self.pefts.items(): + peft_config = peft_dict["peft_config"] + peft_type = peft_dict["peft_type"] + + weights_path = get_weights_path(peft_model_id) + refresh_cache_if_needed(peft_model_id) + ff_revision, ff_revision_file, latest_revision = ( + self.__get_revision_hashes(peft_model_id, weights_path) + ) + + if ff_revision != latest_revision: + print( + f"'{peft_model_id}' local model weights need updating! Downloading/converting new weights now..." + ) + hf_model = get_hf_llm(peft_model_id) + hf_peft_model = PeftModel.from_pretrained( + hf_model, peft_model_id, config=peft_config + ) + # Convert the model to FlexFlow format + convert_peft_model(hf_peft_model, peft_type, weights_path) + # Save new revision hash to file + with open(ff_revision_file, "w+") as f: + f.write(latest_revision) + print(f"Done converting the weights for model {peft_model_id}") + # Deallocate hf model + del hf_peft_model + del hf_model + gc.collect() + torch.cuda.empty_cache() + + self.weights_path = get_weights_path(self.model_name) + download_llm_weights() + download_peft_weights() def download_hf_tokenizer_if_needed(self): """Check in the folder specified by the cache_path whether the LLM's tokenizer files are available and up to date. @@ -253,7 +312,7 @@ def download_hf_tokenizer_if_needed(self): ) if self.refresh_cache: print( - f"Discarding cached tokenizer files (if they exist) for model {self.model_name}..." + f"Refreshing cached tokenizer for model {self.model_name} at path {self.tokenizer_path} ..." ) if os.path.exists(self.tokenizer_path): shutil.rmtree(self.tokenizer_path) @@ -263,20 +322,13 @@ def download_hf_tokenizer_if_needed(self): # Get local revision SHA, check if it matches latest one on huggingface ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - self.model_name, weights=False + self.model_name, self.tokenizer_path ) if ff_revision != latest_revision: - if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): - # Local model - print( - f"'{self.model_name}' tokenizer not found in cache or outdated. Downloading from huggingface.co ..." - ) - else: - # Remote model - print( - f"'{self.model_name}' local tokenizer was updated! Saving new tokenizer now..." - ) + print( + f"'{self.model_name}' tokenizer needs updating! Downloading tokenizer now..." 
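The renaming that convert_peft_model() applies to Hugging Face PEFT parameter names can be shown in isolation. convert_hf_weight_name() is model-specific, so it is left out of this sketch:

    # Hugging Face PEFT parameter name -> file name written into the weights folder.
    hf_name = "base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight"
    ff_name = hf_name.replace("base_model.model.model.", "").replace(".default", "")
    # convert_hf_weight_name(ff_name) is applied next by the model class (omitted here).
    print(ff_name)  # layers.0.self_attn.q_proj.lora_A.weight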
+ ) # Download tokenizer from HuggingFace, or load it from the local folder if self.model_type == ModelType.LLAMA: hf_tokenizer = LlamaTokenizer.from_pretrained( @@ -284,19 +336,13 @@ def download_hf_tokenizer_if_needed(self): ) else: hf_tokenizer = AutoTokenizer.from_pretrained(self.model_name) - # Print log message to notify user download of tokenizer has finished - if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): - print("Done downloading tokenizer. Saving it now...") # Save tokenizer hf_tokenizer.save_pretrained(self.tokenizer_path) - print("Done saving HF tokenizer.") + print("Done updating HF tokenizer.") # Save new revision hash to file with open(ff_revision_file, "w+") as f: f.write(latest_revision) - else: - print(f"Loading '{self.model_name}' tokenizer from the cache...") - def compile( self, generation_config: GenerationConfig = GenerationConfig(), @@ -374,6 +420,15 @@ def compile( max_tokens_per_batch, ) + # Add PEFT layer if registered + for peft_model_id, peft_dict in self.pefts.items(): + # ff_peft_config = peft_dict["ff_peft_config"] + ff_peft_config = LoraLinearConfig( + os.path.expanduser(self.cache_path), peft_model_id + ) + ff_peft_model_id = self.model.ffmodel.add_lora_layer(ff_peft_config) + peft_dict["ff_peft_model_id"] = ff_peft_model_id + # Download the weights from huggingface (if needed) self.download_hf_weights_if_needed() @@ -420,22 +475,36 @@ def compile( atexit.register(self.rm.stop_server) - def generate(self, prompts: Union[str, List[str]], max_length: int = 128): + def generate( + self, + requests_or_prompts: Union[str, List[str], Request, List[Request]], + max_length: int = 128, + ): """Generate tokens based on the input prompt(s) - :param prompts: The generation prompt(s) in the form of a string, or list of strings - :type prompts: Union[str, List[str]] + :param requests_or_prompts: The generation prompt(s) in the form of a string, a list of strings, a Request, or list of Requests + :type requests_or_prompts: Union[str, List[str], Request, List[Request]] :return: the generation results :rtype: GenerationResult """ - if type(prompts) == str: - if len(prompts) == 0: + if type(requests_or_prompts) == str: + if len(requests_or_prompts) == 0: return None - return self.model.ffmodel.generate([prompts], max_length) - elif type(prompts) == list: - if len(prompts) == 0: + return self.model.ffmodel.generate_inf_only( + [requests_or_prompts], max_length + ) + elif type(requests_or_prompts) == Request: + return self.model.ffmodel.generate(requests_or_prompts) + elif type(requests_or_prompts) == list: + if len(requests_or_prompts) == 0: return [] - return self.model.ffmodel.generate(prompts, max_length) + if type(requests_or_prompts[0]) == str: + return self.model.ffmodel.generate_inf_only( + requests_or_prompts, max_length + ) + else: + print(requests_or_prompts) + return self.model.ffmodel.generate(requests_or_prompts) else: assert False, "Please pass a non-empty string or list of strings" @@ -447,17 +516,6 @@ def stop_server(self): self.rm.stop_server() print("Background server stopped.") - def __enter__(self): - # Start the server when entering the context - # self.rm.start_server(self.model.ffmodel) - return self - - def __exit__(self, exc_type, exc_value, traceback): - # Stop the server when exiting the context - # self.rm.stop_server() - if exc_type: - print(f"Exception occurred: {exc_value}") - class SSM(LLM): """This class creates a SSM (Small-Speculative Model) object based on a model from HuggingFace""" @@ -533,152 +591,3 @@ def 
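With the new dispatch in generate(), all of the following call forms are accepted; the prompts are placeholders and `llm` is a compiled, running LLM as in the example script:

    llm.generate("What is the capital of France?")                  # single prompt string
    llm.generate(["first prompt", "second prompt"], max_length=64)  # list of prompt strings
    llm.generate(
        ff.Request(
            ff.RequestType.REQ_INFERENCE,
            prompt="What is machine learning?",
            max_sequence_length=128,
        )
    )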
compile( model_specific_pipeline_parallelism_degree, ssms, ) - - -class PEFT: - """This class creates a PEFT (parameter-efficient transformer) object to be used in concert with a LLM or SSM""" - - def __init__( - self, - peft_model_id: str, - data_type: DataType = DataType.DT_HALF, - cache_path: str = "", - refresh_cache: bool = False, - ): - self.hf_config = PeftConfig.from_pretrained(peft_model_id) - self.peft_model_id = peft_model_id - self.peft_type = self.hf_config.peft_type - if self.peft_type != "LORA": - raise RuntimeError( - f"PEFT type {self.peft_type} not yet supported in FlexFlow" - ) - self.data_type = data_type - assert self.data_type == DataType.DT_HALF or self.data_type == DataType.DT_FLOAT - self.cache_path = cache_path if len(cache_path) > 0 else "~/.cache/flexflow" - self.refresh_cache = refresh_cache - # Base model related - if "base_model_name_or_path" not in self.hf_config.to_dict(): - raise ValueError( - f"PEFT model {peft_model_id} does not have an associated based model" - ) - self.base_model = LLM( - self.hf_config.base_model_name_or_path, data_type, cache_path, refresh_cache - ) - - def download_hf_config(self): - """Save the HuggingFace model configs to a json file. Useful mainly to run the C++ inference code.""" - self.config_dir = os.path.join( - os.path.expanduser(self.cache_path), "configs", self.peft_model_id.lower() - ) - self.config_path = os.path.join(self.config_dir, "config.json") - os.makedirs(self.config_dir, exist_ok=True) - print(f"Creating directory {self.config_dir} (if it doesn't exist)...") - print(f"Saving {self.peft_model_id} configs to file {self.config_path}...") - with open(self.config_path, "w") as json_file: - class SetEncoder(json.JSONEncoder): - def default(self, obj): - if isinstance(obj, set): - return list(obj) - return super().default(obj) - json.dump(self.hf_config.to_dict(), json_file, indent=2, cls=SetEncoder) - - def __get_revision_hashes(self, peft_model_id: str): - ff_revision = None - ff_revision_file = os.path.join(self.weights_path, "rev_sha.txt") - if os.path.exists(ff_revision_file): - ff_revision = "".join(open(ff_revision_file).read().split()) - - if os.path.exists(peft_model_id) and os.path.isdir(peft_model_id): - # Local model - files = os.listdir(peft_model_id) - state = files + [ - os.path.getmtime(os.path.join(peft_model_id, f)) for f in files - ] - latest_revision = hashlib.md5(str(state).encode("utf-8")).hexdigest() - else: - # Remote HuggingFace model - hf_api = HfApi() - latest_revision = hf_api.model_info(self.peft_model_id).sha - return ff_revision, ff_revision_file, latest_revision - - def convert_peft_model(self, hf_peft_model, weights_path): - for name, params in hf_peft_model.named_parameters(): - if self.peft_type.lower() in name: - name = name.replace("base_model.model.model.", "").replace( - ".default", "" - ) - name = self.base_model.model_class.convert_hf_weight_name(name) - params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") - - def download_hf_weights_if_needed(self): - """Check in the folder specified by the cache_path whether the PEFT's model weights are available and up to date. - If not, or if the refresh_cache parameter is set to True, download new weights. 
- """ - # Use local cache, or download new version - self.weights_path = os.path.join( - os.path.expanduser(self.cache_path), - "weights", - self.peft_model_id.lower(), - "full-precision" - if self.data_type == DataType.DT_FLOAT - else "half-precision", - ) - if self.refresh_cache: - print( - f"Refreshing weights in cache for model {self.peft_model_id} at path {self.weights_path} ..." - ) - if os.path.exists(self.weights_path): - shutil.rmtree(self.weights_path) - os.makedirs(self.weights_path, exist_ok=True) - print(f"Creating directory {self.weights_path} (if it doesn't exist)...") - - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - self.peft_model_id - ) - - # Download if needed - if ff_revision != latest_revision: - if not os.path.exists(self.peft_model_id) or os.path.isdir( - self.peft_model_id - ): - # Local model - print( - f"'{self.peft_model_id}' model weights not found in cache or outdated. Downloading from huggingface.co ..." - ) - else: - # Remote model - print( - f"'{self.peft_model_id}' local model weights were updated! Converting new weights now..." - ) - # Download base model from HuggingFace, or load it from the local folder - self.base_model.download_hf_weights_if_needed() - self.base_model.download_hf_tokenizer_if_needed() - self.base_model.download_hf_config() - hf_base_model = AutoModelForCausalLM.from_pretrained( - self.hf_config.base_model_name_or_path, - return_dict=True, - trust_remote_code=True, - torch_dtype=torch.float32 - if self.data_type == DataType.DT_FLOAT - else torch.float16, - # device_map="auto", - ) - hf_peft_model = PeftModel.from_pretrained(hf_base_model, self.peft_model_id) - # Print log message to notify user download of model has finished - if not os.path.exists(self.peft_model_id) or os.path.isdir( - self.peft_model_id - ): - print("Done downloading HF weights. 
Converting them now...") - # Convert the model to FlexFlow format - self.convert_peft_model(hf_peft_model, self.weights_path) - # Save new revision hash to file - with open(ff_revision_file, "w+") as f: - f.write(latest_revision) - print("Done converting the weights...") - # Deallocate hf model - del hf_peft_model - del hf_base_model - gc.collect() - torch.cuda.empty_cache() - else: - print(f"Loading '{self.peft_model_id}' model weights from the cache...") diff --git a/python/flexflow/type.py b/python/flexflow/type.py index 994a85f57e..ac6975b4fd 100644 --- a/python/flexflow/type.py +++ b/python/flexflow/type.py @@ -152,6 +152,9 @@ class OpType(Enum): RESIDUAL_RMS_NORM = 2305 RESIDUAL_LAYERNORM = 2306 +class RequestType(Enum): + REQ_INFERENCE = 4001 + REQ_FINETUNING = 4002 def enum_to_int(enum, enum_item): for item in enum: diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 58acf3d010..cb8433c2c6 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -67,6 +67,8 @@ class FFCObjectWrapper { FF_NEW_OPAQUE_WRAPPER(flexflow_request_manager_t, RequestManager *); FF_NEW_OPAQUE_WRAPPER(flexflow_file_data_loader_t, FileDataLoader *); FF_NEW_OPAQUE_WRAPPER(flexflow_generation_result_t, GenerationResult *); + FF_NEW_OPAQUE_WRAPPER(flexflow_lora_linear_config_t, LoraLinearConfig *); + FF_NEW_OPAQUE_WRAPPER(flexflow_peft_model_id_t, PEFTModelID *); }; Logger ffc_log("flexflow_c"); @@ -1542,6 +1544,21 @@ flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, return FFCObjectWrapper::wrap(tensor); } +flexflow_peft_model_id_t flexflow_model_add_lora_layer( + flexflow_model_t handle_, + const flexflow_lora_linear_config_t peft_config_) { + FFModel *handle = FFCObjectWrapper::unwrap(handle_); + LoraLinearConfig const *peft_config = FFCObjectWrapper::unwrap(peft_config_); + PEFTModelID *peft_model_id = handle->add_lora_layer(*peft_config); + + DEBUG_PRINT("[Add Lora Layer] model handle: %p, peft_config handle %p, " + "peft_model_id: %p", + handle, + peft_config, + peft_model_id); + return FFCObjectWrapper::wrap(peft_model_id); +} + void flexflow_model_set_sgd_optimizer(flexflow_model_t handle_, flexflow_sgd_optimizer_t optimizer_) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); @@ -1597,43 +1614,74 @@ void flexflow_model_set_transformer_layer_id(flexflow_model_t handle_, int id) { void flexflow_model_generate(flexflow_model_t handle_, int num_requests, + enum RequestType *request_types, char const **input_texts, - int max_num_chars, char **output_texts, - int max_seq_length, + int *max_seq_lengths, + flexflow_peft_model_id_t *peft_model_ids, + char const **dataset_filepaths, + int *training_steps, int **output_length_and_tokens) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); std::vector requests; + + int finetuning_req_idx = 0; for (int i = 0; i < num_requests; i++) { - std::string const text_str(input_texts[i]); - Request inference_req; - inference_req.prompt = text_str; - inference_req.max_sequence_length = max_seq_length; - requests.push_back(inference_req); - DEBUG_PRINT("[Model] generate[%d] %p %s %i", - i, - handle, - text_str.c_str(), - max_seq_length); + if (request_types[i] == RequestType::REQ_INFERENCE) { + std::string const text_str(input_texts[i]); + Request inference_req; + inference_req.prompt = text_str; + inference_req.max_sequence_length = max_seq_lengths[i]; + PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_ids[i]); + if (peft_model_id != nullptr) { + inference_req.peft_model_id = *peft_model_id; + } + 
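The new RequestType enum mirrors the C-side request types and is converted with the existing enum_to_int helper when crossing the C API; a minimal check:

    from flexflow.type import RequestType, enum_to_int

    assert enum_to_int(RequestType, RequestType.REQ_INFERENCE) == 4001
    assert enum_to_int(RequestType, RequestType.REQ_FINETUNING) == 4002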
requests.push_back(inference_req); + DEBUG_PRINT("[Model] generate[%d] %p %s %i", + i, + handle, + text_str.c_str(), + max_seq_lengths[i]); + } else { + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.max_sequence_length = max_seq_lengths[i]; + PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_ids[i]); + if (peft_model_id != nullptr) { + fine_tuning_req.peft_model_id = *peft_model_id; + } + std::string const dataset_fp(dataset_filepaths[finetuning_req_idx]); + fine_tuning_req.dataset_filepath = dataset_fp; + fine_tuning_req.max_training_steps = training_steps[finetuning_req_idx]; + requests.push_back(fine_tuning_req); + DEBUG_PRINT("[Model] generate[%d] %p %s %i %i", + i, + handle, + dataset_fp.c_str(), + max_seq_lengths[i], + training_steps[finetuning_req_idx]); + finetuning_req_idx++; + } } std::vector results = handle->generate(requests); - // If the prompt exceeds max seq len, check that we return the prompt with no - // additional token. Otherwise, check that the output does not exceed the max - // sequence length. for (int i = 0; i < num_requests; i++) { - assert(results[i].output_tokens.size() <= max_seq_length || - results[i].output_tokens.size() == results[i].input_tokens.size()); - output_length_and_tokens[i][0] = results[i].output_tokens.size(); - std::copy(results[i].output_tokens.begin(), - results[i].output_tokens.end(), - output_length_and_tokens[i] + 1); - std::memcpy(output_texts[i], - results[i].output_text.c_str(), - results[i].output_text.length()); + if (request_types[i] == RequestType::REQ_INFERENCE) { + // If the prompt exceeds max seq len, check that we return the prompt with + // no additional token. Otherwise, check that the output does not exceed + // the max sequence length. 
+ assert(results[i].output_tokens.size() <= max_seq_lengths[i] || + results[i].output_tokens.size() == results[i].input_tokens.size()); + output_length_and_tokens[i][0] = results[i].output_tokens.size(); + std::copy(results[i].output_tokens.begin(), + results[i].output_tokens.end(), + output_length_and_tokens[i] + 1); + std::memcpy(output_texts[i], + results[i].output_text.c_str(), + results[i].output_text.length()); + } } - // return FFCObjectWrapper::wrap(&results[0]); } void flexflow_model_set_position_offset(flexflow_model_t handle_, @@ -2739,3 +2787,50 @@ void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_, FFModel *model = FFCObjectWrapper::unwrap(model_handle_); handle->load_weights(model); } + +// ----------------------------------------------------------------------- +// LoraLinearConfig +// ----------------------------------------------------------------------- + +flexflow_lora_linear_config_t + flexflow_lora_linear_config_create(char const *cache_folder_, + char const *peft_model_id_) { + assert(cache_folder_ != nullptr && + "Cannot convert nullptr char * to std::string"); + assert(peft_model_id_ != nullptr && + "Cannot convert nullptr char * to std::string"); + std::string const cache_folder(cache_folder_); + std::string const peft_model_id(peft_model_id_); + LoraLinearConfig *handle = new LoraLinearConfig(cache_folder, peft_model_id); + DEBUG_PRINT("[LoraLinearConfig] new %p", handle); + return FFCObjectWrapper::wrap(handle); +} + +void flexflow_lora_linear_config_destroy( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *peft_config = FFCObjectWrapper::unwrap(handle_); + DEBUG_PRINT("[LoraLinearConfig] delete %p", peft_config); + delete peft_config; +} + +// ----------------------------------------------------------------------- +// PEFTModelID +// ----------------------------------------------------------------------- + +flexflow_peft_model_id_t flexflow_peft_model_id_create() { + PEFTModelID *handle = new PEFTModelID(); + DEBUG_PRINT("[PEFTModelID] new %p", handle); + return FFCObjectWrapper::wrap(handle); +} + +flexflow_peft_model_id_t flexflow_peft_model_id_create_id(size_t id) { + PEFTModelID *handle = new PEFTModelID(id); + DEBUG_PRINT("[PEFTModelID] new %p", handle); + return FFCObjectWrapper::wrap(handle); +} + +void flexflow_peft_model_id_destroy(flexflow_peft_model_id_t handle_) { + PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(handle_); + DEBUG_PRINT("[PEFTModelID] delete %p", peft_model_id); + delete peft_model_id; +} diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 574fbcb573..aca93a973d 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -266,8 +266,7 @@ __host__ void batch_size); break; } - case OP_LORA_MLP_FIRST: - case OP_LORA_MLP_SECOND: { + case OP_LORA: { assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_outputs[op] == 1); Domain input_domain = my_input_accessor[0].domain; @@ -910,8 +909,7 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, num_peft_tokens); break; } - case OP_LORA_MLP_FIRST: - case OP_LORA_MLP_SECOND: { + case OP_LORA: { assert(fused->op_num_inputs[op] == 2); assert(fused->op_num_outputs[op] == 1); Domain input_domain = my_input_grad_accessor[0].domain; diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 83fdbaf927..8b0776fde4 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1488,7 +1488,8 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, 
assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i] || (!bc->requestsInfo[i].prompt_phase)) { + if (bc->request_completed[i] || + (!bc->requestsInfo[i].prompt_phase && !bc->requestsInfo[i].peft_bwd)) { continue; } int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 366eca27b7..170e087226 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -38,51 +38,132 @@ using Legion::TaskLauncher; using namespace FlexFlow::Kernels::LoraLinear; -void FFModel::lora_linear(Tensor const input, - Tensor const output, - OperatorType op_type, - char const *name) { - assert(input->data_type == output->data_type); - Layer *lora = nullptr; - lora = new Layer(this, - op_type, - output->data_type, - name, - 2 /*inputs*/, - 0 /*weights*/, - 1 /*outputs*/, - input, - output); - { - int numdims = output->num_dims; - int dims[MAX_TENSOR_DIM]; - for (int i = 0; i < numdims; i++) { - dims[i] = output->dims[i]; +bool check_lora_layer_match(Layer *potential_target, + std::string target_module_name) { + if (potential_target->op_type == OP_LINEAR && + potential_target->name != nullptr && strlen(potential_target->name) > 0) { + std::string s(potential_target->name); + if (s.find(target_module_name) != std::string::npos && + s.find("lora") == std::string::npos) { + return true; } - lora->outputs[0] = create_tensor_legion_ordering( - numdims, dims, output->data_type, lora, 0, true /*create_grad*/); } - layers.push_back(lora); + return false; +} + +PEFTModelID *FFModel::add_lora_layer(LoraLinearConfig const peft_config) { + assert(config.enable_peft && + "Cannot add a LoRA layer if PEFT mode is not enabled"); + if (peft_config.target_modules.size() == 0) { + printf("PEFT config does not contain any target module\n"); + return nullptr; + } + PEFTModelID *peft_model_id = new PEFTModelID(peft_model_global_guid++); + peft_configs[*peft_model_id] = peft_config; + + for (std::string target_module_name : peft_config.target_modules) { + assert(target_module_name.length() > 0 && + "LoRA target module name is empty"); + // find target layer + for (auto it = layers.begin(); it != layers.end(); ++it) { + Layer *target_module = *it; + bool match = check_lora_layer_match(target_module, target_module_name); + if (!match) { + continue; + } + + if (base_layer_to_peft_layer.find(target_module) != + base_layer_to_peft_layer.end()) { + // lora linear layer already added, no need to add again + Layer *peft_layer = base_layer_to_peft_layer[target_module]; + peft_layer_to_peft_id[peft_layer].push_back(*peft_model_id); + } else { + Tensor const input = target_module->inputs[0]; + Tensor const output = target_module->outputs[0]; + assert(input->data_type == output->data_type); + std::string name_ = target_module->name + ? 
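A Python restatement of the name test in check_lora_layer_match() (the C++ version additionally requires the candidate to be an OP_LINEAR layer with a non-empty name); the layer names below are hypothetical:

    def matches(layer_name: str, target_module: str) -> bool:
        # Target module name must occur in the layer name, and the layer must not
        # already be a LoRA layer.
        return target_module in layer_name and "lora" not in layer_name

    assert matches("layers.11.mlp.down_proj", "down_proj")           # gets a ".lora" sibling
    assert not matches("layers.11.mlp.down_proj.lora", "down_proj")  # already a LoRA layer
    assert not matches("layers.11.self_attn.q_proj", "down_proj")    # different module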
std::string(target_module->name) + : std::string(""); + size_t last_underscore = name_.length() - 1; + for (int i = name_.length() - 1; i > 0; i--) { + if (!(std::isdigit(target_module->name[i]) || + target_module->name[i] == '_')) { + break; + } else if (target_module->name[i] == '_') { + last_underscore = i; + } + } + name_.erase(last_underscore); + + name_ += ".lora"; + std::cout << "Adding layer " << name_ << std::endl; + Layer *peft_layer = new Layer(this, + OP_LORA, + output->data_type, + name_.c_str(), + 2 /*inputs*/, + 0 /*weights*/, + 1 /*outputs*/, + input, + output); + { + int numdims = output->num_dims; + int dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdims; i++) { + dims[i] = output->dims[i]; + } + peft_layer->outputs[0] = + create_tensor_legion_ordering(numdims, + dims, + output->data_type, + peft_layer, + 0, + true /*create_grad*/); + } + layers.insert(it + 1, peft_layer); + ++it; + base_layer_to_peft_layer[target_module] = peft_layer; + peft_layer_to_peft_id[peft_layer] = std::vector(); + peft_layer_to_peft_id[peft_layer].push_back(*peft_model_id); + } + } + } + + return peft_model_id; } Op *LoraLinear::create_operator_from_layer( FFModel &model, Layer const *layer, std::vector const &inputs) { + std::unordered_map _peft_configs; + std::vector const &peft_ids = + model.peft_layer_to_peft_id[(Layer *)layer]; + for (int i = 0; i < peft_ids.size(); i++) { + _peft_configs.emplace( + std::make_pair(peft_ids[i], model.peft_configs[peft_ids[i]])); + } return new LoraLinear(model, layer->layer_guid, layer->op_type, inputs[0], inputs[1], + _peft_configs, layer->name); + ; } LoraLinear::LoraLinear(FFModel &model, LoraLinear const &other, ParallelTensor const input, ParallelTensor const output) - : LoraLinear( - model, other.layer_guid, other.op_type, input, output, other.name) {} + : LoraLinear(model, + other.layer_guid, + other.op_type, + input, + output, + other.peft_configs, + other.name) {} LoraLinear::LoraLinear(FFModel &model, Params const ¶ms, @@ -93,14 +174,17 @@ LoraLinear::LoraLinear(FFModel &model, params.type, inputs.first, inputs.second, + params.peft_configs, params.name) {} -LoraLinear::LoraLinear(FFModel &model, - LayerID const &_layer_guid, - OperatorType _op_type, - ParallelTensor const _input, - ParallelTensor const _output, - char const *name) +LoraLinear::LoraLinear( + FFModel &model, + LayerID const &_layer_guid, + OperatorType _op_type, + ParallelTensor const _input, + ParallelTensor const _output, + std::unordered_map const &_peft_configs, + char const *name) : Op(model, _op_type, _output->data_type, @@ -129,6 +213,9 @@ LoraLinear::LoraLinear(FFModel &model, outputs[0] = model.create_parallel_tensor_legion_ordering( numdim, dims, inputs[1]->data_type, this); } + for (auto const &kv : _peft_configs) { + peft_configs.insert(kv); + } // assert(check_output_input_weight_parallel_dims(allocate_weights)); } @@ -183,6 +270,32 @@ void LoraLinear::init_inference( set_opmeta_from_futuremap_inference(ff, fm, output_tensor); } +template +void load_peft_from_file( + DT *ptr, size_t size, bool sharded, int shard_id, std::string filepath) { + std::ifstream in(filepath, std::ios::in | std::ios::binary); + if (!in.good()) { + printf("Could not open file: %s\n", filepath.c_str()); + } + assert(in.good() && "incorrect weight file path"); + std::vector
host_array(size); + size_t target_data_size = sizeof(DT) * size; + in.seekg(sharded * shard_id * target_data_size, in.beg); + in.read((char *)host_array.data(), target_data_size); + + size_t in_get_size = in.gcount(); + if (in_get_size != target_data_size) { + printf("load weight data error: %lu, %lu, %lu\n", + in_get_size, + target_data_size, + sizeof(DT)); + assert(false); + } + assert(size == host_array.size()); + copy_tensor_host_to_dev(ptr, host_array.data(), size); + in.close(); +} + /* regions[0](O): output regions[1](I): kernel @@ -219,97 +332,12 @@ OpMeta *LoraLinear::init_task(Task const *task, std::strcpy(m->op_name, lora->name); m->layer_guid = lora->layer_guid; - return m; -} - -struct LoraLinearRegisterInfo { - LoraLinear const *lora; - PEFTModelID model_id; - LoraLinearConfig lora_config; -}; - -void LoraLinear::register_peft_model( - FFModel const &ff, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - PEFTModelID const &model_id, - LoraLinearConfig const lora_config) { - assert(check_output_input_weight_same_parallel_is()); - assert(batch_inputs.size() == 2); - assert(batch_outputs.size() == 1); - // Assert that the output and the second input are mapped to the same - // region/part - assert(batch_outputs[0]->region == batch_inputs[1]->region); - assert(batch_outputs[0]->part == batch_inputs[1]->part); - // assert(check_output_input_weight_same_machine_view()); - // output is considered as an input to allow in-place optimization - ParallelTensor output_tensor = batch_outputs[0]; - parallel_is = output_tensor->parallel_is; - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - MachineView const *view = &output_tensor->machine_view; - size_t machine_view_hash = view->hash(); - set_argumentmap_for_inference(ff, argmap, output_tensor); - LoraLinearRegisterInfo info; - info.lora = this; - info.model_id = model_id; - info.lora_config = lora_config; - IndexLauncher launcher(LORA_LINEAR_REG_TASK_ID, - parallel_is, - TaskArgument(&info, sizeof(LoraLinearRegisterInfo)), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); - FutureMap fm = runtime->execute_index_space(ctx, launcher); - fm.wait_all_results(); -} - -template -void load_peft_from_file( - DT *ptr, size_t size, bool sharded, int shard_id, std::string filepath) { - std::ifstream in(filepath, std::ios::in | std::ios::binary); - if (!in.good()) { - printf("Could not open file: %s\n", filepath.c_str()); - } - assert(in.good() && "incorrect weight file path"); - std::vector
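The read pattern in load_peft_from_file() above, restated with NumPy for clarity: when sharded is true, shard i skips i chunks of `size` elements, otherwise reading starts at offset 0. The file path and dtype below are placeholders.

    import numpy as np

    def load_peft_weight(filepath, size, sharded, shard_id, dtype=np.float16):
        itemsize = np.dtype(dtype).itemsize
        offset = (shard_id if sharded else 0) * size * itemsize
        with open(filepath, "rb") as f:
            f.seek(offset)
            data = np.frombuffer(f.read(size * itemsize), dtype=dtype)
        # Mirror of the gcount() check: fail loudly on a short read.
        assert data.size == size, "weight file smaller than expected"
        return data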
host_array(size); - size_t target_data_size = sizeof(DT) * size; - in.seekg(sharded * shard_id * target_data_size, in.beg); - in.read((char *)host_array.data(), target_data_size); - - size_t in_get_size = in.gcount(); - if (in_get_size != target_data_size) { - printf("load weight data error: %lu, %lu, %lu\n", - in_get_size, - target_data_size, - sizeof(DT)); - assert(false); - } - assert(size == host_array.size()); - copy_tensor_host_to_dev(ptr, host_array.data(), size); - in.close(); -} - -void LoraLinear::register_model_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - LoraLinearRegisterInfo const *info = - static_cast(task->args); - LoraLinearMeta *m = *((LoraLinearMeta **)task->local_args); - LoraLinear const *lora = info->lora; - int shard_id = task->index_point.point_data[0]; - - int rank = info->lora_config.rank; int num_dims = lora->inputs[0]->num_dims; - int in_dim = lora->inputs[0]->dims[0].size / lora->inputs[0]->dims[0].degree; - int out_dim = lora->inputs[1]->dims[0].size / lora->inputs[1]->dims[0].degree; - int w0_num_elements = rank * in_dim; - int w1_num_elements = rank * out_dim; + assert(in_dim == + lora->inputs[0]->dims[0].size / lora->inputs[0]->dims[0].degree); + assert(out_dim == + lora->inputs[1]->dims[0].size / lora->inputs[1]->dims[0].degree); DataType dt = m->input_type[0]; assert(dt == m->input_type[1]); @@ -317,17 +345,6 @@ void LoraLinear::register_model_task(Task const *task, assert(dt == lora->inputs[0]->data_type); assert(dt == lora->inputs[1]->data_type); assert(dt == lora->outputs[0]->data_type); - assert(m->model_weights.find(info->model_id) == m->model_weights.end()); - - LoraLinearWeight weight; - weight.in_dim = in_dim; - weight.out_dim = out_dim; - weight.rank = rank; - PEFTWeightAllocator *allocator = m->handle.peft_weight_allocator; - weight.w0_ptr = allocator->allocate_local_weights_untyped( - info->model_id, w0_num_elements * data_type_size(dt)); - weight.w1_ptr = allocator->allocate_local_weights_untyped( - info->model_id, w1_num_elements * data_type_size(dt)); // get layer name assert(lora->name != nullptr && @@ -344,61 +361,87 @@ void LoraLinear::register_model_task(Task const *task, std::string lora_layername_substr = lora_layername.substr(0, found + searchString.length()); - // load weights from file - std::string weights_folder_filepath = join_path({ - info->lora_config.cache_folder, - "weights", - info->lora_config.peft_model_id, - dt == DT_FLOAT ? 
"full-precision" : "half-precision", - }); - std::string w0_filepath = - join_path({weights_folder_filepath, lora_layername_substr + "_A_weight"}); - std::string w1_filepath = - join_path({weights_folder_filepath, lora_layername_substr + "_B_weight"}); - if (dt == DT_FLOAT) { - std::cout << "Loading LORA weight " << lora_layername_substr + "_A_weight" - << ", size: " << w0_num_elements << ", shard: " << shard_id - << std::endl; - load_peft_from_file( - (float *)weight.w0_ptr, w0_num_elements, true, shard_id, w0_filepath); - std::cout << "Loading LORA weight " << lora_layername_substr + "_B_weight" - << ", size: " << w1_num_elements << ", shard: " << shard_id - << std::endl; - load_peft_from_file( - (float *)weight.w1_ptr, w1_num_elements, false, shard_id, w1_filepath); - } else if (dt == DT_HALF) { - std::cout << "Loading LORA weight " << lora_layername_substr + "_A_weight" - << ", size: " << w0_num_elements << ", shard: " << shard_id - << std::endl; - load_peft_from_file( - (half *)weight.w0_ptr, w0_num_elements, true, shard_id, w0_filepath); - std::cout << "Loading LORA weight " << lora_layername_substr + "_B_weight" - << ", size: " << w1_num_elements << ", shard: " << shard_id - << std::endl; - load_peft_from_file( - (half *)weight.w1_ptr, w1_num_elements, false, shard_id, w1_filepath); - } else { - assert(false && "Data type not supported"); - } + for (auto const &kv : lora->peft_configs) { + PEFTModelID const &model_id = kv.first; + LoraLinearConfig const &lora_config = kv.second; + + int rank = lora_config.rank; + + int w0_num_elements = rank * in_dim; + int w1_num_elements = rank * out_dim; + + LoraLinearWeight weight; + weight.in_dim = in_dim; + weight.out_dim = out_dim; + weight.rank = rank; + PEFTWeightAllocator *allocator = m->handle.peft_weight_allocator; + weight.w0_ptr = allocator->allocate_local_weights_untyped( + model_id, w0_num_elements * data_type_size(dt)); + weight.w1_ptr = allocator->allocate_local_weights_untyped( + model_id, w1_num_elements * data_type_size(dt)); + + // load weights from file + std::string weights_folder_filepath = join_path({ + lora_config.cache_folder, + "weights", + lora_config.peft_model_id, + dt == DT_FLOAT ? 
"full-precision" : "half-precision", + }); + std::string w0_filepath = join_path( + {weights_folder_filepath, lora_layername_substr + "_A.weight"}); + std::string w1_filepath = join_path( + {weights_folder_filepath, lora_layername_substr + "_B.weight"}); + if (dt == DT_FLOAT) { + std::cout << "Loading LORA weight " << lora_layername_substr + "_A.weight" + << ", size: " << w0_num_elements << ", shard: " << shard_id + << std::endl; + load_peft_from_file( + (float *)weight.w0_ptr, w0_num_elements, true, shard_id, w0_filepath); + std::cout << "Loading LORA weight " << lora_layername_substr + "_B.weight" + << ", size: " << w1_num_elements << ", shard: " << shard_id + << std::endl; + load_peft_from_file((float *)weight.w1_ptr, + w1_num_elements, + false, + shard_id, + w1_filepath); + } else if (dt == DT_HALF) { + std::cout << "Loading LORA weight " << lora_layername_substr + "_A.weight" + << ", size: " << w0_num_elements << ", shard: " << shard_id + << std::endl; + load_peft_from_file( + (half *)weight.w0_ptr, w0_num_elements, true, shard_id, w0_filepath); + std::cout << "Loading LORA weight " << lora_layername_substr + "_B.weight" + << ", size: " << w1_num_elements << ", shard: " << shard_id + << std::endl; + load_peft_from_file( + (half *)weight.w1_ptr, w1_num_elements, false, shard_id, w1_filepath); + } else { + assert(false && "Data type not supported"); + } - if (lora->inputs[0]->dims[num_dims - 1].degree == 1) { - // Input is partitioned (no replication) - // w0_grad is local weight gradients - weight.w0_grad_ptr = allocator->allocate_local_weights_untyped( - info->model_id, w0_num_elements * data_type_size(dt)); - // w1_grad is sync weight gradients - weight.w1_grad_ptr = allocator->allocate_sync_weights_untyped( - info->model_id, w1_num_elements * data_type_size(dt)); - } else { - // Input is replicated - // w0_grad is sync weight gradients - weight.w0_grad_ptr = allocator->allocate_sync_weights_untyped( - info->model_id, w0_num_elements * data_type_size(dt)); - // w1_grad is local weight gradients - weight.w1_grad_ptr = allocator->allocate_local_weights_untyped( - info->model_id, w1_num_elements * data_type_size(dt)); + if (lora->inputs[0]->dims[num_dims - 1].degree == 1) { + // Input is partitioned (no replication) + // w0_grad is local weight gradients + weight.w0_grad_ptr = allocator->allocate_local_weights_untyped( + model_id, w0_num_elements * data_type_size(dt)); + // w1_grad is sync weight gradients + weight.w1_grad_ptr = allocator->allocate_sync_weights_untyped( + model_id, w1_num_elements * data_type_size(dt)); + } else { + // Input is replicated + // w0_grad is sync weight gradients + weight.w0_grad_ptr = allocator->allocate_sync_weights_untyped( + model_id, w0_num_elements * data_type_size(dt)); + // w1_grad is local weight gradients + weight.w1_grad_ptr = allocator->allocate_local_weights_untyped( + model_id, w1_num_elements * data_type_size(dt)); + } + assert(m->model_weights.find(model_id) == m->model_weights.end()); + m->model_weights[model_id] = weight; } - m->model_weights[info->model_id] = weight; + + return m; } void LoraLinear::forward(FFModel const &ff) { @@ -761,7 +804,17 @@ bool LoraLinear::measure_operator_cost(Simulator *sim, } bool operator==(LoraLinearParams const &lhs, LoraLinearParams const &rhs) { - return lhs.layer_guid == rhs.layer_guid && lhs.type == rhs.type; + if (lhs.layer_guid == rhs.layer_guid && lhs.type == rhs.type && + lhs.peft_configs.size() == rhs.peft_configs.size()) { + for (auto const &kv : lhs.peft_configs) { + auto it = 
rhs.peft_configs.find(kv.first); + if (it == rhs.peft_configs.end() || !(it->second == kv.second)) { + return false; + } + } + return true; + } + return false; } void LoraLinear::serialize(Legion::Serializer &sez) const { @@ -769,6 +822,19 @@ void LoraLinear::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.transformer_layer_id); sez.serialize(this->layer_guid.model_id); sez.serialize(this->op_type); + sez.serialize(this->peft_configs.size()); + for (auto const &kv : this->peft_configs) { + // Serialize PEFTModelID + sez.serialize(kv.first.id); + // Serialize LoraConfig's cache folder + sez.serialize(kv.second.cache_folder.length()); + sez.serialize(kv.second.cache_folder.c_str(), + kv.second.cache_folder.length()); + // Serialize LoraConfig's peft model id + sez.serialize(kv.second.peft_model_id.length()); + sez.serialize(kv.second.peft_model_id.c_str(), + kv.second.peft_model_id.length()); + } sez.serialize(strlen(this->name)); sez.serialize(this->name, strlen(this->name)); } @@ -782,17 +848,45 @@ Node LoraLinear::deserialize(FFModel &ff, assert(num_inputs == 2); size_t id, transformer_layer_id, deserialized_model_id; OperatorType op_type; + size_t num_pefts; size_t name_len; char name[MAX_OPNAME] = {0}; + + LoraLinearParams params; + dez.deserialize(id); dez.deserialize(transformer_layer_id); dez.deserialize(deserialized_model_id); dez.deserialize(op_type); + dez.deserialize(num_pefts); + for (int i = 0; i < num_pefts; i++) { + // Deserialize PEFTModelID + size_t pid; + dez.deserialize(pid); + PEFTModelID peft_model_id(pid); + + // Deserialize LoraConfig's cache folder + size_t string_size; + char buffer[4096] = {0}; + dez.deserialize(string_size); + dez.deserialize(buffer, string_size); + std::string cache_folder = std::string(buffer); + + // Deserialize LoraConfig's peft model id + string_size = 0; + memset(buffer, 0, 4096); + dez.deserialize(string_size); + dez.deserialize(buffer, string_size); + std::string peft_model_name = std::string(buffer); + + LoraLinearConfig lora_linear_config(cache_folder, peft_model_name); + params.peft_configs.emplace( + std::make_pair(peft_model_id, lora_linear_config)); + } dez.deserialize(name_len); dez.deserialize(name, name_len); LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); - LoraLinearParams params; params.layer_guid = layer_guid; params.type = op_type; strcpy(params.name, name); @@ -813,6 +907,7 @@ LoraLinearParams LoraLinear::get_params() const { if (this->name != nullptr) { strcpy(params.name, this->name); } + params.peft_configs = this->peft_configs; return params; } @@ -831,6 +926,18 @@ size_t hash::operator()( hash_combine(key, params.layer_guid.id); hash_combine(key, params.layer_guid.transformer_layer_id); hash_combine(key, params.layer_guid.model_id); + for (auto const &kv : params.peft_configs) { + hash_combine(key, kv.first.id); + hash_combine(key, kv.second.rank); + hash_combine(key, kv.second.optimizer_type); + hash_combine(key, kv.second.learning_rate); + hash_combine(key, kv.second.cache_folder); + hash_combine(key, kv.second.peft_model_id); + hash_combine(key, kv.second.lora_alpha); + hash_combine(key, kv.second.lora_dropout); + hash_combine(key, kv.second.target_modules); + hash_combine(key, kv.second.load_weights_from_file); + } return key; } }; // namespace std diff --git a/src/ops/lora_linear_params.cc b/src/ops/lora_linear_params.cc index 9d797aaed2..1b142d5577 100644 --- a/src/ops/lora_linear_params.cc +++ b/src/ops/lora_linear_params.cc @@ -5,7 +5,7 @@ using json = 
nlohmann::json; namespace FlexFlow { -const LoraLinearConfig LoraLinearConfig::DefaultConfig = LoraLinearConfig(); +const LoraLinearConfig LoraLinearConfig::EmptyConfig = LoraLinearConfig(); LoraLinearConfig::LoraLinearConfig() : rank(0), optimizer_type(OPTIMIZER_TYPE_NONE), learning_rate(0.0f), @@ -31,6 +31,9 @@ LoraLinearConfig::LoraLinearConfig(std::string const &cache_folder_, rank = model_config["r"]; lora_alpha = model_config["lora_alpha"]; lora_dropout = model_config["lora_dropout"]; + for (auto &s : model_config["target_modules"]) { + target_modules.push_back(s); + } } catch (json::exception const &e) { std::cerr << "Error parsing PEFT config from JSON file: " << e.what() << std::endl; @@ -48,14 +51,25 @@ LoraLinearConfig::LoraLinearConfig(std::string const &cache_folder_, bool operator==(LoraLinearConfig const &lhs, LoraLinearConfig const &rhs) { if (lhs.rank == rhs.rank && lhs.optimizer_type == rhs.optimizer_type && - lhs.learning_rate == rhs.learning_rate) { + lhs.learning_rate == rhs.learning_rate && + lhs.cache_folder == rhs.cache_folder && + lhs.peft_model_id == rhs.peft_model_id && + lhs.lora_alpha == rhs.lora_alpha && + lhs.lora_dropout == rhs.lora_dropout && + lhs.target_modules.size() == rhs.target_modules.size() && + lhs.load_weights_from_file == rhs.load_weights_from_file) { + for (int i = 0; i < lhs.target_modules.size(); i++) { + if (lhs.target_modules[i] != rhs.target_modules[i]) { + return false; + } + } return true; } return false; } std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc) { - os << "LoraLinearConfig: "; + os << "LoraLinearConfig: {"; os << "rank: " << llc.rank << ", "; os << "optimizer_type: " << llc.optimizer_type << ", "; os << "learning_rate: " << llc.learning_rate << ", "; @@ -63,6 +77,14 @@ std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc) { os << "peft_model_id: " << llc.peft_model_id << ", "; os << "lora_alpha: " << llc.lora_alpha << ", "; os << "lora_dropout: " << llc.lora_dropout << ", "; + os << "target_modules: ["; + for (int i = 0; i < llc.target_modules.size(); i++) { + os << llc.target_modules[i]; + if (i < llc.target_modules.size() - 1) { + os << ", "; + } + } + os << "], "; os << "load_weights_from_file: " << llc.load_weights_from_file << std::endl; return os; } diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index 3ee1ee62df..33e11bf451 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -189,10 +189,8 @@ std::string get_operator_type_name(OperatorType type) { case OP_ARGMAX: return "ArgMax"; // PEFT Ops - case OP_LORA_MLP_FIRST: - return "Lora MLP First Layer"; - case OP_LORA_MLP_SECOND: - return "Lora MLP Second Layer"; + case OP_LORA: + return "Lora Layer"; // Parallel Ops case OP_REPARTITION: return "Repartition"; diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index fa19c9b22d..84554c2bd4 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -136,12 +136,12 @@ void load_attention_bias_v2(DT *ptr, bool final_bias, std::string layer_name, std::string weights_folder) { - std::string q_file = layer_name + "_wq_bias"; - std::string k_file = layer_name + "_wk_bias"; - std::string v_file = layer_name + "_wv_bias"; + std::string q_file = layer_name + ".q_proj.bias"; + std::string k_file = layer_name + ".k_proj.bias"; + std::string v_file = layer_name + ".v_proj.bias"; std::vector bias_files = {q_file, k_file, v_file}; if (final_bias) { - std::string o_file = layer_name + "_wo_bias"; + std::string o_file 
= layer_name + ".o_proj.bias"; bias_files.push_back(o_file); } @@ -217,12 +217,10 @@ void load_attention_weights_v2(DT *ptr, std::string weights_folder, size_t volume, int tensor_parallelism_degree) { - // layers_0_attention_wq_weight - // layers_0_self_attn_q_proj_weight - std::string q_file = layer_name + "_wq_weight"; - std::string k_file = layer_name + "_wk_weight"; - std::string v_file = layer_name + "_wv_weight"; - std::string o_file = layer_name + "_wo_weight"; + std::string q_file = layer_name + ".q_proj.weight"; + std::string k_file = layer_name + ".k_proj.weight"; + std::string v_file = layer_name + ".v_proj.weight"; + std::string o_file = layer_name + ".o_proj.weight"; std::vector weight_filenames = {q_file, k_file, v_file}; int file_index = 0; @@ -407,12 +405,10 @@ void load_attention_weights_quantized(char *ptr, std::string weights_folder, DataType data_type, bool use_full_precision) { - // layers_0_attention_wq_weight - // layers_0_self_attn_q_proj_weight - std::string q_file = layer_name + "_wq_weight"; - std::string k_file = layer_name + "_wk_weight"; - std::string v_file = layer_name + "_wv_weight"; - std::string o_file = layer_name + "_wo_weight"; + std::string q_file = layer_name + ".q_proj.weight"; + std::string k_file = layer_name + ".k_proj.weight"; + std::string v_file = layer_name + ".v_proj.weight"; + std::string o_file = layer_name + ".o_proj.weight"; std::vector weight_filenames = {q_file, k_file, v_file, o_file}; int file_index = 0; @@ -690,7 +686,7 @@ void FileDataLoader::load_quantization_weight(FFModel *ff, if (weight_idx > 0) { assert(weight_idx == 0 || weight_idx == 1); if (weight_filename != "embed_tokens_weight_lm_head") { - weight_filename += weight_idx == 0 ? "_weight" : "_bias"; + weight_filename += weight_idx == 0 ? ".weight" : ".bias"; } } load_from_quantized_file(data, @@ -728,44 +724,34 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, if (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION) { - if (weight_filename.find("self_attention") != std::string::npos) { - load_attention_weights_multi_query( - data, weight_filename, weights_folder, hidden_dim, num_heads); - } else if (weight_filename.find("attention") != std::string::npos && - weight_filename.rfind("attention") == - weight_filename.length() - strlen("attention")) { - if (weight_idx == 0) { - load_attention_weights_v2(data, - num_heads, - num_kv_heads, - hidden_dim, - qkv_inner_dim, - weight_filename, - weights_folder, - volume, - tensor_parallelism_degree); - } else { - long long value; - l->get_int_property("final_bias", value); - bool final_bias = (bool)value; - load_attention_bias_v2(data, - num_heads, - num_kv_heads, - hidden_dim, - qkv_inner_dim, - final_bias, - weight_filename, - weights_folder); - } - + if (weight_idx == 0) { + load_attention_weights_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + weight_filename, + weights_folder, + volume, + tensor_parallelism_degree); } else { - assert(false); + long long value; + l->get_int_property("final_bias", value); + bool final_bias = (bool)value; + load_attention_bias_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + final_bias, + weight_filename, + weights_folder); } } else if (l->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM) { assert(weight_idx >= 0 || weight_idx <= 2); weight_filename += (weight_idx == 0) - ? "_attn_bias" - : ((weight_idx == 1) ? "_weight" : "_bias"); + ? 
".attn_bias" + : ((weight_idx == 1) ? ".weight" : ".bias"); std::cout << "Loading weight file " << weight_filename << std::endl; std::string weight_filepath = join_path({weights_folder, weight_filename}); load_from_file(data, volume, weight_filepath); @@ -774,7 +760,7 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, assert(weight_idx == 0 || weight_idx == 1); // handle exception if (weight_filename != "embed_tokens_weight_lm_head") { - weight_filename += weight_idx == 0 ? "_weight" : "_bias"; + weight_filename += weight_idx == 0 ? ".weight" : ".bias"; } std::cout << "Loading weight file " << weight_filename << std::endl; std::string weight_filepath = join_path({weights_folder, weight_filename}); @@ -801,7 +787,7 @@ void FileDataLoader::load_weights(FFModel *ff) { continue; } // TODO: currently skip Lora layers - if (l->op_type == OP_LORA_MLP_FIRST || l->op_type == OP_LORA_MLP_SECOND) { + if (l->op_type == OP_LORA) { continue; } switch (weight->data_type) { diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 31cf3bb6a7..dae0021bb6 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -2764,8 +2764,7 @@ void FFModel::deserialize_graph_optimal_view( node = Linear::deserialize(*this, dez, inputs, num_inputs); break; } - case OP_LORA_MLP_FIRST: - case OP_LORA_MLP_SECOND: { + case OP_LORA: { node = LoraLinear::deserialize(*this, dez, inputs, num_inputs); break; } diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 91a6dab9b5..212d0ebf6b 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -54,10 +54,31 @@ bool parallel_tensor_list_overlaps(std::vector const &list1, } void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { + + // Check if the model object exists + if (model == nullptr) { + std::cout << "###PEFT DEBUGGING### Model object does not exist." + << std::endl; + return; // Early return to prevent further operations on a nullptr + } else { + std::cout << "###PEFT DEBUGGING### Model object exists." << std::endl; + } + // TODO: currently assume there is a single data-parallel pipeline // (i.e., data-parallel-degree == 1) assert(model->config.data_parallelism_degree == 1); model->config.batchSize = BatchConfig::max_tokens_per_batch(); + + // Check if the model object exists after importing config + if (model == nullptr) { + std::cout << "###PEFT DEBUGGING### Model object does not exist after " + "setting config and batch size." + << std::endl; + return; // Early return to prevent further operations on a nullptr + } else { + std::cout << "###PEFT DEBUGGING### Model object still exists." << std::endl; + } + model->compile_inference(); Context ctx = model->config.lg_ctx; Runtime *runtime = model->config.lg_hlr; @@ -609,17 +630,26 @@ void FFModel::set_position_offset(int offset) { } void FFModel::compile_inference() { + std::cout << "###PEFT DEBUGGING### Entering compile_inference." << std::endl; + // Request at least four CPU processors for inference runs assert( config.cpusPerNode >= 4 && "FlexFlow Serve requires at least four CPU cores per node, please add " "`-ll:cpu 4` in the command line if you are using the C++ interface or " "set `num_cpus` in `ff.init` if you are using the Python interface"); + + std::cout << "###PEFT DEBUGGING### Configuration check passed: At least four " + "CPU cores per node." 
+ << std::endl; Context ctx = config.lg_ctx; Runtime *runtime = config.lg_hlr; config.computationMode = COMP_MODE_INFERENCE; create_operators_from_layers(); + // Launch the graph optimize task + std::cout << "###PEFT DEBUGGING### Launching graph optimization task." + << std::endl; { FFModel *model = this; TaskLauncher launcher(GRAPH_OPTIMIZE_TASK_ID, @@ -670,6 +700,14 @@ void FFModel::compile_inference() { } } } + + std::cout + << "###PEFT DEBUGGING### Operators reconstructed from optimized graph." + << std::endl; + // Perform inplace optimizations + std::cout << "###PEFT DEBUGGING### Starting inplace optimizations." + << std::endl; + loss_op = nullptr; metrics_op = nullptr; // Perform inplace optimizations @@ -709,6 +747,8 @@ void FFModel::compile_inference() { } } + // Output tensor mapping + std::cout << "###PEFT DEBUGGING### Mapping output tensors." << std::endl; for (size_t l = 0; l < operators.size(); l++) { Op *op = operators[l]; @@ -734,6 +774,8 @@ void FFModel::compile_inference() { } #ifdef FF_USE_NCCL + std::cout << "###PEFT DEBUGGING### Setting up NCCL communications." + << std::endl; for (size_t l = 0; l < operators.size(); l++) { // Only create nccl for allreduce and fusedop for inference // (fusedop may include allreduces) @@ -770,6 +812,8 @@ void FFModel::compile_inference() { } } #endif + std::cout << "###PEFT DEBUGGING### compile_inference completed successfully." + << std::endl; } std::string join_path(std::vector const &paths) { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index a64fb8ec9c..63016d0c8b 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3308,8 +3308,7 @@ Op *FFModel::create_operator_from_layer( return op; } // PEFT layers - case OP_LORA_MLP_FIRST: - case OP_LORA_MLP_SECOND: { + case OP_LORA: { Op *op = LoraLinear::create_operator_from_layer(*this, layer, inputs); operators.push_back(op); return op; @@ -6697,22 +6696,6 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } - { - TaskVariantRegistrar registrar(LORA_LINEAR_REG_TASK_ID, - "LoraLinear Model Registration"); - registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); - registrar.set_leaf(); - if (pre_register) { - Runtime::preregister_task_variant( - registrar, "LoraLinear Model Registration Task"); - } else { - if (enable_control_replication) { - registrar.global_registration = false; - } - runtime->register_task_variant( - registrar); - } - } { TaskVariantRegistrar registrar(LORA_LINEAR_INF_TASK_ID, "LoraLinear Inference"); diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 41c371d4e2..9dc0361316 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -30,6 +31,7 @@ namespace FlexFlow { using namespace Legion; using tokenizers::Tokenizer; +using json = nlohmann::json; LegionRuntime::Logger::Category log_req_mgr("RequestManager"); @@ -45,6 +47,48 @@ std::string LoadBytesFromFile(std::string const &path) { return data; } +std::ostream &operator<<(std::ostream &os, Request const &req) { + os << "Request {\n"; + os << " guid: " << req.guid << "\n"; + os << " peft_model_id: " << req.peft_model_id << "\n"; + os << " max_sequence_length: " << req.max_sequence_length << "\n"; + os << " initial_len: " << req.initial_len << "\n"; + os << " ssm_cache_size: " << req.ssm_cache_size << "\n"; + os << " llm_cache_size: " << req.llm_cache_size << "\n"; + os << " status: " << static_cast(req.status) << "\n"; + os << " tokens: 
["; + for (auto const &token : req.tokens) { + os << token << " "; + } + os << "]\n"; + os << " prompt: " << req.prompt << "\n"; + // os << " beam_trees: ["; + // for (const auto& tree : req.beam_trees) { + // // Assuming BeamTree has its own << operator defined + // os << tree << " "; + // } + // os << "]\n"; + os << " req_type: " << static_cast(req.req_type) << "\n"; + os << " completed_training_steps: " << req.completed_training_steps << "\n"; + os << " max_training_steps: " << req.max_training_steps << "\n"; + os << " dataset_filepath: " << req.dataset_filepath << "\n"; + os << " dataset: ["; + for (auto const &pair : req.dataset) { + os << "["; + for (auto const &token : pair.first) { + os << token << " "; + } + os << "], ["; + for (auto const &token : pair.second) { + os << token << " "; + } + os << "] "; + } + os << "]\n"; + os << "}\n"; + return os; +} + RequestManager::RequestManager() : request_manager_status(INITIALIZED), verbose(false), next_available_guid(1000000), num_processed_requests(0), @@ -240,19 +284,32 @@ RequestManager::RequestGuid Request request; request.status = Request::PENDING; request.guid = next_available_guid++; + request.initial_len = 0; request.max_sequence_length = request_.max_sequence_length; request.peft_model_id = request_.peft_model_id; - request.req_type = Request::REQ_FINETUNING; + request.req_type = RequestType::REQ_FINETUNING; request.completed_training_steps = 0; - request.max_training_steps = 1; // TODO: let user set this - for (auto const &sample : request_.dataset_text) { + request.max_training_steps = request_.max_training_steps; + request.dataset_filepath = request_.dataset_filepath; + + // Load dataset + using json = nlohmann::json; + std::ifstream file_handle(request.dataset_filepath); + assert(file_handle.good() && "Dataset file does not exist."); + json dataset_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + + for (auto &prompt : dataset_json) { + std::string text = prompt.get(); + std::string output_text(""); std::vector input_tokens; - input_tokens = this->tokenizer_->Encode(sample.first); + input_tokens = this->tokenizer_->Encode(text); if (bos_token_id >= 0 && model_type != ModelType::FALCON) { input_tokens.insert(input_tokens.begin(), bos_token_id); } - std::vector output_tokens = - this->tokenizer_->Encode(sample.second); + std::vector output_tokens = this->tokenizer_->Encode(output_text); if (input_tokens.size() + output_tokens.size() > get_max_sequence_length()) { std::cout << "Warning: too many tokens in sample, only load up to " @@ -373,7 +430,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, size_t guid = old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid; Request &request = all_requests[guid]; - if (request.req_type == Request::REQ_FINETUNING) { + if (request.req_type == RequestType::REQ_FINETUNING) { // No new tokens generated when in fine-tuning mode continue; } else if (old_bc.tokensInfo[i].abs_depth_in_request + 1 < @@ -403,7 +460,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; - if (request.req_type == Request::REQ_FINETUNING) { + if (request.req_type == RequestType::REQ_FINETUNING) { // fine-tuning requests don't automatically carry over to the next // batch, we only do so if there is space left after adding new // inference requests @@ 
-412,6 +469,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, if (request.completed_training_steps == request.max_training_steps) { // check if the fine tuning request has completed request.status = Request::COMPLETED; + trigger_request_completion_future(request.guid); log_req_mgr.print("[Done] guid(%zu) completed_training_steps(%d)", old_bc.requestsInfo[i].request_guid, request.completed_training_steps); @@ -562,7 +620,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, if (!pending_infr_request_queue.empty() && new_bc.num_tokens < get_max_tokens_per_batch()) { Request new_request = pending_infr_request_queue.front(); - assert(new_request.req_type == Request::REQ_INFERENCE); + assert(new_request.req_type == RequestType::REQ_INFERENCE); pending_infr_request_queue.pop(); // all_requests[new_request.guid] = new_request; @@ -604,9 +662,9 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, // Step 4: add PEFT bwd requests, if there is additional space while (pending_peft_request_queue.size() > 0) { Request &request = pending_peft_request_queue.front(); - assert(request.req_type = Request::REQ_FINETUNING); + assert(request.req_type = RequestType::REQ_FINETUNING); Request &all_req_handle = all_requests[request.guid]; - assert(all_req_handle.req_type = Request::REQ_FINETUNING); + assert(all_req_handle.req_type = RequestType::REQ_FINETUNING); if (all_req_handle.status == Request::COMPLETED) { pending_peft_request_queue.pop(); } else { @@ -615,11 +673,11 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } if (pending_peft_request_queue.size() > 0) { Request &request = pending_peft_request_queue.front(); - assert(request.req_type = Request::REQ_FINETUNING); + assert(request.req_type = RequestType::REQ_FINETUNING); assert(request.dataset.size() > 0); // update status and training steps Request &all_req_handle = all_requests[request.guid]; - assert(all_req_handle.req_type = Request::REQ_FINETUNING); + assert(all_req_handle.req_type = RequestType::REQ_FINETUNING); request.completed_training_steps = all_req_handle.completed_training_steps; request.status = all_req_handle.status; assert(request.status != Request::COMPLETED); @@ -2410,7 +2468,12 @@ std::vector RequestManager *rm = RequestManager::get_request_manager(); std::vector guids; for (int i = 0; i < requests.size(); i++) { - RequestManager::RequestGuid guid = rm->register_new_request(requests.at(i)); + RequestManager::RequestGuid guid; + if (requests.at(i).req_type == RequestType::REQ_INFERENCE) { + guid = rm->register_new_request(requests.at(i)); + } else { + guid = rm->register_new_peft_request(requests.at(i)); + } if (guid != RequestManager::INVALID_GUID) { guids.push_back(guid); } @@ -2450,6 +2513,18 @@ void RequestManager::background_serving_task( std::vector const ®ions, Context ctx, Runtime *runtime) { + + auto print_timestamped_message = [](std::string const &message) { + auto now = + std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); + std::cout << std::put_time(std::localtime(&now), "%Y-%m-%d %X") << " - " + << message << std::endl; + }; + + // Print at the start of the task + print_timestamped_message( + "###PEFT DEBUGGING### Starting background serving task."); + RequestManager *rm = RequestManager::get_request_manager(); FFModel *llm = *(FFModel **)task->args; { @@ -2466,6 +2541,11 @@ void RequestManager::background_serving_task( ssm->config.lg_ctx = ctx; } } + + // Checkpoint print + 
print_timestamped_message( + "###PEFT DEBUGGING### Updated models' configuration."); + if (rm->get_num_ssms() == 0) { // No SSMs: perform incremental decoding rm->serve_incr_decoding(llm); @@ -2473,6 +2553,10 @@ void RequestManager::background_serving_task( // Registered SSMs: perform speculative inference rm->serve_spec_infer(llm); } + + // Print at the end of the task + print_timestamped_message( + "###PEFT DEBUGGING### Background serving task completed."); } std::string find_layer_name_from_guid(FFModel *model, LayerID guid) { @@ -2488,106 +2572,25 @@ std::string find_layer_name_from_guid(FFModel *model, LayerID guid) { bool is_peft_operator_type(OperatorType type) { switch (type) { - case OP_LORA_MLP_FIRST: - case OP_LORA_MLP_SECOND: + case OP_LORA: return true; default: return false; } } -PEFTModelID FFModel::register_peft_model(LoraLinearConfig const mlp_first, - LoraLinearConfig const mlp_second) { - if (!(mlp_first == LoraLinearConfig::DefaultConfig && - mlp_second == LoraLinearConfig::DefaultConfig)) { - if (!config.enable_peft) { - fprintf(stderr, - "Error: trying to register PEFT model, but peft mode is not " - "enabled.\n"); - assert(false); - } - } - PEFTModelID peft_model_id(peft_model_global_guid++); - InferenceManager *im = InferenceManager::get_inference_manager(); - std::vector peft_operators; - for (size_t op = 0; op < operators.size(); op++) { - if (is_peft_operator_type(operators[op]->op_type)) { - peft_operators.push_back(operators[op]); - } else if (operators[op]->op_type == OP_FUSED) { - FusedOp *fused = static_cast(operators[op]); - for (size_t op2 = 0; op2 < fused->numOperators; op2++) { - if (is_peft_operator_type(fused->operators[op2]->op_type)) { - peft_operators.push_back(fused->operators[op2]); - } - } - } - } - for (size_t op = 0; op < peft_operators.size(); op++) { - std::string layer_name = - find_layer_name_from_guid(this, peft_operators[op]->layer_guid); - switch (peft_operators[op]->op_type) { - case OP_LORA_MLP_FIRST: { - if (mlp_first == LoraLinearConfig::DefaultConfig) { - // Do nothing for the default configuration - continue; - } - LoraLinear *lora = static_cast(peft_operators[op]); - // Currently assume only a single data pipeline - assert(config.data_parallelism_degree == 1); - std::vector inputs(lora->numInputs); - std::vector outputs(lora->numOutputs); - - for (int i = 0; i < lora->numInputs; i++) { - assert(im->tensor_buffer.find(lora->inputs[i]) != - im->tensor_buffer.end()); - assert(lora->inputs[i] != nullptr); - assert(lora->inputs[i]->parallel_is != IndexSpace::NO_SPACE); - assert(im->tensor_buffer[lora->inputs[i]].size() == 1); - inputs[i] = im->tensor_buffer[lora->inputs[i]][0]; - assert(inputs[i]->parallel_is != IndexSpace::NO_SPACE); - } - assert(lora->numOutputs == 1); - outputs[0] = inputs[1]; - lora->register_peft_model( - *this, inputs, outputs, peft_model_id, mlp_first); - break; - } - case OP_LORA_MLP_SECOND: { - if (mlp_second == LoraLinearConfig::DefaultConfig) { - // Do nothing for the default configuration - continue; - } - LoraLinear *lora = static_cast(peft_operators[op]); - // Currently assume only a single data pipeline - assert(config.data_parallelism_degree == 1); - std::vector inputs(lora->numInputs); - std::vector outputs(lora->numOutputs); - - for (int i = 0; i < lora->numInputs; i++) { - assert(im->tensor_buffer.find(lora->inputs[i]) != - im->tensor_buffer.end()); - assert(lora->inputs[i] != nullptr); - assert(lora->inputs[i]->parallel_is != IndexSpace::NO_SPACE); - assert(im->tensor_buffer[lora->inputs[i]].size() 
== 1); - inputs[i] = im->tensor_buffer[lora->inputs[i]][0]; - assert(inputs[i]->parallel_is != IndexSpace::NO_SPACE); - } - assert(lora->numOutputs == 1); - outputs[0] = inputs[1]; - lora->register_peft_model( - *this, inputs, outputs, peft_model_id, mlp_second); - break; - } - default: { - assert(false && "Unsupported PEFT Operator type"); - } - } - } - return peft_model_id; -} - /*static*/ void RequestManager::serve_incr_decoding(FFModel *llm) { + + // Check if the model object exists + if (llm == nullptr) { + std::cout << "###PEFT DEBUGGING### LLM Model object does not exist." + << std::endl; + return; // Early return to prevent further operations on a nullptr + } else { + std::cout << "###PEFT DEBUGGING### LLM Model object exists." << std::endl; + } + Context ctx = llm->config.lg_ctx; Runtime *runtime = llm->config.lg_hlr; // Compile the llm diff --git a/tests/peft/hf_serve.py b/tests/peft/hf_serve.py index 1fde4d5a50..7bfc560cc2 100644 --- a/tests/peft/hf_serve.py +++ b/tests/peft/hf_serve.py @@ -1,6 +1,6 @@ import argparse import torch -import os, sys, shutil +import os, sys, shutil, json from peft import PeftModel, PeftConfig from transformers import ( AutoModelForCausalLM, @@ -40,11 +40,12 @@ def peft_post_forward_hook(module, input, output): def main(): parser = argparse.ArgumentParser() - parser.add_argument("--peft-model-id", type=str, default="./finetuned-llama") + parser.add_argument("--peft-model-id", type=str, required=True) parser.add_argument( "--use-full-precision", action="store_true", help="Use full precision" ) - parser.add_argument("--max-new-tokens", type=int, default=50) + parser.add_argument("--max-length", type=int, default=50) + parser.add_argument("--prompt-file", type=str, required=True) parser.add_argument("--do-sample", action="store_true", help="Use sampling") parser.add_argument( "--save-peft-tensors", @@ -52,24 +53,28 @@ def main(): help="Save PEFT hidden states and weights to file", ) args = parser.parse_args() - peft_model_id = args.peft_model_id - use_full_precision = args.use_full_precision - max_new_tokens = args.max_new_tokens - save_peft_tensors = args.save_peft_tensors - # Change working dir to folder storing this script - abspath = os.path.abspath(__file__) - dname = os.path.dirname(abspath) - os.chdir(dname) + # Check if prompt-file exists + if not os.path.isfile(args.prompt_file): + print(f"Error: {args.prompt_file} does not exist.") + return - config = PeftConfig.from_pretrained(peft_model_id) + # Get peft model config + config = PeftConfig.from_pretrained(args.peft_model_id) + + # Load the base model model = AutoModelForCausalLM.from_pretrained( config.base_model_name_or_path, return_dict=True, # load_in_8bit=True, - torch_dtype=torch.float32 if use_full_precision else torch.float16, + torch_dtype=torch.float32 if args.use_full_precision else torch.float16, device_map="auto", ) + # Load the Lora model + model = PeftModel.from_pretrained(model, args.peft_model_id) + print(model) + + # Get tokenizer hf_config = AutoConfig.from_pretrained( config.base_model_name_or_path, trust_remote_code=True ) @@ -78,25 +83,26 @@ def main(): tokenizer = LlamaTokenizer.from_pretrained( config.base_model_name_or_path, use_fast=True, - torch_dtype=torch.float32 if use_full_precision else torch.float16, + torch_dtype=torch.float32 if args.use_full_precision else torch.float16, ) else: tokenizer = AutoTokenizer.from_pretrained( config.base_model_name_or_path, - torch_dtype=torch.float32 if use_full_precision else torch.float16, + torch_dtype=torch.float32 if 
args.use_full_precision else torch.float16, ) + # Generation config generation_config = GenerationConfig.from_pretrained(config.base_model_name_or_path) generation_config.do_sample = args.do_sample - # Load the Lora model - model = PeftModel.from_pretrained(model, peft_model_id) - - print(model) # Register hooks to save tensors, if needed - if save_peft_tensors: + if args.save_peft_tensors: + # Change working dir to folder storing this script + abspath = os.path.abspath(__file__) + dname = os.path.dirname(abspath) + os.chdir(dname) + # Create output dir shutil.rmtree("./hf_peft_tensors") - # Check that the output folder exists os.makedirs("./hf_peft_tensors", exist_ok=True) # Save weights for name, params in model.named_parameters(): @@ -112,12 +118,22 @@ def main(): layer.register_forward_pre_hook(peft_pre_forward_hook) layer.register_forward_hook(peft_post_forward_hook) - batch = tokenizer("Two things are infinite: ", return_tensors="pt") - with torch.cuda.amp.autocast(): - output_tokens = model.generate( - **batch, max_new_tokens=max_new_tokens, generation_config=generation_config - ) - print("\n\n", tokenizer.decode(output_tokens[0], skip_special_tokens=False)) + # Run inference + # Read prompt-file into a list of strings + with open(args.prompt_file, "r") as f: + try: + prompt_list = json.load(f) + except json.JSONDecodeError: + print(f"Error: Unable to parse {args.prompt_file} as JSON.") + sys.exit(1) + + for i, prompt in enumerate(prompt_list): + batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True) + with torch.cuda.amp.autocast(): + output_tokens = model.generate( + **batch, max_new_tokens=args.max_length, generation_config=generation_config + ) + print("\n\n", tokenizer.decode(output_tokens[0], skip_special_tokens=False)) if __name__ == "__main__": diff --git a/tests/peft_test.sh b/tests/peft_test.sh index 29b3e6520c..9b4a5204ac 100755 --- a/tests/peft_test.sh +++ b/tests/peft_test.sh @@ -25,4 +25,8 @@ export LEGION_BACKTRACE=1 python ../inference/utils/download_peft_model.py goliaro/llama-160m-lora-full --base_model_name JackFram/llama-160m # if first time, add: --refresh-cache -./inference/incr_decoding/incr_decoding -ll:gpu 1 -ll:cpu 4 -ll:fsize 8192 -ll:zsize 12000 -ll:util 4 -llm-model JackFram/llama-160m -prompt ../inference/prompt/peft.json -peft-model goliaro/llama-160m-lora-full --use-full-precision --inference-debugging --fusion -enable-peft +# CPP test +../build/inference/peft/peft -ll:gpu 1 -ll:cpu 4 -ll:fsize 8192 -ll:zsize 12000 -ll:util 4 -llm-model JackFram/llama-160m -prompt ../inference/prompt/peft.json -peft-model goliaro/llama-160m-lora-full --use-full-precision --inference-debugging --fusion -enable-peft + +# Python test +python ../inference/python/ff_peft.py From 0ed889af28ce05ae2862b1d905085744492911cc Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 7 Apr 2024 20:48:20 -0700 Subject: [PATCH 164/198] fix --- include/flexflow/fftype.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/flexflow/fftype.h b/include/flexflow/fftype.h index 099b58c82e..3e482b8d67 100644 --- a/include/flexflow/fftype.h +++ b/include/flexflow/fftype.h @@ -4,6 +4,7 @@ #include "flexflow/ffconst.h" #include #include +#include namespace FlexFlow { From 48c431a393beec8902f59e5839379e4e6d6b8999 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 11 Apr 2024 14:20:31 -0700 Subject: [PATCH 165/198] update --- include/flexflow/request_manager.h | 1 + inference/peft/peft.cc | 66 ++++++++++----- src/runtime/file_loader.cc | 5 +- 
src/runtime/request_manager.cc | 132 ++++++++++++++++++++--------- src/runtime/request_manager.cu | 15 +++- 5 files changed, 153 insertions(+), 66 deletions(-) diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index cbd0b3ad05..f3538c1c68 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -80,6 +80,7 @@ struct Request { RequestType req_type = REQ_INFERENCE; int completed_training_steps = 0; int max_training_steps = 1; + int benchmarking_tokens = -1; std::string dataset_filepath; std::vector, std::vector>> diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index eade2eaeeb..a6fd3b99b0 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -49,7 +49,8 @@ void parse_input_args(char **argv, float &topp, int &max_requests_per_batch, int &max_tokens_per_batch, - int &max_sequence_length) { + int &max_sequence_length, + int &max_requests_to_run) { for (int i = 1; i < argc; i++) { // llm model type if (!strcmp(argv[i], "-llm-model")) { @@ -118,6 +119,10 @@ void parse_input_args(char **argv, max_sequence_length = std::stoi(argv[++i]); continue; } + if (!strcmp(argv[i], "--max-requests-to-run")) { + max_requests_to_run = std::stoi(argv[++i]); + continue; + } } if (paths.cache_folder_path.empty()) { paths.cache_folder_path = "~/.cache/flexflow"; @@ -148,6 +153,7 @@ void FlexFlow::top_level_task(Task const *task, int max_requests_per_batch = 8; int max_tokens_per_batch = 128; int max_sequence_length = 256; + int max_requests_to_run = 1000000000; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -165,7 +171,8 @@ void FlexFlow::top_level_task(Task const *task, topp, max_requests_per_batch, max_tokens_per_batch, - max_sequence_length); + max_sequence_length, + max_requests_to_run); assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * ffconfig.pipeline_parallelism_degree == ffconfig.numNodes * ffconfig.workersPerNode); @@ -301,27 +308,42 @@ void FlexFlow::top_level_task(Task const *task, /*parser_callback_t */ nullptr, /*allow_exceptions */ true, /*ignore_comments */ true); - // for (auto &prompt : prompt_json) { - // std::string text = prompt.get(); - // printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); - // Request inference_req; - // inference_req.prompt = text; - // inference_req.max_sequence_length = 128; - // inference_req.peft_model_id = peft_model_id; - // requests.push_back(inference_req); - // total_num_requests++; - // } + std::vector> prompts; + int index = 0; + for (auto &entry : prompt_json) { + if (index >= max_requests_to_run) { + break; + } + int prompt_length = entry["human"]; + int sequence_length = entry["gpt"]; + assert(prompt_length + sequence_length <= max_sequence_length && + "Prompt + sequence length exceeds max sequence length"); + prompts.push_back(std::make_pair(prompt_length, sequence_length)); + index++; + } + printf("Total number of prompts: %d", prompts.size()); + for (auto &prompt : prompts) { + // printf("Prompt length: %d, sequence length: %d\n", prompt_length, + // sequence_length); + Request inference_req; + inference_req.benchmarking_tokens = prompt.first; + inference_req.max_sequence_length = prompt.second + prompt.first; + inference_req.peft_model_id = + (peft_model_id != nullptr) ? 
*peft_model_id : PEFTModelID::NO_ID; + requests.push_back(inference_req); + total_num_requests++; + } - // Add fine-tuning request - Request fine_tuning_req; - fine_tuning_req.req_type = RequestType::REQ_FINETUNING; - fine_tuning_req.max_sequence_length = 128; - fine_tuning_req.peft_model_id = - (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; - fine_tuning_req.dataset_filepath = file_paths.prompt_file_path; - fine_tuning_req.max_training_steps = 1; - requests.push_back(fine_tuning_req); - total_num_requests++; + // // Add fine-tuning request + // Request fine_tuning_req; + // fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + // fine_tuning_req.max_sequence_length = 128; + // fine_tuning_req.peft_model_id = + // (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + // fine_tuning_req.dataset_filepath = file_paths.prompt_file_path; + // fine_tuning_req.max_training_steps = 1; + // requests.push_back(fine_tuning_req); + // total_num_requests++; std::vector result = model.generate(requests); } diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index ed88dc0a99..fd31f21b26 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -759,7 +759,8 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, ? ".attn_bias" : ((weight_idx == 1) ? ".weight" : ".bias"); std::cout << "Loading weight file " << weight_filename << std::endl; - std::string weight_filepath = join_path({weights_folder, weight_filename}); + std::string weight_filepath = + join_path({weights_folder, weight_filename}); load_from_file(data, volume, weight_filepath); } else { // default op @@ -769,7 +770,7 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, weight_filename += weight_idx == 0 ? ".weight" : ".bias"; } } - } + } // Copy the weight data from the buffer to the weight's ParallelTensor ParallelTensor weight_pt; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index ef7068e330..5ec230298a 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -239,17 +239,26 @@ RequestManager::RequestGuid if (bos_token_id >= 0 && model_type != ModelType::FALCON) { request.tokens.push_back(bos_token_id); } - std::vector tokens = this->tokenizer_->Encode(request_.prompt); - if (tokens.size() >= get_max_sequence_length()) { - std::cout << "Warning: too many tokens in prompt, only load up to " - << get_max_sequence_length() << " tokens, but got " - << tokens.size() << ".\n"; - return INVALID_GUID; - } - for (int i = 0; i < tokens.size(); i++) { - std::cout << "[" << i << "]" << tokens.at(i) << "\n"; + if (request_.benchmarking_tokens >= 0) { + assert(request_.benchmarking_tokens < get_max_sequence_length()); + request.benchmarking_tokens = request_.benchmarking_tokens; + request.tokens.insert(request.tokens.end(), + request_.benchmarking_tokens, + 15); // insert random number + } else { + std::vector tokens = this->tokenizer_->Encode(request_.prompt); + if (tokens.size() >= get_max_sequence_length()) { + std::cout << "Warning: too many tokens in prompt, only load up to " + << get_max_sequence_length() << " tokens, but got " + << tokens.size() << ".\n"; + return INVALID_GUID; + } + for (int i = 0; i < tokens.size(); i++) { + std::cout << "[" << i << "]" << tokens.at(i) << "\n"; + } + request.tokens.insert(request.tokens.end(), tokens.begin(), tokens.end()); } - request.tokens.insert(request.tokens.end(), tokens.begin(), tokens.end()); + request.initial_len = request.tokens.size(); if (get_num_ssms() 
== 0) { @@ -558,20 +567,27 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, if (!output_filepath.empty()) { std::ofstream outputFile(output_filepath, std::ios::app); if (outputFile.is_open()) { - outputFile << "end-to-end latency: " << std::fixed - << std::setprecision(3) << total_request_run_time - << std::endl; - outputFile << "num decoding steps: " - << profile_info.llm_decoding_steps << std::endl; - outputFile << "token IDs: "; - for (int i = 0; i < request.tokens.size(); i++) { - outputFile << request.tokens[i]; - if (i < request.tokens.size() - 1) { - outputFile << ","; - } - } - outputFile << std::endl; - outputFile << output; + outputFile << "[Profile] guid(" << request.guid + << ") llm_decoding_steps(" + << profile_info.llm_decoding_steps << ") latency(" + << std::fixed << std::setprecision(3) + << (profile_info.finish_time - profile_info.start_time) + << ")\n"; + // outputFile << "end-to-end latency: " << std::fixed + // << std::setprecision(3) << total_request_run_time + // << std::endl; + // outputFile << "num decoding steps: " + // << profile_info.llm_decoding_steps << std::endl; + // outputFile << "token IDs: "; + // for (int i = 0; i < request.tokens.size(); i++) { + // outputFile << request.tokens[i]; + // if (i < request.tokens.size() - 1) { + // outputFile << ","; + // } + // } + // outputFile << std::endl; + // outputFile << output; + // outputFile << std::endl; outputFile.close(); } else { std::cout << "Unable to open the output file: " << output_filepath @@ -603,8 +619,18 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].prompt_phase = false; } else { // Prompt phase + assert(old_bc.requestsInfo[i].prompt_phase == true); + int space_for_incr_dec_requests = 0; + for (int ii = i + 1; i < BatchConfig::max_requests_per_batch(); + ii++) { + if (!old_bc.request_completed[ii] && + !old_bc.requestsInfo[ii].prompt_phase) { + space_for_incr_dec_requests++; + } + } new_bc.requestsInfo[i].num_tokens_in_batch = std::min( - get_max_tokens_per_batch() - new_bc.num_tokens, + get_max_tokens_per_batch() - new_bc.num_tokens - + space_for_incr_dec_requests, (int)request.tokens.size() - new_bc.requestsInfo[i].first_token_depth_in_request); new_bc.requestsInfo[i].prompt_phase = true; @@ -733,7 +759,25 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } } } - + // pid_t pid = getpid(); + // std::string filenamen = "new_bc_" + std::to_string(pid) + ".txt"; + // std::ofstream filen(filenamen); + // if (filen.is_open()) { + // filen << new_bc << std::endl; + // filen.close(); + // std::cout << "String written to file: " << filenamen << std::endl; + // } else { + // std::cout << "Unable to open file: " << filenamen << std::endl; + // } + // std::string filenameo = "old_bc_" + std::to_string(pid) + ".txt"; + // std::ofstream fileo(filenameo); + // if (fileo.is_open()) { + // fileo << old_bc << std::endl; + // fileo.close(); + // std::cout << "String written to file: " << filenameo << std::endl; + // } else { + // std::cout << "Unable to open file: " << filenameo << std::endl; + // } return new_bc; } @@ -905,21 +949,27 @@ BeamSearchBatchConfig if (!output_filepath.empty()) { std::ofstream outputFile(output_filepath, std::ios::app); if (outputFile.is_open()) { - outputFile << "end-to-end latency: " << std::fixed - << std::setprecision(3) << total_request_run_time - << std::endl; - outputFile << "num decoding steps: " - << profile_info.llm_decoding_steps << std::endl; - outputFile << "token IDs: 
"; - for (int i = 0; i < request.tokens.size(); i++) { - outputFile << request.tokens[i]; - if (i < request.tokens.size() - 1) { - outputFile << ","; - } - } - outputFile << std::endl; - outputFile << output; - + outputFile << "[Profile] guid(" << request.guid + << ") llm_decoding_steps(" + << profile_info.llm_decoding_steps << ") latency(" + << std::fixed << std::setprecision(3) + << (profile_info.finish_time - profile_info.start_time) + << ")\n"; + // outputFile << "end-to-end latency: " << std::fixed + // << std::setprecision(3) << total_request_run_time + // << std::endl; + // outputFile << "num decoding steps: " + // << profile_info.llm_decoding_steps << std::endl; + // outputFile << "token IDs: "; + // for (int i = 0; i < request.tokens.size(); i++) { + // outputFile << request.tokens[i]; + // if (i < request.tokens.size() - 1) { + // outputFile << ","; + // } + // } + // outputFile << std::endl; + // outputFile << output; + // outputFile << std::endl; outputFile.close(); } else { std::cout << "Unable to open the output file: " << output_filepath diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu index 8380d6be73..235d435580 100644 --- a/src/runtime/request_manager.cu +++ b/src/runtime/request_manager.cu @@ -40,8 +40,21 @@ void RequestManager::load_tokens_task( printf("Warning: too many tokens in prompt, only load up to %d tokens\n", BatchConfig::max_tokens_per_batch()); printf("Got: %d tokens\n", batch_config->num_tokens); + + // pid_t pid = getpid(); + // std::string filename = "bc_" + std::to_string(pid) + ".txt"; + // std::ofstream file(filename); + // if (file.is_open()) { + // file << *batch_config << std::endl; + // file.close(); + // std::cout << "String written to file: " << filename << std::endl; + // } else { + // std::cout << "Unable to open file: " << filename << std::endl; + // } + } else if (batch_config->num_tokens > - BatchConfig::max_verify_tokens_per_batch()) { + BatchConfig::max_verify_tokens_per_batch() && + batch_config->get_mode() != INC_DECODING_MODE) { printf("Warning: Speculative decoding. 
too many tokens in prompt, only " "load up to %d tokens\n", BatchConfig::max_verify_tokens_per_batch()); From 40649ee25a2ea36e25b55f37319777c95158af6d Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 11 Apr 2024 17:05:16 -0700 Subject: [PATCH 166/198] fix --- inference/utils/download_peft_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inference/utils/download_peft_model.py b/inference/utils/download_peft_model.py index ad79816f84..596612d8d7 100644 --- a/inference/utils/download_peft_model.py +++ b/inference/utils/download_peft_model.py @@ -1,6 +1,6 @@ #!/usr/bin/env python import flexflow.serve as ff -import argparse +import argparse, os def parse_args(): @@ -15,7 +15,7 @@ def parse_args(): "--cache-folder", type=str, help="Folder to use to store the model(s) assets in FlexFlow format", - default="", + default=os.environ.get("FF_CACHE_PATH", ""), ) parser.add_argument( "--refresh-cache", From 0580d7e6b1ce34048e3e6fbb9572ebc0461c7d14 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 13 Apr 2024 11:52:17 -0700 Subject: [PATCH 167/198] fix to support prompts larger than max tokens per batch --- include/flexflow/batch_config.h | 4 +- include/flexflow/request_manager.h | 2 + src/runtime/batch_config.cc | 6 ++ src/runtime/request_manager.cc | 99 ++++++++++++++++++++---------- 4 files changed, 79 insertions(+), 32 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 3aebfe908d..28fca9067a 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -77,6 +77,8 @@ class BatchConfig { num_tokens_in_batch = 0; max_sequence_length = 0; request_guid = 0; + prompt_phase = false; + batch_config_request_id = -1; peft_model_id = PEFTModelID::NO_ID; peft_bwd = false; } @@ -86,7 +88,7 @@ class BatchConfig { int max_sequence_length; // request id in batch config: - int batch_config_request_id; + int batch_config_request_id = -1; bool prompt_phase = false; RequestGuid request_guid; // PEFT fields diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index f3538c1c68..a7e67487bb 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -167,6 +167,8 @@ class RequestManager { bool is_request_completed(RequestGuid const &guid); void trigger_request_completion_future(RequestGuid const &guid); // Methods for preparing next batches + bool check_inf_req_completion(BatchConfig const &old_bc, int i); + void check_batch(BatchConfig const &old_bc, BatchConfig const &new_bc); BatchConfig prepare_next_batch(BatchConfig const &bc, InferenceResult const &result); BatchConfigFuture prepare_next_batch(BatchConfigFuture const &bc, diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index 588ed61802..027ca7f5c0 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -120,6 +120,8 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) { << std::endl; os << "Number of peft tokens: " << bc.num_active_peft_tokens() << std::endl; os << "Number of requests: " << bc.num_active_requests() << std::endl; + os << "Number of generation tokens: " << bc.num_generation_tokens + << std::endl; // Per-request info os << "Per-request info:\n"; @@ -133,6 +135,10 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) { os << " Number of tokens in batch: " << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; + os << " Prompt phase: " 
<< bc.requestsInfo[i].prompt_phase + << std::endl; + os << " BatchConfig Req ID: " + << bc.requestsInfo[i].batch_config_request_id << std::endl; // PEFT values os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id << std::endl; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 5ec230298a..7eb9be598f 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -444,6 +444,62 @@ BatchConfig RequestManager::prepare_next_batch_task( return rm->prepare_next_batch(*bc, result); } +bool RequestManager::check_inf_req_completion(BatchConfig const &old_bc, + int i) { + Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; + bool request_completed = false; + // printf("model_type = %d\n", this->model_type); + if (request.tokens.size() >= old_bc.requestsInfo[i].max_sequence_length) { + request_completed = true; + } else if (request.tokens.back() == eos_token_id) { + // Encounter EOS token id + request_completed = true; + } + return request_completed; +} + +void RequestManager::check_batch(BatchConfig const &old_bc, + BatchConfig const &new_bc) { + int num_incomplete_prompts = 0; + for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { + if (new_bc.request_completed[i]) { + continue; + } + // ensure there is no request with zero tokens + assert(new_bc.requestsInfo[i].num_tokens_in_batch > 0); + // ensure there is no more than one incomplete prompt + if (new_bc.requestsInfo[i].prompt_phase && + new_bc.requestsInfo[i].num_tokens_in_batch + + new_bc.requestsInfo[i].first_token_depth_in_request < + all_requests[new_bc.requestsInfo[i].request_guid].tokens.size()) { + num_incomplete_prompts++; + } + } + if (num_incomplete_prompts > 1) { + std::cout << "Error: more than one incomplete prompt in the batch\n"; + pid_t pid = getpid(); + std::string filenamen = "new_bc_" + std::to_string(pid) + ".txt"; + std::ofstream filen(filenamen); + if (filen.is_open()) { + filen << new_bc << std::endl; + filen.close(); + std::cout << "String written to file: " << filenamen << std::endl; + } else { + std::cout << "Unable to open file: " << filenamen << std::endl; + } + std::string filenameo = "old_bc_" + std::to_string(pid) + ".txt"; + std::ofstream fileo(filenameo); + if (fileo.is_open()) { + fileo << old_bc << std::endl; + fileo.close(); + std::cout << "String written to file: " << filenameo << std::endl; + } else { + std::cout << "Unable to open file: " << filenameo << std::endl; + } + assert(false); + } +} + BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, InferenceResult const &result) { const std::lock_guard lock(request_queue_mutex); @@ -518,15 +574,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, old_bc.requestsInfo[i].first_token_depth_in_request + old_bc.requestsInfo[i].num_tokens_in_batch; assert(processed_tokens < request.tokens.size()); - bool request_completed = false; - // printf("model_type = %d\n", this->model_type); - if (request.tokens.size() >= - old_bc.requestsInfo[i].max_sequence_length) { - request_completed = true; - } else if (request.tokens.back() == eos_token_id) { - // Encounter EOS token id - request_completed = true; - } + bool request_completed = check_inf_req_completion(old_bc, i); if (request_completed) { std::string output = this->tokenizer_->Decode(request.tokens); // Unlike Huggingface, the sentencepiece C++ library automatically @@ -621,10 +669,18 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, // Prompt phase 
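         // Invariant assumed by the code below: a request reaches this branch
         // only while its prompt is still being prefilled (see the assert on
         // prompt_phase), and check_batch() later verifies that at most one
         // such incomplete prompt exists per batch. The prompt's token budget
         // is the per-batch token cap minus tokens already scheduled in this
         // batch, minus one slot reserved for every decoding-phase request at
         // a higher batch index, so those requests are not starved by a long
         // prompt.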
assert(old_bc.requestsInfo[i].prompt_phase == true); int space_for_incr_dec_requests = 0; - for (int ii = i + 1; i < BatchConfig::max_requests_per_batch(); + // If the prompt can't fit in the batch, compute how much space we + // need to leave out for incomplete requests in decoding phase at + // higher indices. + for (int ii = i + 1; ii < BatchConfig::max_requests_per_batch(); ii++) { - if (!old_bc.request_completed[ii] && - !old_bc.requestsInfo[ii].prompt_phase) { + if (old_bc.request_completed[ii]) { + continue; + } + Request &old_request = + all_requests[old_bc.requestsInfo[ii].request_guid]; + bool req_completed = check_inf_req_completion(old_bc, ii); + if (!req_completed) { space_for_incr_dec_requests++; } } @@ -759,25 +815,6 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } } } - // pid_t pid = getpid(); - // std::string filenamen = "new_bc_" + std::to_string(pid) + ".txt"; - // std::ofstream filen(filenamen); - // if (filen.is_open()) { - // filen << new_bc << std::endl; - // filen.close(); - // std::cout << "String written to file: " << filenamen << std::endl; - // } else { - // std::cout << "Unable to open file: " << filenamen << std::endl; - // } - // std::string filenameo = "old_bc_" + std::to_string(pid) + ".txt"; - // std::ofstream fileo(filenameo); - // if (fileo.is_open()) { - // fileo << old_bc << std::endl; - // fileo.close(); - // std::cout << "String written to file: " << filenameo << std::endl; - // } else { - // std::cout << "Unable to open file: " << filenameo << std::endl; - // } return new_bc; } From 0affe2748d13fa4109e814af725d15fb551f9bee Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 13 Apr 2024 21:09:27 -0700 Subject: [PATCH 168/198] fixes to support benchmarking of finetuning throughput --- include/flexflow/batch_config.h | 2 +- include/flexflow/request_manager.h | 10 + inference/peft/peft.cc | 43 ++- src/runtime/request_manager.cc | 522 ++++++++++++++++------------- 4 files changed, 333 insertions(+), 244 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 28fca9067a..ade519cd38 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -59,7 +59,7 @@ class BatchConfig { // Maximum possible values for different parameters // These maximum values are used for copying BatchConfig // across workers - static int const MAX_NUM_REQUESTS = 64; + static int const MAX_NUM_REQUESTS = 65; static int const MAX_NUM_TOKENS = 1024; static int const MAX_SPEC_TREE_TOKEN_NUM = 64; diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index a7e67487bb..524d4828ec 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -78,7 +78,9 @@ struct Request { std::vector beam_trees; // PEFT field RequestType req_type = REQ_INFERENCE; + size_t processed_finetuning_tokens = 0; int completed_training_steps = 0; + int dataset_entry_processed_tokens = 0; int max_training_steps = 1; int benchmarking_tokens = -1; std::string dataset_filepath; @@ -132,6 +134,9 @@ class RequestManager { void set_max_sequence_length(int max_seq_length); void push_spec_infer_tree_width(int tree_width); int get_max_sequence_length(); + void set_enable_peft_finetuning(bool enable_peft_finetuning_); + void set_disable_peft_bwd(bool disable_peft_bwd_); + static void set_inference_finished(); int register_ssm_model(FFModel *model); void register_tokenizer(ModelType model_type, int bos_token_id, @@ -278,6 +283,11 @@ class RequestManager { int 
max_sequence_length; Status request_manager_status; + // peft benchmarking + bool enable_peft_finetuning = false; + bool disable_peft_bwd = false; + static bool inference_finished; + // tree width in each speculative step, if not specified 1 std::vector spec_infer_tree_width; diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index a6fd3b99b0..5c96709be7 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -50,7 +50,9 @@ void parse_input_args(char **argv, int &max_requests_per_batch, int &max_tokens_per_batch, int &max_sequence_length, - int &max_requests_to_run) { + int &max_requests_to_run, + bool &enable_peft_finetuning, + bool &disable_peft_bwd) { for (int i = 1; i < argc; i++) { // llm model type if (!strcmp(argv[i], "-llm-model")) { @@ -123,6 +125,14 @@ void parse_input_args(char **argv, max_requests_to_run = std::stoi(argv[++i]); continue; } + if (!strcmp(argv[i], "-enable-peft-finetuning")) { + enable_peft_finetuning = true; + continue; + } + if (!strcmp(argv[i], "-disable-peft-bwd")) { + disable_peft_bwd = true; + continue; + } } if (paths.cache_folder_path.empty()) { paths.cache_folder_path = "~/.cache/flexflow"; @@ -154,6 +164,8 @@ void FlexFlow::top_level_task(Task const *task, int max_tokens_per_batch = 128; int max_sequence_length = 256; int max_requests_to_run = 1000000000; + bool enable_peft_finetuning = false; + bool disable_peft_bwd = false; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -172,7 +184,9 @@ void FlexFlow::top_level_task(Task const *task, max_requests_per_batch, max_tokens_per_batch, max_sequence_length, - max_requests_to_run); + max_requests_to_run, + enable_peft_finetuning, + disable_peft_bwd); assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * ffconfig.pipeline_parallelism_degree == ffconfig.numNodes * ffconfig.workersPerNode); @@ -242,12 +256,16 @@ void FlexFlow::top_level_task(Task const *task, GenerationConfig generationConfig(do_sample, temperature, topp); RequestManager *rm = RequestManager::get_request_manager(); - rm->set_max_requests_per_batch(max_requests_per_batch); + rm->set_max_requests_per_batch( + max_requests_per_batch + + (int)enable_peft_finetuning); // add one slot for finetuning if needed rm->set_max_tokens_per_batch(max_tokens_per_batch); rm->set_max_sequence_length(max_sequence_length); rm->register_tokenizer( model_type, bos_token_id, eos_token_id, tokenizer_filepath); rm->register_output_filepath(file_paths.output_file_path); + rm->set_enable_peft_finetuning(enable_peft_finetuning); + rm->set_disable_peft_bwd(disable_peft_bwd); FFModel model(ffconfig, ffconfig.cpu_offload); if (model_type == ModelType::LLAMA) { @@ -334,16 +352,17 @@ void FlexFlow::top_level_task(Task const *task, total_num_requests++; } - // // Add fine-tuning request - // Request fine_tuning_req; - // fine_tuning_req.req_type = RequestType::REQ_FINETUNING; - // fine_tuning_req.max_sequence_length = 128; - // fine_tuning_req.peft_model_id = - // (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + // Add fine-tuning request + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.benchmarking_tokens = 1024; + fine_tuning_req.max_sequence_length = 1024; + fine_tuning_req.peft_model_id = + (peft_model_id != nullptr) ? 
*peft_model_id : PEFTModelID::NO_ID; // fine_tuning_req.dataset_filepath = file_paths.prompt_file_path; - // fine_tuning_req.max_training_steps = 1; - // requests.push_back(fine_tuning_req); - // total_num_requests++; + fine_tuning_req.max_training_steps = 1000000000; + requests.push_back(fine_tuning_req); + total_num_requests++; std::vector result = model.generate(requests); } diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 7eb9be598f..b8ca019d3f 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -89,6 +89,8 @@ std::ostream &operator<<(std::ostream &os, Request const &req) { return os; } +bool RequestManager::inference_finished = false; + RequestManager::RequestManager() : request_manager_status(INITIALIZED), verbose(false), next_available_guid(1000000), num_processed_requests(0), @@ -160,6 +162,18 @@ void RequestManager::push_spec_infer_tree_width(int tree_width) { spec_infer_tree_width.emplace_back(tree_width); } +void RequestManager::set_enable_peft_finetuning(bool enable_peft_finetuning_) { + enable_peft_finetuning = enable_peft_finetuning_; +} + +void RequestManager::set_disable_peft_bwd(bool disable_peft_bwd_) { + disable_peft_bwd = disable_peft_bwd_; +} + +void RequestManager::set_inference_finished() { + inference_finished = true; +} + void RequestManager::register_tokenizer(ModelType type, int bos_token_id, int eos_token_id, @@ -315,31 +329,40 @@ RequestManager::RequestGuid request.dataset_filepath = request_.dataset_filepath; // Load dataset - using json = nlohmann::json; - std::ifstream file_handle(request.dataset_filepath); - assert(file_handle.good() && "Dataset file does not exist."); - json dataset_json = json::parse(file_handle, - /*parser_callback_t */ nullptr, - /*allow_exceptions */ true, - /*ignore_comments */ true); - - for (auto &prompt : dataset_json) { - std::string text = prompt.get(); - std::string output_text(""); - std::vector input_tokens; - input_tokens = this->tokenizer_->Encode(text); - if (bos_token_id >= 0 && model_type != ModelType::FALCON) { - input_tokens.insert(input_tokens.begin(), bos_token_id); - } - std::vector output_tokens = this->tokenizer_->Encode(output_text); - if (input_tokens.size() + output_tokens.size() > - get_max_sequence_length()) { - std::cout << "Warning: too many tokens in sample, only load up to " - << get_max_sequence_length() << " tokens, but got " - << input_tokens.size() + output_tokens.size() << ".\n"; - return INVALID_GUID; - } else { - request.dataset.push_back(std::make_pair(input_tokens, output_tokens)); + if (request_.benchmarking_tokens >= 0) { + assert(request_.benchmarking_tokens == get_max_sequence_length()); + request.benchmarking_tokens = request_.benchmarking_tokens; + request.tokens.insert(request.tokens.end(), + request_.benchmarking_tokens, + 15); // insert random number + } else { + using json = nlohmann::json; + std::ifstream file_handle(request.dataset_filepath); + assert(file_handle.good() && "Dataset file does not exist."); + json dataset_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + + for (auto &prompt : dataset_json) { + std::string text = prompt.get(); + std::string output_text(""); + std::vector input_tokens; + input_tokens = this->tokenizer_->Encode(text); + if (bos_token_id >= 0 && model_type != ModelType::FALCON) { + input_tokens.insert(input_tokens.begin(), bos_token_id); + } + std::vector output_tokens = + this->tokenizer_->Encode(output_text); + if 
(input_tokens.size() + output_tokens.size() > + get_max_sequence_length()) { + std::cout << "Warning: too many tokens in sample, only load up to " + << get_max_sequence_length() << " tokens, but got " + << input_tokens.size() + output_tokens.size() << ".\n"; + return INVALID_GUID; + } else { + request.dataset.push_back(std::make_pair(input_tokens, output_tokens)); + } } } @@ -504,15 +527,13 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, InferenceResult const &result) { const std::lock_guard lock(request_queue_mutex); // Step 1: append result from previous iteration to request's tokens - for (int i = 0; i < old_bc.num_tokens; i++) { + for (int i = 0; i < old_bc.num_active_infr_tokens(); i++) { size_t guid = old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid; Request &request = all_requests[guid]; - if (request.req_type == RequestType::REQ_FINETUNING) { - // No new tokens generated when in fine-tuning mode - continue; - } else if (old_bc.tokensInfo[i].abs_depth_in_request + 1 < - request.tokens.size()) { + assert(request.req_type == RequestType::REQ_INFERENCE && + "Found misplaced finetuning request"); + if (old_bc.tokensInfo[i].abs_depth_in_request + 1 < request.tokens.size()) { // This is a prompt token continue; } else { @@ -525,192 +546,146 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, // log_req_mgr.print("Output: %s", output.c_str()); } } + int num_generation_tokens = 0; int num_active_req = -1; + // when finetuning is enabled, the last entry in the batch cannot be used for + // inference + int inference_batch_size = + BatchConfig::max_requests_per_batch() - (int)enable_peft_finetuning; + // Step 2: prepare the next batch for existing requests BatchConfig new_bc; - for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { + for (int i = 0; i < inference_batch_size; i++) { if (old_bc.request_completed[i]) { // no need to carry over tokens to new batch for this request continue; } else { assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; - - if (request.req_type == RequestType::REQ_FINETUNING) { - // fine-tuning requests don't automatically carry over to the next - // batch, we only do so if there is space left after adding new - // inference requests - request.completed_training_steps += 1; - assert(request.completed_training_steps <= request.max_training_steps); - if (request.completed_training_steps == request.max_training_steps) { - // check if the fine tuning request has completed - request.status = Request::COMPLETED; - trigger_request_completion_future(request.guid); - log_req_mgr.print("[Done] guid(%zu) completed_training_steps(%d)", - old_bc.requestsInfo[i].request_guid, - request.completed_training_steps); + assert(request.req_type == RequestType::REQ_INFERENCE && + "Found misplaced finetuning request"); + + int processed_tokens = + old_bc.requestsInfo[i].first_token_depth_in_request + + old_bc.requestsInfo[i].num_tokens_in_batch; + assert(processed_tokens < request.tokens.size()); + bool request_completed = check_inf_req_completion(old_bc, i); + if (request_completed) { + std::string output = this->tokenizer_->Decode(request.tokens); + // Unlike Huggingface, the sentencepiece C++ library automatically + // removes the BOS token + if (model_type == ModelType::LLAMA && + request.tokens.at(0) == bos_token_id) { + output = " " + output; + } + { + // update generation result GenerationResult &gr = 
request_generation_results[request.guid]; assert(gr.guid == request.guid); - num_processed_requests++; - ProfileInfo profile_info = profiling_requests[request.guid]; - profile_info.finish_time = - Realm::Clock::current_time_in_microseconds(); - total_request_run_time += - profile_info.finish_time - profile_info.start_time; - profiling_requests[request.guid] = profile_info; - log_req_mgr.print( - "[Profile] guid(%zu) completed_training_steps(%d) start(%.1lf) " - "finish(%.1lf) latency(%.1lf)", - request.guid, - request.completed_training_steps, - profile_info.start_time, - profile_info.finish_time, - profile_info.finish_time - profile_info.start_time); + gr.output_tokens = request.tokens; + gr.output_text = output; } - } else { - int processed_tokens = - old_bc.requestsInfo[i].first_token_depth_in_request + - old_bc.requestsInfo[i].num_tokens_in_batch; - assert(processed_tokens < request.tokens.size()); - bool request_completed = check_inf_req_completion(old_bc, i); - if (request_completed) { - std::string output = this->tokenizer_->Decode(request.tokens); - // Unlike Huggingface, the sentencepiece C++ library automatically - // removes the BOS token - if (model_type == ModelType::LLAMA && - request.tokens.at(0) == bos_token_id) { - output = " " + output; - } - { - // update generation result - GenerationResult &gr = request_generation_results[request.guid]; - assert(gr.guid == request.guid); - gr.output_tokens = request.tokens; - gr.output_text = output; - } - request.status = Request::COMPLETED; - trigger_request_completion_future(request.guid); - log_req_mgr.print("[Done] guid(%zu) final_length(%zu)", - old_bc.requestsInfo[i].request_guid, - request.tokens.size()); - log_req_mgr.print("Final output: %s", output.c_str()); - num_processed_requests++; - ProfileInfo profile_info = profiling_requests[request.guid]; - profile_info.finish_time = - Realm::Clock::current_time_in_microseconds(); - total_request_run_time += - profile_info.finish_time - profile_info.start_time; - profiling_requests[request.guid] = profile_info; - log_req_mgr.print( - "[Profile] guid(%zu) llm_decoding_steps(%d) start(%.1lf) " - "finish(%.1lf) latency(%.1lf)", - request.guid, - profile_info.llm_decoding_steps, - profile_info.start_time, - profile_info.finish_time, - profile_info.finish_time - profile_info.start_time); - // Write output to file if needed: - if (!output_filepath.empty()) { - std::ofstream outputFile(output_filepath, std::ios::app); - if (outputFile.is_open()) { - outputFile << "[Profile] guid(" << request.guid - << ") llm_decoding_steps(" - << profile_info.llm_decoding_steps << ") latency(" - << std::fixed << std::setprecision(3) - << (profile_info.finish_time - profile_info.start_time) - << ")\n"; - // outputFile << "end-to-end latency: " << std::fixed - // << std::setprecision(3) << total_request_run_time - // << std::endl; - // outputFile << "num decoding steps: " - // << profile_info.llm_decoding_steps << std::endl; - // outputFile << "token IDs: "; - // for (int i = 0; i < request.tokens.size(); i++) { - // outputFile << request.tokens[i]; - // if (i < request.tokens.size() - 1) { - // outputFile << ","; - // } - // } - // outputFile << std::endl; - // outputFile << output; - // outputFile << std::endl; - outputFile.close(); - } else { - std::cout << "Unable to open the output file: " << output_filepath - << std::endl; - assert(false); - } + request.status = Request::COMPLETED; + trigger_request_completion_future(request.guid); + log_req_mgr.print("[Done] guid(%zu) final_length(%zu)", + 
old_bc.requestsInfo[i].request_guid, + request.tokens.size()); + log_req_mgr.print("Final output: %s", output.c_str()); + num_processed_requests++; + ProfileInfo profile_info = profiling_requests[request.guid]; + profile_info.finish_time = Realm::Clock::current_time_in_microseconds(); + total_request_run_time += + profile_info.finish_time - profile_info.start_time; + profiling_requests[request.guid] = profile_info; + log_req_mgr.print( + "[Profile] guid(%zu) llm_decoding_steps(%d) start(%.1lf) " + "finish(%.1lf) latency(%.1lf)", + request.guid, + profile_info.llm_decoding_steps, + profile_info.start_time, + profile_info.finish_time, + profile_info.finish_time - profile_info.start_time); + // Write output to file if needed: + if (!output_filepath.empty()) { + std::ofstream outputFile(output_filepath, std::ios::app); + if (outputFile.is_open()) { + outputFile << "[Profile] guid(" << request.guid + << ") llm_decoding_steps(" + << profile_info.llm_decoding_steps << ") latency(" + << std::fixed << std::setprecision(3) + << (profile_info.finish_time - profile_info.start_time) + << ")\n"; + outputFile.close(); + } else { + std::cout << "Unable to open the output file: " << output_filepath + << std::endl; + assert(false); } - + } + } else { + new_bc.request_completed[i] = false; + new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens; + new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; + new_bc.requestsInfo[i].request_guid = + old_bc.requestsInfo[i].request_guid; + new_bc.requestsInfo[i].peft_model_id = + old_bc.requestsInfo[i].peft_model_id; + new_bc.requestsInfo[i].peft_bwd = old_bc.requestsInfo[i].peft_bwd; + new_bc.requestsInfo[i].max_sequence_length = + old_bc.requestsInfo[i].max_sequence_length; + num_active_req++; + new_bc.requestsInfo[num_active_req].batch_config_request_id = i; + if (new_bc.requestsInfo[i].first_token_depth_in_request + 1 == + request.tokens.size()) { + // Incremental phase + new_bc.requestsInfo[i].num_tokens_in_batch = 1; + num_generation_tokens++; + new_bc.requestsInfo[i].prompt_phase = false; } else { - new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].first_token_depth_in_request = - processed_tokens; - new_bc.requestsInfo[i].first_token_offset_in_batch = - new_bc.num_tokens; - new_bc.requestsInfo[i].request_guid = - old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].peft_model_id = - old_bc.requestsInfo[i].peft_model_id; - new_bc.requestsInfo[i].peft_bwd = old_bc.requestsInfo[i].peft_bwd; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; - num_active_req++; - new_bc.requestsInfo[num_active_req].batch_config_request_id = i; - if (new_bc.requestsInfo[i].first_token_depth_in_request + 1 == - request.tokens.size()) { - // Incremental phase - new_bc.requestsInfo[i].num_tokens_in_batch = 1; - num_generation_tokens++; - new_bc.requestsInfo[i].prompt_phase = false; - } else { - // Prompt phase - assert(old_bc.requestsInfo[i].prompt_phase == true); - int space_for_incr_dec_requests = 0; - // If the prompt can't fit in the batch, compute how much space we - // need to leave out for incomplete requests in decoding phase at - // higher indices. 
- for (int ii = i + 1; ii < BatchConfig::max_requests_per_batch(); - ii++) { - if (old_bc.request_completed[ii]) { - continue; - } - Request &old_request = - all_requests[old_bc.requestsInfo[ii].request_guid]; - bool req_completed = check_inf_req_completion(old_bc, ii); - if (!req_completed) { - space_for_incr_dec_requests++; - } + // Prompt phase + assert(old_bc.requestsInfo[i].prompt_phase == true); + int space_for_incr_dec_requests = 0; + // If the prompt can't fit in the batch, compute how much space we + // need to leave out for incomplete requests in decoding phase at + // higher indices. + for (int ii = i + 1; ii < inference_batch_size; ii++) { + if (old_bc.request_completed[ii]) { + continue; + } + Request &old_request = + all_requests[old_bc.requestsInfo[ii].request_guid]; + bool req_completed = check_inf_req_completion(old_bc, ii); + if (!req_completed) { + space_for_incr_dec_requests++; } - new_bc.requestsInfo[i].num_tokens_in_batch = std::min( - get_max_tokens_per_batch() - new_bc.num_tokens - - space_for_incr_dec_requests, - (int)request.tokens.size() - - new_bc.requestsInfo[i].first_token_depth_in_request); - new_bc.requestsInfo[i].prompt_phase = true; - } - for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { - int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; - new_bc.tokensInfo[new_bc.num_tokens].request_index = i; - new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; - assert(depth < request.tokens.size()); - new_bc.tokensInfo[new_bc.num_tokens].token_id = - request.tokens[depth]; - new_bc.num_tokens++; } - // Update profiling - profiling_requests[new_bc.requestsInfo[i].request_guid] - .llm_decoding_steps++; + new_bc.requestsInfo[i].num_tokens_in_batch = + std::min(get_max_tokens_per_batch() - new_bc.num_tokens - + space_for_incr_dec_requests, + (int)request.tokens.size() - + new_bc.requestsInfo[i].first_token_depth_in_request); + new_bc.requestsInfo[i].prompt_phase = true; + } + for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { + int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; + new_bc.tokensInfo[new_bc.num_tokens].request_index = i; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; + assert(depth < request.tokens.size()); + new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[depth]; + new_bc.num_tokens++; } + // Update profiling + profiling_requests[new_bc.requestsInfo[i].request_guid] + .llm_decoding_steps++; } } } new_bc.num_generation_tokens = num_generation_tokens; // Step 3: add new requests to the next batch if there is space - for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { + for (int i = 0; i < inference_batch_size; i++) { if (new_bc.request_completed[i]) { if (!pending_infr_request_queue.empty() && new_bc.num_tokens < get_max_tokens_per_batch()) { @@ -754,65 +729,143 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } } + if (enable_peft_finetuning && + !old_bc.request_completed[inference_batch_size]) { + assert(old_bc.requestsInfo[inference_batch_size].num_tokens_in_batch > 0); + Request &request = + all_requests[old_bc.requestsInfo[inference_batch_size].request_guid]; + assert(request.req_type == RequestType::REQ_FINETUNING && + "Found misplaced inference request"); + + request.dataset_entry_processed_tokens += + old_bc.requestsInfo[inference_batch_size].num_tokens_in_batch; + request.processed_finetuning_tokens += + old_bc.requestsInfo[inference_batch_size].num_tokens_in_batch; + int 
dataset_entry = + request.completed_training_steps % request.dataset.size(); + if (old_bc.requestsInfo[inference_batch_size].first_token_depth_in_request + + old_bc.requestsInfo[inference_batch_size].num_tokens_in_batch == + request.dataset[dataset_entry].first.size()) { + // completed the current dataset entry + assert(request.dataset_entry_processed_tokens == + request.dataset[dataset_entry].first.size()); + request.completed_training_steps += 1; + request.dataset_entry_processed_tokens = 0; + } + + assert(request.completed_training_steps <= request.max_training_steps); + if (request.completed_training_steps == request.max_training_steps || + inference_finished) { + // check if the fine tuning request has completed + request.status = Request::COMPLETED; + trigger_request_completion_future(request.guid); + GenerationResult &gr = request_generation_results[request.guid]; + assert(gr.guid == request.guid); + num_processed_requests++; + + ProfileInfo profile_info = profiling_requests[request.guid]; + profile_info.finish_time = Realm::Clock::current_time_in_microseconds(); + total_request_run_time += + profile_info.finish_time - profile_info.start_time; + profiling_requests[request.guid] = profile_info; + log_req_mgr.print("[Finetuning] guid(%zu) completed_training_steps(%d) " + "processed_finetuning_tokens(%lu) latency(%.1lf)", + request.guid, + request.completed_training_steps, + request.processed_finetuning_tokens, + profile_info.finish_time - profile_info.start_time); + if (!output_filepath.empty()) { + std::ofstream outputFile(output_filepath, std::ios::app); + if (outputFile.is_open()) { + outputFile << "[Finetuning] guid(" << request.guid + << ") completed_training_steps(" + << request.completed_training_steps + << ") processed_finetuning_tokens(" + << request.processed_finetuning_tokens << ") latency(" + << std::fixed << std::setprecision(3) + << (profile_info.finish_time - profile_info.start_time) + << ")\n"; + outputFile.close(); + } else { + std::cout << "Unable to open the output file: " << output_filepath + << std::endl; + assert(false); + } + } + } + } + // Step 4: add PEFT bwd requests, if there is additional space while (pending_peft_request_queue.size() > 0) { Request &request = pending_peft_request_queue.front(); - assert(request.req_type = RequestType::REQ_FINETUNING); + // assert(request.req_type = RequestType::REQ_FINETUNING); Request &all_req_handle = all_requests[request.guid]; - assert(all_req_handle.req_type = RequestType::REQ_FINETUNING); + // assert(all_req_handle.req_type = RequestType::REQ_FINETUNING); if (all_req_handle.status == Request::COMPLETED) { pending_peft_request_queue.pop(); } else { break; } } - if (pending_peft_request_queue.size() > 0) { + + if (pending_peft_request_queue.size() > 0 && !inference_finished) { Request &request = pending_peft_request_queue.front(); assert(request.req_type = RequestType::REQ_FINETUNING); assert(request.dataset.size() > 0); // update status and training steps Request &all_req_handle = all_requests[request.guid]; assert(all_req_handle.req_type = RequestType::REQ_FINETUNING); + request.completed_training_steps = all_req_handle.completed_training_steps; + request.processed_finetuning_tokens = + all_req_handle.processed_finetuning_tokens; request.status = all_req_handle.status; + int dataset_entry = + request.completed_training_steps % request.dataset.size(); + request.dataset_entry_processed_tokens = + all_req_handle.dataset_entry_processed_tokens; + assert(request.status != Request::COMPLETED); 
assert(request.max_training_steps > 0 && request.completed_training_steps < request.max_training_steps); - int num_peft_tokens = request.dataset[0].first.size(); - int num_peft_label_tokens = request.dataset[0].second.size(); - if (num_peft_tokens + new_bc.num_active_tokens() <= - get_max_tokens_per_batch()) { - // The last request slot is reserved for PEFT request - int peft_req_idx = get_max_requests_per_batch() - 1; - assert(new_bc.request_completed[peft_req_idx]); - new_bc.request_completed[peft_req_idx] = false; - new_bc.requestsInfo[peft_req_idx].first_token_depth_in_request = 0; - new_bc.requestsInfo[peft_req_idx].first_token_offset_in_batch = - new_bc.num_tokens; - new_bc.requestsInfo[peft_req_idx].num_tokens_in_batch = num_peft_tokens; - new_bc.requestsInfo[peft_req_idx].max_sequence_length = + assert(request.dataset_entry_processed_tokens <= + request.dataset[dataset_entry].first.size()); + + int num_peft_tokens = + min((int)request.dataset[dataset_entry].first.size() - + request.dataset_entry_processed_tokens, + get_max_tokens_per_batch() - new_bc.num_active_infr_tokens()); + int num_peft_label_tokens = request.dataset[dataset_entry].second.size(); + assert(num_peft_label_tokens == 0); + + if (num_peft_tokens > 0) { + assert(new_bc.request_completed[inference_batch_size]); + // request info + new_bc.request_completed[inference_batch_size] = false; + new_bc.requestsInfo[inference_batch_size].first_token_depth_in_request = + request.dataset_entry_processed_tokens; + new_bc.requestsInfo[inference_batch_size].first_token_offset_in_batch = + new_bc.num_active_infr_tokens(); + new_bc.requestsInfo[inference_batch_size].num_tokens_in_batch = + num_peft_tokens; + new_bc.requestsInfo[inference_batch_size].max_sequence_length = request.max_sequence_length; - new_bc.requestsInfo[peft_req_idx].request_guid = request.guid; - new_bc.requestsInfo[peft_req_idx].peft_model_id = request.peft_model_id; - new_bc.requestsInfo[peft_req_idx].peft_bwd = true; - for (size_t i = 0; i < request.dataset[0].first.size(); i++) { + new_bc.requestsInfo[inference_batch_size].request_guid = request.guid; + new_bc.requestsInfo[inference_batch_size].peft_model_id = + request.peft_model_id; + new_bc.requestsInfo[inference_batch_size].peft_bwd = true; + // tokens info + for (size_t i = request.dataset_entry_processed_tokens; + i < request.dataset_entry_processed_tokens + num_peft_tokens; + i++) { new_bc.tokensInfo[new_bc.num_tokens].token_id = - request.dataset[0].first[i]; - new_bc.tokensInfo[new_bc.num_tokens].request_index = peft_req_idx; + request.dataset[dataset_entry].first[i]; + new_bc.tokensInfo[new_bc.num_tokens].request_index = + inference_batch_size; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = i; new_bc.num_tokens++; new_bc.num_peft_tokens++; } - for (size_t i = 0; i < request.dataset[0].second.size(); i++) { - new_bc.labelsInfo[new_bc.num_peft_label_tokens].token_id = - request.dataset[0].second[i]; - new_bc.labelsInfo[new_bc.num_peft_label_tokens].request_index = - peft_req_idx; - int depth = request.dataset[0].first.size() + i; - new_bc.labelsInfo[new_bc.num_peft_label_tokens].abs_depth_in_request = - depth; - new_bc.num_peft_label_tokens++; - } } } return new_bc; @@ -2568,21 +2621,28 @@ std::vector> std::vector FFModel::generate(std::vector const &requests) { RequestManager *rm = RequestManager::get_request_manager(); - std::vector guids; + std::vector inf_guids, peft_guids; for (int i = 0; i < requests.size(); i++) { RequestManager::RequestGuid guid; if (requests.at(i).req_type == 
RequestType::REQ_INFERENCE) { guid = rm->register_new_request(requests.at(i)); + if (guid != RequestManager::INVALID_GUID) { + inf_guids.push_back(guid); + } } else { guid = rm->register_new_peft_request(requests.at(i)); - } - if (guid != RequestManager::INVALID_GUID) { - guids.push_back(guid); + if (guid != RequestManager::INVALID_GUID) { + peft_guids.push_back(guid); + } } } std::vector results; - for (int i = 0; i < guids.size(); i++) { - results.push_back(rm->get_generation_result(guids[i])); + for (int i = 0; i < inf_guids.size(); i++) { + results.push_back(rm->get_generation_result(inf_guids[i])); + } + rm->set_inference_finished(); + for (int i = 0; i < peft_guids.size(); i++) { + results.push_back(rm->get_generation_result(peft_guids[i])); } return results; } @@ -2740,7 +2800,7 @@ void RequestManager::serve_incr_decoding(FFModel *llm) { BatchConfigFuture bcf = prepare_next_batch(next_batch.first, next_batch.second, ctx, runtime); FutureMap fm = im->inference(llm, 0, bcf); - if (llm->config.enable_peft) { + if (llm->config.enable_peft && !disable_peft_bwd) { im->peft_bwd(llm, 0, bcf); } assert(fm.get_future_map_domain().get_volume() == 1); From d7ebeaf689f0c8d105aebd3984fcdd3f1e144690 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 14 Apr 2024 19:44:45 -0700 Subject: [PATCH 169/198] many upgrades and updates related to finetuning --- .../ops/add_bias_residual_layer_norm.h | 1 + .../ops/inc_multihead_self_attention.h | 1 + include/flexflow/ops/kernels/linear_kernels.h | 1 + .../ops/kernels/lora_linear_kernels.h | 1 + .../ops/kernels/residual_rms_norm_kernels.h | 1 + .../flexflow/ops/kernels/rms_norm_kernels.h | 1 + include/flexflow/ops/layer_norm.h | 1 + include/flexflow/ops/residual_layer_norm.h | 1 + include/flexflow/ops/sigmoid_silu_multi.h | 1 + src/ops/add_bias_residual_layer_norm.cu | 13 +++++++--- src/ops/inc_multihead_self_attention.cu | 25 ++++++++++++++----- src/ops/kernels/linear_kernels.cu | 14 ++++++++--- src/ops/kernels/lora_linear_kernels.cu | 24 ++++++++++++++---- src/ops/kernels/residual_rms_norm_kernels.cu | 13 +++++++--- src/ops/kernels/rms_norm_kernels.cu | 13 +++++++--- src/ops/layer_norm.cu | 13 +++++++--- src/ops/residual_layer_norm.cu | 13 +++++++--- src/ops/sigmoid_silu_multi.cpp | 1 + src/ops/sigmoid_silu_multi.cu | 12 ++++++--- src/runtime/request_manager.cc | 23 +++++++++++------ 20 files changed, 134 insertions(+), 39 deletions(-) diff --git a/include/flexflow/ops/add_bias_residual_layer_norm.h b/include/flexflow/ops/add_bias_residual_layer_norm.h index 08b7404e14..9510ac0f28 100644 --- a/include/flexflow/ops/add_bias_residual_layer_norm.h +++ b/include/flexflow/ops/add_bias_residual_layer_norm.h @@ -159,6 +159,7 @@ class AddBiasResidualLayerNormMeta : public OpMeta { Realm::RegionInstance reserveInst; // PEFT related fields void *input_activation; + size_t allocated_peft_buffer_size = 0; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index 69f2b8bb6a..f77df7c456 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -222,6 +222,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { // PEFT specific fields void *softmax_activation_buffer; void *query_activation_buffer; + size_t allocated_peft_buffer_size1 = 0, allocated_peft_buffer_size2 = 0; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/kernels/linear_kernels.h 
b/include/flexflow/ops/kernels/linear_kernels.h index bcce9a947a..90e50a0c9a 100644 --- a/include/flexflow/ops/kernels/linear_kernels.h +++ b/include/flexflow/ops/kernels/linear_kernels.h @@ -37,6 +37,7 @@ class LinearMeta : public OpMeta { Realm::RegionInstance reserveInst; // PEFT related fields void *output_activation_buffer; + size_t allocated_peft_buffer_size = 0; }; namespace Kernels { diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h index 739b94ed22..32608abce2 100644 --- a/include/flexflow/ops/kernels/lora_linear_kernels.h +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -23,6 +23,7 @@ class LoraLinearMeta : public OpMeta { void *low_rank_activation; void *input_activation; std::unordered_map model_weights; + size_t allocated_peft_buffer_size1 = 0, allocated_peft_buffer_size2 = 0; }; namespace Kernels { diff --git a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h index dfc9937cc3..fd4e0ecf1d 100644 --- a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h @@ -39,6 +39,7 @@ class ResidualRMSNormMeta : public OpMeta { Realm::RegionInstance reserveInst; // PEFT related fields void *input_activation; + size_t allocated_peft_buffer_size = 0; }; namespace Kernels { diff --git a/include/flexflow/ops/kernels/rms_norm_kernels.h b/include/flexflow/ops/kernels/rms_norm_kernels.h index 46297764ec..475b6d94ed 100644 --- a/include/flexflow/ops/kernels/rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/rms_norm_kernels.h @@ -38,6 +38,7 @@ class RMSNormMeta : public OpMeta { Realm::RegionInstance reserveInst; // PEFT related fields void *input_activation; + size_t allocated_peft_buffer_size = 0; }; namespace Kernels { diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h index 17aa4dd504..b5e9538ea6 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -151,6 +151,7 @@ class LayerNormMeta : public OpMeta { Realm::RegionInstance reserveInst; // PEFT related fields void *input_activation; + size_t allocated_peft_buffer_size = 0; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/residual_layer_norm.h b/include/flexflow/ops/residual_layer_norm.h index a028097905..33a8e8be51 100644 --- a/include/flexflow/ops/residual_layer_norm.h +++ b/include/flexflow/ops/residual_layer_norm.h @@ -145,6 +145,7 @@ class ResidualLayerNormMeta : public OpMeta { Realm::RegionInstance reserveInst; // PEFT related fields void *input_activation; + size_t allocated_peft_buffer_size = 0; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/sigmoid_silu_multi.h b/include/flexflow/ops/sigmoid_silu_multi.h index 28e3bfed3e..ac60ff15dd 100644 --- a/include/flexflow/ops/sigmoid_silu_multi.h +++ b/include/flexflow/ops/sigmoid_silu_multi.h @@ -110,6 +110,7 @@ class SigmoidSiluMultiMeta : public OpMeta { Realm::RegionInstance reserveInst; // PEFT related fields void *input_activation; + size_t allocated_peft_buffer_size = 0; }; }; // namespace FlexFlow diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu index 505806a2b9..bcca1ba2c6 100644 --- a/src/ops/add_bias_residual_layer_norm.cu +++ b/src/ops/add_bias_residual_layer_norm.cu @@ -45,6 +45,7 @@ AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta( data_type_size(data_type) * effective_batch_size); bias_ptr = 
gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * effective_batch_size); + allocated_peft_buffer_size = 0; } AddBiasResidualLayerNormMeta::~AddBiasResidualLayerNormMeta(void) { @@ -221,12 +222,18 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { - MemoryAllocator *allocator = m->handle.peft_activation_allocator; - m->input_activation = allocator->allocate_instance_untyped( - data_type_size(m->input_type[0]) * num_peft_tokens * in_dim); + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } // copy input activation if (m->input_type[0] == DT_FLOAT) { checkCUDA(cudaMemcpyAsync( diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 83712232bd..d1b93cb206 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1495,12 +1495,18 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; // Copy query to m->query_activation_buffer if we need to compute // PEFT backward if (bc->requestsInfo[i].peft_bwd) { - MemoryAllocator *allocator = m->handle.peft_activation_allocator; - m->query_activation_buffer = allocator->allocate_instance_untyped( - sizeof(DT) * total_tokens * m->num_q_heads * m->qProjSize); + size_t activation_size_needed = + sizeof(DT) * max_peft_tokens * m->num_q_heads * m->qProjSize; + if (activation_size_needed > m->allocated_peft_buffer_size1) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->query_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size1 = activation_size_needed; + } int parallelism = m->hidden_size * num_tokens; store_query_cache<<requestsInfo[i].peft_bwd) { DT *C_softmax = static_cast
(m->qk_prods_softmax); - MemoryAllocator *allocator = m->handle.peft_activation_allocator; - m->softmax_activation_buffer = allocator->allocate_instance_untyped( - sizeof(DT) * total_tokens * num_new_tokens * m->num_q_heads); + size_t activation_size_needed = + sizeof(DT) * max_peft_tokens * max_peft_tokens * m->num_q_heads; + if (activation_size_needed > m->allocated_peft_buffer_size2) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->softmax_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size2 = activation_size_needed; + } checkCUDA(cudaMemcpyAsync(m->softmax_activation_buffer, C_softmax, sizeof(DT) * total_tokens * num_new_tokens * @@ -2131,6 +2142,8 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( gpu_mem_allocator.reserved_allocated_size); } } + allocated_peft_buffer_size1 = 0; + allocated_peft_buffer_size2 = 0; cudaStreamSynchronize(stream); } diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index a3f5c797de..b2e0d3dbad 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -63,6 +63,8 @@ LinearMeta::LinearMeta(FFHandler handler, // Allocate descriptors checkCUDNN(cudnnCreateActivationDescriptor(&actiDesc)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); + + allocated_peft_buffer_size = 0; } LinearMeta::~LinearMeta(void) { @@ -237,11 +239,17 @@ void inference_kernel_wrapper(LinearMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; int first_token_offset = bc->requestsInfo[i].num_tokens_in_batch; if (bc->requestsInfo[i].peft_bwd) { - MemoryAllocator *allocator = m->handle.peft_activation_allocator; - m->output_activation_buffer = allocator->allocate_instance_untyped( - data_type_size(m->output_type[0]) * num_peft_tokens * out_dim); + size_t activation_size_needed = + data_type_size(m->output_type[0]) * max_peft_tokens * out_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->output_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } // copy output activation if (m->output_type[0] == DT_FLOAT) { checkCUDA(cudaMemcpyAsync( diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 9cd5d2ecfa..55751d96ba 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -21,7 +21,10 @@ namespace FlexFlow { LoraLinearMeta::LoraLinearMeta(FFHandler handler, LoraLinear const *li) - : OpMeta(handler, li) {} + : OpMeta(handler, li) { + allocated_peft_buffer_size1 = 0; + allocated_peft_buffer_size2 = 0; +} LoraLinearMeta::~LoraLinearMeta(void) {} @@ -180,6 +183,7 @@ void inference_kernel(LoraLinearMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; assert(m->model_weights.find(bc->requestsInfo[i].peft_model_id) != m->model_weights.end()); @@ -188,11 +192,21 @@ void inference_kernel(LoraLinearMeta *m, int rank = weight.rank; void *intermediate_result_ptr = nullptr; if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed1 = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + 
size_t activation_size_needed2 = + data_type_size(m->input_type[1]) * max_peft_tokens * rank; MemoryAllocator *allocator = m->handle.peft_activation_allocator; - m->input_activation = allocator->allocate_instance_untyped( - data_type_size(m->input_type[0]) * num_peft_tokens * in_dim); - m->low_rank_activation = allocator->allocate_instance_untyped( - data_type_size(m->input_type[1]) * num_peft_tokens * rank); + if (activation_size_needed1 > m->allocated_peft_buffer_size1) { + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed1); + m->allocated_peft_buffer_size1 = activation_size_needed1; + } + if (activation_size_needed2 > m->allocated_peft_buffer_size2) { + m->low_rank_activation = + allocator->allocate_instance_untyped(activation_size_needed2); + m->allocated_peft_buffer_size2 = activation_size_needed2; + } // copy input activation checkCUDA(cudaMemcpyAsync(m->input_activation, input_ptr + first_token_offset * in_dim, diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index 4b92e70787..0d44f0260a 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -45,6 +45,7 @@ ResidualRMSNormMeta::ResidualRMSNormMeta(FFHandler handler, rms_ptr_size * data_type_size(data_type)); norm_ptr = gpu_mem_allocator.allocate_instance_untyped( norm_ptr_size * data_type_size(data_type)); + allocated_peft_buffer_size = 0; } ResidualRMSNormMeta::~ResidualRMSNormMeta(void) { if (reserveInst != Realm::RegionInstance::NO_INST) { @@ -269,12 +270,18 @@ void inference_kernel_wrapper(ResidualRMSNormMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { - MemoryAllocator *allocator = m->handle.peft_activation_allocator; - m->input_activation = allocator->allocate_instance_untyped( - data_type_size(m->input_type[0]) * num_peft_tokens * in_dim); + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } // copy input activation if (m->input_type[0] == DT_FLOAT) { checkCUDA(cudaMemcpyAsync( diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index b11e954622..dd6ada864d 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -44,6 +44,7 @@ RMSNormMeta::RMSNormMeta(FFHandler handler, rms_ptr_size * data_type_size(data_type)); norm_ptr = gpu_mem_allocator.allocate_instance_untyped( norm_ptr_size * data_type_size(data_type)); + allocated_peft_buffer_size = 0; } RMSNormMeta::~RMSNormMeta(void) { if (reserveInst != Realm::RegionInstance::NO_INST) { @@ -224,12 +225,18 @@ void inference_kernel_wrapper(RMSNormMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { - MemoryAllocator 
*allocator = m->handle.peft_activation_allocator; - m->input_activation = allocator->allocate_instance_untyped( - data_type_size(m->input_type[0]) * num_peft_tokens * in_dim); + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } if (input.data_type == DT_FLOAT) { checkCUDA(cudaMemcpyAsync( diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index bfbb2faae9..0801d11617 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -50,6 +50,7 @@ LayerNormMeta::LayerNormMeta(FFHandler handle, data_type_size(data_type) * effective_batch_size); bias_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * effective_batch_size); + allocated_peft_buffer_size = 0; } LayerNormMeta::~LayerNormMeta(void) { @@ -254,12 +255,18 @@ void LayerNorm::inference_kernel_wrapper(LayerNormMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { - MemoryAllocator *allocator = m->handle.peft_activation_allocator; - m->input_activation = allocator->allocate_instance_untyped( - data_type_size(m->input_type[0]) * num_peft_tokens * in_dim); + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } // copy input activation if (m->input_type[0] == DT_FLOAT) { checkCUDA(cudaMemcpyAsync( diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index 5e736cd6e8..8cdf87a92c 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -46,6 +46,7 @@ ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, data_type_size(data_type) * effective_batch_size); bias_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * effective_batch_size); + allocated_peft_buffer_size = 0; } ResidualLayerNormMeta::~ResidualLayerNormMeta(void) { @@ -277,12 +278,18 @@ void ResidualLayerNorm::inference_kernel_wrapper( continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { - MemoryAllocator *allocator = m->handle.peft_activation_allocator; - m->input_activation = allocator->allocate_instance_untyped( - data_type_size(m->input_type[0]) * num_peft_tokens * in_dim); + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + 
m->allocated_peft_buffer_size = activation_size_needed; + } // copy input activation if (m->input_type[0] == DT_FLOAT) { checkCUDA(cudaMemcpyAsync( diff --git a/src/ops/sigmoid_silu_multi.cpp b/src/ops/sigmoid_silu_multi.cpp index 0a9a814f5e..bbf27db745 100644 --- a/src/ops/sigmoid_silu_multi.cpp +++ b/src/ops/sigmoid_silu_multi.cpp @@ -26,6 +26,7 @@ SigmoidSiluMultiMeta::SigmoidSiluMultiMeta(FFHandler handle, : OpMeta(handle, ssm) { profiling = ssm->profiling; inference_debugging = ssm->inference_debugging; + allocated_peft_buffer_size = 0; } SigmoidSiluMultiMeta::~SigmoidSiluMultiMeta(void) { diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu index e3b6f7a69a..929d557a17 100644 --- a/src/ops/sigmoid_silu_multi.cu +++ b/src/ops/sigmoid_silu_multi.cu @@ -129,13 +129,19 @@ void SigmoidSiluMulti::inference_kernel_wrapper( continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { - MemoryAllocator *allocator = m->handle.peft_activation_allocator; size_t input_tensor_size = data_type_size(m->input_type[0]) * num_peft_tokens * in_dim; - m->input_activation = - allocator->allocate_instance_untyped(2 * input_tensor_size); + size_t activation_size_needed = + 2 * data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } // copy input activation if (m->input_type[0] == DT_FLOAT) { checkCUDA(cudaMemcpyAsync(m->input_activation, diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index b8ca019d3f..26922e2e95 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -315,6 +315,7 @@ RequestManager::RequestGuid RequestManager::RequestGuid RequestManager::register_new_peft_request(Request const &request_) { + assert(enable_peft_finetuning && "PEFT finetuning is not enabled"); const std::lock_guard lock(request_queue_mutex); // Add a new request Request request; @@ -330,11 +331,18 @@ RequestManager::RequestGuid // Load dataset if (request_.benchmarking_tokens >= 0) { - assert(request_.benchmarking_tokens == get_max_sequence_length()); + assert(request_.benchmarking_tokens <= get_max_sequence_length()); request.benchmarking_tokens = request_.benchmarking_tokens; - request.tokens.insert(request.tokens.end(), - request_.benchmarking_tokens, - 15); // insert random number + std::vector input_tokens; + std::vector output_tokens; + bool bos_added = (bos_token_id >= 0 && model_type != ModelType::FALCON); + if (bos_added) { + input_tokens.push_back(bos_token_id); + } + input_tokens.insert(input_tokens.end(), + request_.benchmarking_tokens - (int)bos_added, + 15); // insert random number + request.dataset.push_back(std::make_pair(input_tokens, output_tokens)); } else { using json = nlohmann::json; std::ifstream file_handle(request.dataset_filepath); @@ -527,12 +535,13 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, InferenceResult const &result) { const std::lock_guard lock(request_queue_mutex); // Step 1: append result from previous iteration to request's tokens - for (int i = 0; i < old_bc.num_active_infr_tokens(); i++) { + for (int i = 0; i < 
old_bc.num_active_tokens(); i++) { size_t guid = old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid; Request &request = all_requests[guid]; - assert(request.req_type == RequestType::REQ_INFERENCE && - "Found misplaced finetuning request"); + if (request.req_type == RequestType::REQ_FINETUNING) { + continue; + } if (old_bc.tokensInfo[i].abs_depth_in_request + 1 < request.tokens.size()) { // This is a prompt token continue; From 33e873daa0de872eddf5cd31df7a45ee0a3c408a Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 15 Apr 2024 08:48:30 -0700 Subject: [PATCH 170/198] add ttft statistics --- include/flexflow/request_manager.h | 2 ++ src/runtime/request_manager.cc | 46 +++++++++++++++++++++--------- 2 files changed, 34 insertions(+), 14 deletions(-) diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 524d4828ec..ddf798d456 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -328,6 +328,8 @@ class RequestManager { int llm_decoding_steps; int ssm_decoding_steps; double start_time, finish_time; + double registration_time, first_token_time; + bool first_token_time_set = false; }; std::unordered_map profiling_requests; double total_request_run_time; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 26922e2e95..1d1d98fce9 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -310,6 +310,11 @@ RequestManager::RequestGuid gr.output_text = request_.prompt; gr.output_tokens = request.tokens; request_generation_results[request.guid] = gr; + + ProfileInfo profile_info; + profile_info.registration_time = Realm::Clock::current_time_in_microseconds(); + profiling_requests[request.guid] = profile_info; + return request.guid; } @@ -415,6 +420,11 @@ RequestManager::RequestGuid // gr.output_text = prompt; // gr.output_tokens = request.tokens; request_generation_results[request.guid] = gr; + + ProfileInfo profile_info; + profile_info.registration_time = Realm::Clock::current_time_in_microseconds(); + profiling_requests[request.guid] = profile_info; + return request.guid; } @@ -546,9 +556,14 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, // This is a prompt token continue; } else { + // This is a decoding token assert(old_bc.tokensInfo[i].abs_depth_in_request + 1 == request.tokens.size()); - // This is a decoding token + if (!profiling_requests[guid].first_token_time_set) { + profiling_requests[guid].first_token_time = + Realm::Clock::current_time_in_microseconds(); + profiling_requests[guid].first_token_time_set = true; + } log_req_mgr.print("Output token is: %d", result.token_ids[i]); request.tokens.push_back(result.token_ids[i]); // std::string output = this->tokenizer_->Decode(request.tokens); @@ -610,12 +625,13 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, profiling_requests[request.guid] = profile_info; log_req_mgr.print( "[Profile] guid(%zu) llm_decoding_steps(%d) start(%.1lf) " - "finish(%.1lf) latency(%.1lf)", + "finish(%.1lf) latency(%.1lf) ttft(%.1lf)", request.guid, profile_info.llm_decoding_steps, profile_info.start_time, profile_info.finish_time, - profile_info.finish_time - profile_info.start_time); + profile_info.finish_time - profile_info.start_time, + profile_info.first_token_time - profile_info.registration_time); // Write output to file if needed: if (!output_filepath.empty()) { std::ofstream outputFile(output_filepath, std::ios::app); @@ -625,6 +641,9 @@ BatchConfig 
RequestManager::prepare_next_batch(BatchConfig const &old_bc, << profile_info.llm_decoding_steps << ") latency(" << std::fixed << std::setprecision(3) << (profile_info.finish_time - profile_info.start_time) + << ") ttft(" << std::fixed << std::setprecision(3) + << (profile_info.first_token_time - + profile_info.registration_time) << ")\n"; outputFile.close(); } else { @@ -717,11 +736,10 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].prompt_phase = true; num_active_req++; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; - // add profile_info for the new request - ProfileInfo profile_info; - profile_info.llm_decoding_steps = 1; - profile_info.start_time = Realm::Clock::current_time_in_microseconds(); - profiling_requests[new_request.guid] = profile_info; + // add start time to profile_info for the new request + profiling_requests[new_request.guid].llm_decoding_steps = 1; + profiling_requests[new_request.guid].start_time = + Realm::Clock::current_time_in_microseconds(); for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; new_bc.tokensInfo[new_bc.num_tokens].request_index = i; @@ -1233,13 +1251,13 @@ BeamSearchBatchConfig new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // add profile_info for the new request - ProfileInfo profile_info; - profile_info.llm_decoding_steps = 0; - profile_info.ssm_decoding_steps = 0; - profile_info.start_time = Realm::Clock::current_time_in_microseconds(); - profiling_requests[new_request.guid] = profile_info; + profiling_requests[new_request.guid].llm_decoding_steps = 0; + profiling_requests[new_request.guid].ssm_decoding_steps = 0; + profiling_requests[new_request.guid].start_time = + Realm::Clock::current_time_in_microseconds(); // init the beam search metadata per request - int ssm_decoding_steps = profile_info.ssm_decoding_steps; + int ssm_decoding_steps = + profiling_requests[new_request.guid].ssm_decoding_steps; new_bc.beamRequestsInfo[i].beam_size = spec_infer_tree_width.size() > ssm_decoding_steps From 2f92a650289fa7ae2d4e0f201df27f1a31767e47 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 15 Apr 2024 11:23:40 -0700 Subject: [PATCH 171/198] add warmup phase --- include/flexflow/request_manager.h | 3 ++- inference/peft/peft.cc | 32 ++++++++++++++++++++++++++---- src/runtime/request_manager.cc | 29 +++++++++++++++------------ 3 files changed, 46 insertions(+), 18 deletions(-) diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index ddf798d456..e8e2e7eefc 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -83,6 +83,7 @@ struct Request { int dataset_entry_processed_tokens = 0; int max_training_steps = 1; int benchmarking_tokens = -1; + bool warmup = false; std::string dataset_filepath; std::vector, std::vector>> @@ -136,7 +137,7 @@ class RequestManager { int get_max_sequence_length(); void set_enable_peft_finetuning(bool enable_peft_finetuning_); void set_disable_peft_bwd(bool disable_peft_bwd_); - static void set_inference_finished(); + static void set_inference_finished(bool finished = true); int register_ssm_model(FFModel *model); void register_tokenizer(ModelType model_type, int bos_token_id, diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index 5c96709be7..030bf8167d 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -314,7 +314,34 @@ void FlexFlow::top_level_task(Task 
const *task, // Start background server rm->start_background_server(&model); - int total_num_requests = 0; + // Warmup stage + { + std::vector requests; + for (int i = 0; i < 100; i++) { + Request inference_req; + inference_req.benchmarking_tokens = 256; + inference_req.max_sequence_length = 1024; + inference_req.warmup = true; + inference_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + requests.push_back(inference_req); + } + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.benchmarking_tokens = 1024; + fine_tuning_req.max_sequence_length = 1024; + fine_tuning_req.warmup = true; + fine_tuning_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + fine_tuning_req.max_training_steps = 1; + requests.push_back(fine_tuning_req); + std::vector result = model.generate(requests); + } + + rm->set_inference_finished(false); // reset inference finished flag + std::cout << "----------warmup finished--------------" << std::endl; + + // Run workload { std::vector requests; @@ -349,7 +376,6 @@ void FlexFlow::top_level_task(Task const *task, inference_req.peft_model_id = (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; requests.push_back(inference_req); - total_num_requests++; } // Add fine-tuning request @@ -362,7 +388,6 @@ void FlexFlow::top_level_task(Task const *task, // fine_tuning_req.dataset_filepath = file_paths.prompt_file_path; fine_tuning_req.max_training_steps = 1000000000; requests.push_back(fine_tuning_req); - total_num_requests++; std::vector result = model.generate(requests); } @@ -380,7 +405,6 @@ void FlexFlow::top_level_task(Task const *task, free(peft_model_id); } - // float* data std::cout << "----------inference finished--------------" << std::endl; // free tokenizer space in memory diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 1d1d98fce9..96b481edf0 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -170,8 +170,8 @@ void RequestManager::set_disable_peft_bwd(bool disable_peft_bwd_) { disable_peft_bwd = disable_peft_bwd_; } -void RequestManager::set_inference_finished() { - inference_finished = true; +void RequestManager::set_inference_finished(bool finished) { + inference_finished = finished; } void RequestManager::register_tokenizer(ModelType type, @@ -250,6 +250,7 @@ RequestManager::RequestGuid request.guid = next_available_guid++; request.max_sequence_length = request_.max_sequence_length; request.peft_model_id = request_.peft_model_id; + request.warmup = request_.warmup; if (bos_token_id >= 0 && model_type != ModelType::FALCON) { request.tokens.push_back(bos_token_id); } @@ -333,6 +334,7 @@ RequestManager::RequestGuid request.completed_training_steps = 0; request.max_training_steps = request_.max_training_steps; request.dataset_filepath = request_.dataset_filepath; + request.warmup = request_.warmup; // Load dataset if (request_.benchmarking_tokens >= 0) { @@ -623,21 +625,22 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, total_request_run_time += profile_info.finish_time - profile_info.start_time; profiling_requests[request.guid] = profile_info; - log_req_mgr.print( - "[Profile] guid(%zu) llm_decoding_steps(%d) start(%.1lf) " - "finish(%.1lf) latency(%.1lf) ttft(%.1lf)", - request.guid, - profile_info.llm_decoding_steps, - profile_info.start_time, - profile_info.finish_time, - profile_info.finish_time - profile_info.start_time, - 
profile_info.first_token_time - profile_info.registration_time); + log_req_mgr.print("[%s] guid(%zu) llm_decoding_steps(%d) start(%.1lf) " + "finish(%.1lf) latency(%.1lf) ttft(%.1lf)", + request.warmup ? "Warmup" : "Profile", + request.guid, + profile_info.llm_decoding_steps, + profile_info.start_time, + profile_info.finish_time, + profile_info.finish_time - profile_info.start_time, + profile_info.first_token_time - + profile_info.registration_time); // Write output to file if needed: if (!output_filepath.empty()) { std::ofstream outputFile(output_filepath, std::ios::app); if (outputFile.is_open()) { - outputFile << "[Profile] guid(" << request.guid - << ") llm_decoding_steps(" + outputFile << "[" << (request.warmup ? "Warmup" : "Profile") + << "] guid(" << request.guid << ") llm_decoding_steps(" << profile_info.llm_decoding_steps << ") latency(" << std::fixed << std::setprecision(3) << (profile_info.finish_time - profile_info.start_time) From b1e97b190067c983308472a46a5b3cf4ec86bb7c Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 15 Apr 2024 22:50:03 -0700 Subject: [PATCH 172/198] add benchmarking code --- inference/peft/CMakeLists.txt | 95 ++++++- inference/peft/peft.cc | 4 +- inference/peft/peft_bwd_benchmark.cc | 403 +++++++++++++++++++++++++++ inference/peft/peft_fwd_benchmark.cc | 375 +++++++++++++++++++++++++ src/runtime/request_manager.cc | 4 +- 5 files changed, 864 insertions(+), 17 deletions(-) create mode 100644 inference/peft/peft_bwd_benchmark.cc create mode 100644 inference/peft/peft_fwd_benchmark.cc diff --git a/inference/peft/CMakeLists.txt b/inference/peft/CMakeLists.txt index 4547907176..9595f691f6 100644 --- a/inference/peft/CMakeLists.txt +++ b/inference/peft/CMakeLists.txt @@ -1,10 +1,10 @@ cmake_minimum_required(VERSION 3.10) project(FlexFlow_Peft) -set(project_target peft) - -set(CPU_SRC +# Normal PEFT +set(project_target1 peft) +set(CPU_SRC1 ${FLEXFLOW_CPP_DRV_SRC} peft.cc ../models/llama.cc @@ -14,25 +14,92 @@ set(CPU_SRC ../models/mpt.cc) if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") - cuda_add_executable(${project_target} ${CPU_SRC}) + cuda_add_executable(${project_target1} ${CPU_SRC1}) + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + target_compile_definitions(${project_target1} PRIVATE __HIP_PLATFORM_NVIDIA__) + endif() +elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC1} PROPERTIES LANGUAGE HIP) + hip_add_executable(${project_target1} ${CPU_SRC1}) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is empty!") + endif() + set_property(TARGET ${project_target1} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + target_compile_definitions(${project_target1} PRIVATE __HIP_PLATFORM_AMD__) +else() + message(FATAL_ERROR "Compilation of ${project_target1} for ${FF_GPU_BACKEND} backend not yet supported") +endif() + +target_include_directories(${project_target1} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target1} PRIVATE ${CMAKE_SOURCE_DIR}/inference) +target_link_libraries(${project_target1} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) +set(BIN_DEST "bin") +install(TARGETS ${project_target1} DESTINATION ${BIN_DEST}) + +# FWD benchmark +set(project_target2 peft_fwd_benchmark) +set(CPU_SRC2 + ${FLEXFLOW_CPP_DRV_SRC} + peft_fwd_benchmark.cc + ../models/llama.cc + ../models/opt.cc + ../models/falcon.cc + ../models/starcoder.cc + ../models/mpt.cc) + +if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND 
STREQUAL "hip_cuda") + cuda_add_executable(${project_target2} ${CPU_SRC2}) if (FF_GPU_BACKEND STREQUAL "hip_cuda") - target_compile_definitions(${project_target} PRIVATE __HIP_PLATFORM_NVIDIA__) + target_compile_definitions(${project_target2} PRIVATE __HIP_PLATFORM_NVIDIA__) endif() elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") - set_source_files_properties(${CPU_SRC} PROPERTIES LANGUAGE HIP) - hip_add_executable(${project_target} ${CPU_SRC}) + set_source_files_properties(${CPU_SRC2} PROPERTIES LANGUAGE HIP) + hip_add_executable(${project_target2} ${CPU_SRC2}) if (FF_HIP_ARCH STREQUAL "") message(FATAL_ERROR "FF_HIP_ARCH is empty!") endif() - set_property(TARGET ${project_target} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") - target_compile_definitions(${project_target} PRIVATE __HIP_PLATFORM_AMD__) + set_property(TARGET ${project_target2} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + target_compile_definitions(${project_target2} PRIVATE __HIP_PLATFORM_AMD__) else() - message(FATAL_ERROR "Compilation of ${project_target} for ${FF_GPU_BACKEND} backend not yet supported") + message(FATAL_ERROR "Compilation of ${project_target2} for ${FF_GPU_BACKEND} backend not yet supported") endif() -target_include_directories(${project_target} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) -target_include_directories(${project_target} PRIVATE ${CMAKE_SOURCE_DIR}/inference) -target_link_libraries(${project_target} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) +target_include_directories(${project_target2} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target2} PRIVATE ${CMAKE_SOURCE_DIR}/inference) +target_link_libraries(${project_target2} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) +set(BIN_DEST "bin") +install(TARGETS ${project_target2} DESTINATION ${BIN_DEST}) + +# BWD benchmark +set(project_target3 peft_bwd_benchmark) +set(CPU_SRC3 + ${FLEXFLOW_CPP_DRV_SRC} + peft_bwd_benchmark.cc + ../models/llama.cc + ../models/opt.cc + ../models/falcon.cc + ../models/starcoder.cc + ../models/mpt.cc) + +if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + cuda_add_executable(${project_target3} ${CPU_SRC3}) + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + target_compile_definitions(${project_target3} PRIVATE __HIP_PLATFORM_NVIDIA__) + endif() +elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC3} PROPERTIES LANGUAGE HIP) + hip_add_executable(${project_target3} ${CPU_SRC3}) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is empty!") + endif() + set_property(TARGET ${project_target3} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + target_compile_definitions(${project_target3} PRIVATE __HIP_PLATFORM_AMD__) +else() + message(FATAL_ERROR "Compilation of ${project_target3} for ${FF_GPU_BACKEND} backend not yet supported") +endif() +target_include_directories(${project_target3} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target3} PRIVATE ${CMAKE_SOURCE_DIR}/inference) +target_link_libraries(${project_target3} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) set(BIN_DEST "bin") -install(TARGETS ${project_target} DESTINATION ${BIN_DEST}) +install(TARGETS ${project_target3} DESTINATION ${BIN_DEST}) diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index 030bf8167d..ab2f9496bf 100644 --- a/inference/peft/peft.cc +++ 
b/inference/peft/peft.cc @@ -319,8 +319,8 @@ void FlexFlow::top_level_task(Task const *task, std::vector requests; for (int i = 0; i < 100; i++) { Request inference_req; - inference_req.benchmarking_tokens = 256; - inference_req.max_sequence_length = 1024; + inference_req.benchmarking_tokens = 128; + inference_req.max_sequence_length = 256; inference_req.warmup = true; inference_req.peft_model_id = (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; diff --git a/inference/peft/peft_bwd_benchmark.cc b/inference/peft/peft_bwd_benchmark.cc new file mode 100644 index 0000000000..a5f451350e --- /dev/null +++ b/inference/peft/peft_bwd_benchmark.cc @@ -0,0 +1,403 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/inference.h" +#include "flexflow/request_manager.h" +#include "models/falcon.h" +#include "models/llama.h" +#include "models/mpt.h" +#include "models/opt.h" +#include "models/starcoder.h" +#include + +#include + +using namespace FlexFlow; +using namespace Legion; +using json = nlohmann::json; + +LegionRuntime::Logger::Category log_app("llama"); + +struct FilePaths { + std::string cache_folder_path; + std::string prompt_file_path; + std::string output_file_path; +}; + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + std::string &llm_model_name, + std::string &peft_model_name, + bool &use_full_precision, + bool &verbose, + bool &do_sample, + bool &enable_peft, + float &temperature, + float &topp, + int &max_requests_per_batch, + int &max_tokens_per_batch, + int &max_sequence_length, + int &max_requests_to_run, + bool &enable_peft_finetuning, + bool &disable_peft_bwd) { + for (int i = 1; i < argc; i++) { + // llm model type + if (!strcmp(argv[i], "-llm-model")) { + llm_model_name = std::string(argv[++i]); + for (char &c : llm_model_name) { + c = std::tolower(c); + } + continue; + } + if (!strcmp(argv[i], "-enable-peft")) { + enable_peft = true; + continue; + } + if (!strcmp(argv[i], "-peft-model")) { + peft_model_name = std::string(argv[++i]); + for (char &c : peft_model_name) { + c = std::tolower(c); + } + continue; + } + // cache folder + if (!strcmp(argv[i], "-cache-folder")) { + paths.cache_folder_path = std::string(argv[++i]); + continue; + } + // prompts + if (!strcmp(argv[i], "-prompt")) { + paths.prompt_file_path = std::string(argv[++i]); + continue; + } + // output file + if (!strcmp(argv[i], "-output-file")) { + paths.output_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--use-full-precision")) { + use_full_precision = true; + continue; + } + // verbose logging to stdout + if (!strcmp(argv[i], "--verbose")) { + verbose = true; + continue; + } + if (!strcmp(argv[i], "--do-sample")) { + do_sample = true; + continue; + } + if (!strcmp(argv[i], "--temperature")) { + temperature = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--topp")) { + topp = std::stof(argv[++i]); + continue; + } + if 
(!strcmp(argv[i], "--max-requests-per-batch")) { + max_requests_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-batch")) { + max_tokens_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-sequence-length")) { + max_sequence_length = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-requests-to-run")) { + max_requests_to_run = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "-enable-peft-finetuning")) { + enable_peft_finetuning = true; + continue; + } + if (!strcmp(argv[i], "-disable-peft-bwd")) { + disable_peft_bwd = true; + continue; + } + } + if (paths.cache_folder_path.empty()) { + paths.cache_folder_path = "~/.cache/flexflow"; + } + // Expand ~ to the home directory if needed + wordexp_t p; + wordexp(paths.cache_folder_path.c_str(), &p, 0); + paths.cache_folder_path = p.we_wordv[0]; + wordfree(&p); +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + if (ffconfig.cpu_offload == false && ffconfig.quantization_type != DT_NONE) { + assert(false && "Doesn't support quantization in non-offload mode"); + } + FilePaths file_paths; + std::string llm_model_name, peft_model_name; + bool use_full_precision = false; + bool verbose = false; + bool do_sample = false; + bool enable_peft = false; + float temperature = 0.0f; + float topp = 0.0f; + int max_requests_per_batch = 8; + int max_tokens_per_batch = 128; + int max_sequence_length = 256; + int max_requests_to_run = 1000000000; + bool enable_peft_finetuning = false; + bool disable_peft_bwd = false; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, + argc, + file_paths, + llm_model_name, + peft_model_name, + use_full_precision, + verbose, + do_sample, + enable_peft, + temperature, + topp, + max_requests_per_batch, + max_tokens_per_batch, + max_sequence_length, + max_requests_to_run, + enable_peft_finetuning, + disable_peft_bwd); + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * + ffconfig.pipeline_parallelism_degree == + ffconfig.numNodes * ffconfig.workersPerNode); + + std::string config_filepath = join_path( + {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"}); + std::string tokenizer_filepath = + join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name}); + std::string weights_filepath = + join_path({file_paths.cache_folder_path, + "weights", + llm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + std::ifstream config_file_handle(config_filepath); + if (!config_file_handle.good()) { + std::cout << "Model config file " << config_filepath << " not found." 
+ << std::endl; + assert(false); + } + if (enable_peft && peft_model_name.empty()) { + std::cout << "PEFT enabled, but no PEFT model id passed" << std::endl; + assert(false); + } else if (!enable_peft && !peft_model_name.empty()) { + std::cout << "PEFT model id passed, but PEFT is not enabled" << std::endl; + assert(false); + } + + json model_config = json::parse(config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + ModelType model_type = ModelType::UNKNOWN; + auto architectures = model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") { + model_type = ModelType::FALCON; + break; + } else if (str == "GPTBigCodeForCausalLM") { + model_type = ModelType::STARCODER; + break; + } else if (str == "MPTForCausalLM") { + model_type = ModelType::MPT; + break; + } + } + int bos_token_id = model_config.find("bos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("bos_token_id"); + int eos_token_id = model_config.find("eos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("eos_token_id"); + + assert(model_type != ModelType::UNKNOWN && + "Invalid LLM model type passed (or no type was passed)."); + + // load PEFT config + LoraLinearConfig peft_config = + peft_model_name.empty() + ? LoraLinearConfig::EmptyConfig + : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); + + GenerationConfig generationConfig(do_sample, temperature, topp); + RequestManager *rm = RequestManager::get_request_manager(); + rm->set_max_requests_per_batch( + max_requests_per_batch + + (int)enable_peft_finetuning); // add one slot for finetuning if needed + rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_sequence_length(max_sequence_length); + rm->register_tokenizer( + model_type, bos_token_id, eos_token_id, tokenizer_filepath); + rm->register_output_filepath(file_paths.output_file_path); + rm->set_enable_peft_finetuning(enable_peft_finetuning); + rm->set_disable_peft_bwd(disable_peft_bwd); + + FFModel model(ffconfig, ffconfig.cpu_offload); + if (model_type == ModelType::LLAMA) { + LLAMA::create_llama_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::OPT) { + OPT::create_opt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::FALCON) { + FALCON::create_falcon_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::STARCODER) { + STARCODER::create_starcoder_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::MPT) { + MPT::create_mpt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else { + assert(false && "unknow model type"); + } + + // Add PEFT layer + PEFTModelID *peft_model_id = nullptr; + if (!peft_model_name.empty()) { + peft_model_id = model.add_lora_layer(peft_config); + } + + // Start background server + rm->start_background_server(&model); + + // Warmup stage + { + std::vector requests; + for (int i = 0; i < 
100; i++) { + Request inference_req; + inference_req.benchmarking_tokens = 128; + inference_req.max_sequence_length = 256; + inference_req.warmup = true; + inference_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + requests.push_back(inference_req); + } + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.benchmarking_tokens = 1024; + fine_tuning_req.max_sequence_length = 1024; + fine_tuning_req.warmup = true; + fine_tuning_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + fine_tuning_req.max_training_steps = 1; + requests.push_back(fine_tuning_req); + std::vector result = model.generate(requests); + } + + rm->set_inference_finished(false); // reset inference finished flag + std::cout << "----------warmup finished--------------" << std::endl; + + // Run workload + { + std::vector requests; + + // Add inference requests + using json = nlohmann::json; + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + std::vector lengths; + int index = 0; + for (auto &entry : prompt_json) { + if (index == max_requests_to_run) { + break; + } + int prompt_length = entry.get(); + assert(prompt_length > 0 && "Prompt length must be greater than 0."); + assert(prompt_length <= 1024 && + "Prompt length must be less than or equal to 1024."); + lengths.push_back(prompt_length); + index++; + } + printf("Total number of finetuning requests: %d", lengths.size()); + + // Add fine-tuning requests + for (int i = 0; i < lengths.size(); i++) { + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.benchmarking_tokens = lengths[i]; + fine_tuning_req.max_sequence_length = lengths[i]; + fine_tuning_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + fine_tuning_req.max_training_steps = 1; + requests.push_back(fine_tuning_req); + } + std::vector result = model.generate(requests); + } + + // terminate the request manager by stopping the background thread + rm->terminate_background_server(); + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + if (peft_model_id != nullptr) { + free(peft_model_id); + } + + std::cout << "----------finetuning finished--------------" << std::endl; + + // free tokenizer space in memory +} + +void FlexFlow::register_custom_tasks() {} diff --git a/inference/peft/peft_fwd_benchmark.cc b/inference/peft/peft_fwd_benchmark.cc new file mode 100644 index 0000000000..215b2f80f4 --- /dev/null +++ b/inference/peft/peft_fwd_benchmark.cc @@ -0,0 +1,375 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/inference.h" +#include "flexflow/request_manager.h" +#include "models/falcon.h" +#include "models/llama.h" +#include "models/mpt.h" +#include "models/opt.h" +#include "models/starcoder.h" +#include + +#include + +using namespace FlexFlow; +using namespace Legion; +using json = nlohmann::json; + +LegionRuntime::Logger::Category log_app("llama"); + +struct FilePaths { + std::string cache_folder_path; + std::string prompt_file_path; + std::string output_file_path; +}; + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + std::string &llm_model_name, + std::string &peft_model_name, + bool &use_full_precision, + bool &verbose, + bool &do_sample, + bool &enable_peft, + float &temperature, + float &topp, + int &max_requests_per_batch, + int &max_tokens_per_batch, + int &max_sequence_length, + int &max_requests_to_run, + bool &enable_peft_finetuning, + bool &disable_peft_bwd) { + for (int i = 1; i < argc; i++) { + // llm model type + if (!strcmp(argv[i], "-llm-model")) { + llm_model_name = std::string(argv[++i]); + for (char &c : llm_model_name) { + c = std::tolower(c); + } + continue; + } + if (!strcmp(argv[i], "-enable-peft")) { + enable_peft = true; + continue; + } + if (!strcmp(argv[i], "-peft-model")) { + peft_model_name = std::string(argv[++i]); + for (char &c : peft_model_name) { + c = std::tolower(c); + } + continue; + } + // cache folder + if (!strcmp(argv[i], "-cache-folder")) { + paths.cache_folder_path = std::string(argv[++i]); + continue; + } + // prompts + if (!strcmp(argv[i], "-prompt")) { + paths.prompt_file_path = std::string(argv[++i]); + continue; + } + // output file + if (!strcmp(argv[i], "-output-file")) { + paths.output_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--use-full-precision")) { + use_full_precision = true; + continue; + } + // verbose logging to stdout + if (!strcmp(argv[i], "--verbose")) { + verbose = true; + continue; + } + if (!strcmp(argv[i], "--do-sample")) { + do_sample = true; + continue; + } + if (!strcmp(argv[i], "--temperature")) { + temperature = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--topp")) { + topp = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-requests-per-batch")) { + max_requests_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-batch")) { + max_tokens_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-sequence-length")) { + max_sequence_length = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-requests-to-run")) { + max_requests_to_run = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "-enable-peft-finetuning")) { + enable_peft_finetuning = true; + continue; + } + if (!strcmp(argv[i], "-disable-peft-bwd")) { + disable_peft_bwd = true; + continue; + } + } + if (paths.cache_folder_path.empty()) { + paths.cache_folder_path = "~/.cache/flexflow"; + } + // Expand ~ to the home directory if needed + wordexp_t p; + wordexp(paths.cache_folder_path.c_str(), &p, 0); + paths.cache_folder_path = p.we_wordv[0]; + wordfree(&p); +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + if (ffconfig.cpu_offload == false && ffconfig.quantization_type != DT_NONE) { + assert(false && "Doesn't support quantization in non-offload mode"); + } + FilePaths file_paths; + std::string llm_model_name, peft_model_name; + bool use_full_precision = false; + bool verbose = 
false; + bool do_sample = false; + bool enable_peft = false; + float temperature = 0.0f; + float topp = 0.0f; + int max_requests_per_batch = 8; + int max_tokens_per_batch = 128; + int max_sequence_length = 256; + int max_requests_to_run = 1000000000; + bool enable_peft_finetuning = false; + bool disable_peft_bwd = false; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, + argc, + file_paths, + llm_model_name, + peft_model_name, + use_full_precision, + verbose, + do_sample, + enable_peft, + temperature, + topp, + max_requests_per_batch, + max_tokens_per_batch, + max_sequence_length, + max_requests_to_run, + enable_peft_finetuning, + disable_peft_bwd); + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * + ffconfig.pipeline_parallelism_degree == + ffconfig.numNodes * ffconfig.workersPerNode); + + std::string config_filepath = join_path( + {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"}); + std::string tokenizer_filepath = + join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name}); + std::string weights_filepath = + join_path({file_paths.cache_folder_path, + "weights", + llm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + std::ifstream config_file_handle(config_filepath); + if (!config_file_handle.good()) { + std::cout << "Model config file " << config_filepath << " not found." + << std::endl; + assert(false); + } + if (enable_peft && peft_model_name.empty()) { + std::cout << "PEFT enabled, but no PEFT model id passed" << std::endl; + assert(false); + } else if (!enable_peft && !peft_model_name.empty()) { + std::cout << "PEFT model id passed, but PEFT is not enabled" << std::endl; + assert(false); + } + + json model_config = json::parse(config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + ModelType model_type = ModelType::UNKNOWN; + auto architectures = model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") { + model_type = ModelType::FALCON; + break; + } else if (str == "GPTBigCodeForCausalLM") { + model_type = ModelType::STARCODER; + break; + } else if (str == "MPTForCausalLM") { + model_type = ModelType::MPT; + break; + } + } + int bos_token_id = model_config.find("bos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("bos_token_id"); + int eos_token_id = model_config.find("eos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("eos_token_id"); + + assert(model_type != ModelType::UNKNOWN && + "Invalid LLM model type passed (or no type was passed)."); + + // load PEFT config + LoraLinearConfig peft_config = + peft_model_name.empty() + ? 
LoraLinearConfig::EmptyConfig + : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); + + GenerationConfig generationConfig(do_sample, temperature, topp); + RequestManager *rm = RequestManager::get_request_manager(); + rm->set_max_requests_per_batch( + max_requests_per_batch + + (int)enable_peft_finetuning); // add one slot for finetuning if needed + rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_sequence_length(max_sequence_length); + rm->register_tokenizer( + model_type, bos_token_id, eos_token_id, tokenizer_filepath); + rm->register_output_filepath(file_paths.output_file_path); + rm->set_enable_peft_finetuning(enable_peft_finetuning); + rm->set_disable_peft_bwd(disable_peft_bwd); + + FFModel model(ffconfig, ffconfig.cpu_offload); + if (model_type == ModelType::LLAMA) { + LLAMA::create_llama_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::OPT) { + OPT::create_opt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::FALCON) { + FALCON::create_falcon_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::STARCODER) { + STARCODER::create_starcoder_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::MPT) { + MPT::create_mpt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else { + assert(false && "unknow model type"); + } + + // Add PEFT layer + PEFTModelID *peft_model_id = nullptr; + if (!peft_model_name.empty()) { + peft_model_id = model.add_lora_layer(peft_config); + } + + // Start background server + rm->start_background_server(&model); + + // Run workload + { + std::vector requests; + + // Add inference requests + using json = nlohmann::json; + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + std::vector> prompts; + int index = 0; + for (auto &entry : prompt_json) { + if (index >= max_requests_to_run) { + break; + } + int prompt_length = entry["human"]; + int sequence_length = entry["gpt"]; + assert(prompt_length + sequence_length <= max_sequence_length && + "Prompt + sequence length exceeds max sequence length"); + prompts.push_back(std::make_pair(prompt_length, sequence_length)); + index++; + } + printf("Total number of prompts: %d", prompts.size()); + for (auto &prompt : prompts) { + // printf("Prompt length: %d, sequence length: %d\n", prompt_length, + // sequence_length); + Request inference_req; + inference_req.benchmarking_tokens = prompt.first; + inference_req.max_sequence_length = prompt.second + prompt.first; + inference_req.peft_model_id = + (peft_model_id != nullptr) ? 
*peft_model_id : PEFTModelID::NO_ID; + requests.push_back(inference_req); + } + + std::vector result = model.generate(requests); + } + + // terminate the request manager by stopping the background thread + rm->terminate_background_server(); + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + if (peft_model_id != nullptr) { + free(peft_model_id); + } + + std::cout << "----------inference finished--------------" << std::endl; + + // free tokenizer space in memory +} + +void FlexFlow::register_custom_tasks() {} diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 96b481edf0..eee13c4cc6 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -2670,7 +2670,9 @@ std::vector for (int i = 0; i < inf_guids.size(); i++) { results.push_back(rm->get_generation_result(inf_guids[i])); } - rm->set_inference_finished(); + if (inf_guids.size() > 0) { + rm->set_inference_finished(); + } for (int i = 0; i < peft_guids.size(); i++) { results.push_back(rm->get_generation_result(peft_guids[i])); } From e35ebb2ced300bd22b43220518a05db0d1eb78ca Mon Sep 17 00:00:00 2001 From: Remi <54138269+Flechman@users.noreply.github.com> Date: Wed, 17 Apr 2024 04:56:35 -0400 Subject: [PATCH 173/198] Add scripts for evaluation with Microsoft Azure trace (#1363) * Add scripts for evaluation * Add absolute request rate value * Fix script for target arrival rate * Fix cpp req rate benchmark * update to use new dataset * Fix infinite loop * update * add data --------- Co-authored-by: Remi Delacourt Co-authored-by: Gabriele Oliaro --- include/flexflow/request_manager.h | 1 + inference/peft/CMakeLists.txt | 34 ++ inference/peft/req_rate_benchmark.cc | 530 +++++++++++++++++++++++++++ rdelacou/generate_trace.py | 121 ++++++ src/runtime/request_manager.cc | 19 +- 5 files changed, 702 insertions(+), 3 deletions(-) create mode 100644 inference/peft/req_rate_benchmark.cc create mode 100644 rdelacou/generate_trace.py diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index e8e2e7eefc..ba8a5833ee 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -83,6 +83,7 @@ struct Request { int dataset_entry_processed_tokens = 0; int max_training_steps = 1; int benchmarking_tokens = -1; + std::vectorfinetuning_tokens_per_batch; bool warmup = false; std::string dataset_filepath; std::vector, diff --git a/inference/peft/CMakeLists.txt b/inference/peft/CMakeLists.txt index 9595f691f6..e0bad79cab 100644 --- a/inference/peft/CMakeLists.txt +++ b/inference/peft/CMakeLists.txt @@ -103,3 +103,37 @@ target_include_directories(${project_target3} PRIVATE ${CMAKE_SOURCE_DIR}/infere target_link_libraries(${project_target3} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) set(BIN_DEST "bin") install(TARGETS ${project_target3} DESTINATION ${BIN_DEST}) + +# Online peft +set(project_target4 req_rate_benchmark) +set(CPU_SRC4 + ${FLEXFLOW_CPP_DRV_SRC} + req_rate_benchmark.cc + ../models/llama.cc + ../models/opt.cc + ../models/falcon.cc + ../models/starcoder.cc + ../models/mpt.cc) + +if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + cuda_add_executable(${project_target4} ${CPU_SRC4}) + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + target_compile_definitions(${project_target4} PRIVATE __HIP_PLATFORM_NVIDIA__) + endif() +elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC4} PROPERTIES LANGUAGE HIP) 
+ hip_add_executable(${project_target4} ${CPU_SRC4}) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is empty!") + endif() + set_property(TARGET ${project_target4} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + target_compile_definitions(${project_target4} PRIVATE __HIP_PLATFORM_AMD__) +else() + message(FATAL_ERROR "Compilation of ${project_target4} for ${FF_GPU_BACKEND} backend not yet supported") +endif() + +target_include_directories(${project_target4} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target4} PRIVATE ${CMAKE_SOURCE_DIR}/inference) +target_link_libraries(${project_target4} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) +set(BIN_DEST "bin") +install(TARGETS ${project_target4} DESTINATION ${BIN_DEST}) diff --git a/inference/peft/req_rate_benchmark.cc b/inference/peft/req_rate_benchmark.cc new file mode 100644 index 0000000000..bc40de68f8 --- /dev/null +++ b/inference/peft/req_rate_benchmark.cc @@ -0,0 +1,530 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/inference.h" +#include "flexflow/request_manager.h" +#include "inference/models/falcon.h" +#include "inference/models/llama.h" +#include "inference/models/mpt.h" +#include "inference/models/opt.h" +#include "inference/models/starcoder.h" +#include +#include +#include +#include + +#include + + +using namespace FlexFlow; +using namespace Legion; +using json = nlohmann::json; + +LegionRuntime::Logger::Category log_app("llama"); + +class ConcurrentQueue { +public: + std::queue inf_queue; + std::queue peft_queue; + std::mutex request_queue_mutex; + bool producer_finished = false; +}; + +ConcurrentQueue *common_guids_singleton = nullptr; +int nb_millisecs = 1000; // Default bucket timeframe is 1 second + +ConcurrentQueue *get_common_guids_queue() { + if (common_guids_singleton == nullptr) { + common_guids_singleton = new ConcurrentQueue(); + } + return common_guids_singleton; +} + +void consume() { + RequestManager *rm = RequestManager::get_request_manager(); + ConcurrentQueue *guids = get_common_guids_queue(); + bool producer_is_finished = false; + bool queue_is_empty = false; + // int i=0; + while(!producer_is_finished || !queue_is_empty) { + RequestManager::RequestGuid guid = RequestManager::INVALID_GUID; + { + const std::lock_guard lock(guids->request_queue_mutex); + queue_is_empty = guids->inf_queue.empty(); + producer_is_finished = guids->producer_finished; + if (!queue_is_empty) { + guid = guids->inf_queue.front(); + guids->inf_queue.pop(); + } + } + if (guid != RequestManager::INVALID_GUID) { + GenerationResult result = rm->get_generation_result(guid); + } else { + std::this_thread::sleep_for(std::chrono::milliseconds(nb_millisecs)); + } + // i++; + // cout << "Iteration " << i; + } + rm->set_inference_finished(); + + while (guids->peft_queue.size() > 0) { + GenerationResult result = 
rm->get_generation_result(guids->peft_queue.front()); + guids->peft_queue.pop(); + } +} + +struct FilePaths { + std::string cache_folder_path; + std::string prompt_file_path; + std::string output_file_path; +}; + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + std::string &llm_model_name, + std::string &peft_model_name, + bool &use_full_precision, + bool &verbose, + bool &do_sample, + bool &enable_peft, + float &temperature, + float &topp, + int &max_requests_per_batch, + int &max_tokens_per_batch, + int &max_sequence_length, + int &max_buckets_to_run, + bool &enable_peft_finetuning, + bool &disable_peft_bwd, + int &bucket_timeframe) { + for (int i = 1; i < argc; i++) { + // llm model type + if (!strcmp(argv[i], "-llm-model")) { + llm_model_name = std::string(argv[++i]); + for (char &c : llm_model_name) { + c = std::tolower(c); + } + continue; + } + if (!strcmp(argv[i], "-enable-peft")) { + enable_peft = true; + continue; + } + if (!strcmp(argv[i], "-peft-model")) { + peft_model_name = std::string(argv[++i]); + for (char &c : peft_model_name) { + c = std::tolower(c); + } + continue; + } + // cache folder + if (!strcmp(argv[i], "-cache-folder")) { + paths.cache_folder_path = std::string(argv[++i]); + continue; + } + // prompts + if (!strcmp(argv[i], "-prompt")) { + paths.prompt_file_path = std::string(argv[++i]); + continue; + } + // output file + if (!strcmp(argv[i], "-output-file")) { + paths.output_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--use-full-precision")) { + use_full_precision = true; + continue; + } + // verbose logging to stdout + if (!strcmp(argv[i], "--verbose")) { + verbose = true; + continue; + } + if (!strcmp(argv[i], "--do-sample")) { + do_sample = true; + continue; + } + if (!strcmp(argv[i], "--temperature")) { + temperature = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--topp")) { + topp = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-requests-per-batch")) { + max_requests_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-batch")) { + max_tokens_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-sequence-length")) { + max_sequence_length = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-buckets-to-run")) { + max_buckets_to_run = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "-enable-peft-finetuning")) { + enable_peft_finetuning = true; + continue; + } + if (!strcmp(argv[i], "-disable-peft-bwd")) { + disable_peft_bwd = true; + continue; + } + if (!strcmp(argv[i], "--bucket-timeframe")) { + bucket_timeframe = std::stoi(argv[++i]); + continue; + } + } + if (paths.cache_folder_path.empty()) { + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + paths.cache_folder_path = ff_cache_path ? 
std::string(ff_cache_path) + : std::string("~/.cache/flexflow"); + } + // Expand ~ to the home directory if needed + wordexp_t p; + wordexp(paths.cache_folder_path.c_str(), &p, 0); + paths.cache_folder_path = p.we_wordv[0]; + wordfree(&p); +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + if (ffconfig.cpu_offload == false && ffconfig.quantization_type != DT_NONE) { + assert(false && "Doesn't support quantization in non-offload mode"); + } + FilePaths file_paths; + std::string llm_model_name, peft_model_name; + bool use_full_precision = false; + bool verbose = false; + bool do_sample = false; + bool enable_peft = false; + float temperature = 0.0f; + float topp = 0.0f; + int max_requests_per_batch = 8; + int max_tokens_per_batch = 128; + int max_sequence_length = 256; + int max_buckets_to_run = 1000000000; + bool enable_peft_finetuning = false; + bool disable_peft_bwd = false; + int bucket_timespan = 1; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, + argc, + file_paths, + llm_model_name, + peft_model_name, + use_full_precision, + verbose, + do_sample, + enable_peft, + temperature, + topp, + max_requests_per_batch, + max_tokens_per_batch, + max_sequence_length, + max_buckets_to_run, + enable_peft_finetuning, + disable_peft_bwd, + bucket_timespan); + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * + ffconfig.pipeline_parallelism_degree == + ffconfig.numNodes * ffconfig.workersPerNode); + + std::string config_filepath = join_path( + {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"}); + std::string tokenizer_filepath = + join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name}); + std::string weights_filepath = + join_path({file_paths.cache_folder_path, + "weights", + llm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + std::ifstream config_file_handle(config_filepath); + if (!config_file_handle.good()) { + std::cout << "Model config file " << config_filepath << " not found." + << std::endl; + assert(false); + } + if (enable_peft && peft_model_name.empty()) { + std::cout << "PEFT enabled, but no PEFT model id passed" << std::endl; + assert(false); + } else if (!enable_peft && !peft_model_name.empty()) { + std::cout << "PEFT model id passed, but PEFT is not enabled" << std::endl; + assert(false); + } + + json model_config = json::parse(config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + ModelType model_type = ModelType::UNKNOWN; + auto architectures = model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") { + model_type = ModelType::FALCON; + break; + } else if (str == "GPTBigCodeForCausalLM") { + model_type = ModelType::STARCODER; + break; + } else if (str == "MPTForCausalLM") { + model_type = ModelType::MPT; + break; + } + } + int bos_token_id = model_config.find("bos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("bos_token_id"); + int eos_token_id = model_config.find("eos_token_id") == model_config.end() + ? 
-1 + : (int)model_config.at("eos_token_id"); + + assert(model_type != ModelType::UNKNOWN && + "Invalid LLM model type passed (or no type was passed)."); + + // load PEFT config + LoraLinearConfig peft_config = + peft_model_name.empty() + ? LoraLinearConfig::EmptyConfig + : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); + + GenerationConfig generationConfig(do_sample, temperature, topp); + RequestManager *rm = RequestManager::get_request_manager(); + rm->set_max_requests_per_batch( + max_requests_per_batch + + (int)enable_peft_finetuning); // add one slot for finetuning if needed + rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_sequence_length(max_sequence_length); + rm->register_tokenizer( + model_type, bos_token_id, eos_token_id, tokenizer_filepath); + rm->register_output_filepath(file_paths.output_file_path); + rm->set_enable_peft_finetuning(enable_peft_finetuning); + rm->set_disable_peft_bwd(disable_peft_bwd); + + FFModel model(ffconfig, ffconfig.cpu_offload); + if (model_type == ModelType::LLAMA) { + LLAMA::create_llama_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::OPT) { + OPT::create_opt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::FALCON) { + FALCON::create_falcon_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::STARCODER) { + STARCODER::create_starcoder_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::MPT) { + MPT::create_mpt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else { + assert(false && "unknow model type"); + } + + // Add PEFT layer + PEFTModelID *peft_model_id = nullptr; + if (!peft_model_name.empty()) { + peft_model_id = model.add_lora_layer(peft_config); + } + + rm->start_background_server(&model); + + // Warmup stage + { + std::vector requests; + for (int i = 0; i < 100; i++) { + Request inference_req; + inference_req.benchmarking_tokens = 128; + inference_req.max_sequence_length = 256; + inference_req.warmup = true; + inference_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + requests.push_back(inference_req); + } + + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.benchmarking_tokens = 1024; + fine_tuning_req.max_sequence_length = 1024; + fine_tuning_req.warmup = true; + fine_tuning_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + fine_tuning_req.max_training_steps = 1; + requests.push_back(fine_tuning_req); + std::vector result = model.generate(requests); + } + + rm->set_inference_finished(false); // reset inference finished flag + std::cout << "----------warmup finished--------------" << std::endl; + + // Now run online workload! 
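+  // Replay overview: this thread is the producer. It sleeps until each
+  // bucket's arrival time (bucket index scaled by nb_millisecs), registers the
+  // bucket's inference requests with the RequestManager, and pushes the
+  // returned guids onto the shared ConcurrentQueue. The consumer thread
+  // (consume(), started below) pops guids and blocks on
+  // get_generation_result(); once producer_finished is set it drains the
+  // queue, calls set_inference_finished(), and then waits on the fine-tuning
+  // guids left in peft_queue.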
+ + nb_millisecs = nb_millisecs * bucket_timespan; + int total_num_requests = 0; + int num_arrival_buckets = 0; + ConcurrentQueue *guids = get_common_guids_queue(); + std::thread consumer{consume}; + { + + // Load all requests in advance + using json = nlohmann::json; + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + + const auto& lists = prompt_json.get>>(); + std::vector bucket_arrival_times_s; + std::vector>> buckets; + + size_t index=0; + for (const auto& list : lists) { + if (!list.empty()) { + bucket_arrival_times_s.push_back(index); + std::vector> prompts; + for (const auto& dict : list) { + int prompt_length = dict["human"]; + int sequence_length = dict["gpt"]; + assert(prompt_length + sequence_length <= max_sequence_length && + "Prompt + sequence length exceeds max sequence length"); + prompts.push_back(std::make_pair(prompt_length, sequence_length)); + } + buckets.push_back(prompts); + } + index++; + } + assert(bucket_arrival_times_s.size() == buckets.size() && + "Bucket arrival times and buckets are not the same size"); + // for (int i=0; i<10; i++) { + // printf("bucket_arrival_times_s[%i]: %i\n", i, bucket_arrival_times_s[i]); + // printf("bucket[%i]: %i\n", i, buckets[i].size()); + // for (const auto& prompt : buckets[i]) { + // printf("\tprompt: %i, %i\n", prompt.first, prompt.second); + // } + // } + + // Add fine-tuning request + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.benchmarking_tokens = 1024; + fine_tuning_req.max_sequence_length = 1024; + fine_tuning_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + fine_tuning_req.max_training_steps = 1000000000; + RequestManager::RequestGuid ft_guid = rm->register_new_peft_request(fine_tuning_req); + if (ft_guid != RequestManager::INVALID_GUID) { + const std::lock_guard lock(guids->request_queue_mutex); + guids->peft_queue.push(ft_guid); + } + + // Replay the trace of inference requests + auto start_time = std::chrono::steady_clock::now(); + for (int i=0; i= max_buckets_to_run) { + break; + } + // sleep until bucket arrives + auto bucket_arrival_time = start_time + std::chrono::milliseconds(bucket_arrival_times_s[i] * nb_millisecs); + std::this_thread::sleep_until(bucket_arrival_time); + + // create inference requests for the bucket + std::vector requests; + for (const auto& prompt : buckets[i]) { + // printf("Prompt length: %d, sequence length: %d\n", prompt_length, + // sequence_length); + Request inference_req; + inference_req.benchmarking_tokens = prompt.first; + inference_req.max_sequence_length = prompt.second + prompt.first; + inference_req.peft_model_id = + (peft_model_id != nullptr) ? 
*peft_model_id : PEFTModelID::NO_ID; + requests.push_back(inference_req); + } + + { + const std::lock_guard lock(guids->request_queue_mutex); + for (int i = 0; i < requests.size(); i++) { + RequestManager::RequestGuid guid = rm->register_new_request(requests.at(i)); + if (guid != RequestManager::INVALID_GUID) { + guids->inf_queue.push(guid); + } + } + } + } + + { // Notify the consumer that no more requests are incoming + const std::lock_guard lock(guids->request_queue_mutex); + guids->producer_finished = true; + } + } + + // Wait for consumer to finish + consumer.join(); + + // terminate the request manager by stopping the background thread + rm->terminate_background_server(); + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + + + // float* data + std::cout << "----------inference finished--------------" << std::endl; + + // free tokenizer space in memory +} + +void FlexFlow::register_custom_tasks() {} diff --git a/rdelacou/generate_trace.py b/rdelacou/generate_trace.py new file mode 100644 index 0000000000..986dab37df --- /dev/null +++ b/rdelacou/generate_trace.py @@ -0,0 +1,121 @@ +import pandas as pd +from math import ceil +from random import shuffle, uniform +import json, pickle, requests, os, argparse + +class TraceBuilder(object): + + # trace_type: either "conv" or "code" + def __init__(self, import_times=True, import_prompts=True): + self.req_times = None + self.imported_req_times = False + self.prompt_data = None + self.imported_prompt_data = False + if import_times: + self.import_trace_timestamps() + if import_prompts: + self.import_prompt_data() + + def import_trace_timestamps(self, trace_type="conv"): + if not self.imported_req_times: + # Import Microsoft LLM 1 hour trace + df_trace = pd.read_csv("https://raw.githubusercontent.com/Azure/AzurePublicDataset/master/data/AzureLLMInferenceTrace_"+trace_type+".csv", parse_dates=["TIMESTAMP"]) + req_times = (pd.to_datetime(df_trace["TIMESTAMP"]).astype(int)//1000) # Timestamps are in microseconds + req_times = req_times - req_times.min() + self.req_times = req_times.tolist() + self.imported_req_times = True + + def import_prompt_data(self, shuffle_=True): + if not self.imported_prompt_data: + sharegpt_filename = "sharegpt_opt_text_completion_length.pkl" + sharegpt_filepath = f"./{sharegpt_filename}" + if os.path.exists(sharegpt_filepath): + os.remove("sharegpt_opt_text_completion_length.pkl") + sharegpt_url = f"https://github.com/sosp-ae-39/sosp-ae-astra/raw/main/datasets/{sharegpt_filename}" + response = requests.get(sharegpt_url) + with open(sharegpt_filename, "wb") as file: + file.write(response.content) + with open(sharegpt_filepath, 'rb') as f: + data2 = pickle.load(f) + os.remove("sharegpt_opt_text_completion_length.pkl") + + prompt_lengths = [pair[0] for pair in data2 if pair[0] <= 2048 and pair[0] >= 4 and pair[1] >= 4 and pair[1] <= 2048 and pair[0]+pair[1] <= 2048] + generation_lengths = [pair[1] for pair in data2 if pair[0] <= 2048 and pair[0] >= 4 and pair[1] >= 4 and pair[1] <= 2048 and pair[0]+pair[1] <= 2048] + + for pair in data2: + assert(len(pair) == 2) + + prompt_lengths = [pair[0] for pair in data2 if pair[0] <= 2048 and pair[0] >= 4 and pair[1] >= 4 and pair[1] <= 2048 and pair[0]+pair[1] <= 2048] + generation_lengths = [pair[1] for pair in data2 if pair[0] <= 2048 and pair[0] >= 4 and pair[1] >= 4 and pair[1] <= 2048 and pair[0]+pair[1] <= 2048] + num_pairs = len(prompt_lengths) + assert(num_pairs == len(generation_lengths)) + print("Number of 
conversation pairs: ", num_pairs) + + print(f"Prompt lengths: min={min(prompt_lengths)}, max={max(prompt_lengths)}, avg={sum(prompt_lengths)/len(prompt_lengths)}") + print(f"Generation lengths: min={min(generation_lengths)}, max={max(generation_lengths)}, avg={sum(generation_lengths)/len(generation_lengths)}") + total_lengths = [prompt_lengths[i] + generation_lengths[i] for i in range(len(prompt_lengths))] + print(f"Total lengths: min={min(total_lengths)}, max={max(total_lengths)}, avg={sum(total_lengths)/len(total_lengths)}") + + self.prompt_data = [{"human": prompt_lengths[i], "gpt": generation_lengths[i]} for i in range(num_pairs)] + + if shuffle_: + shuffle(self.prompt_data) + self.imported_prompt_data = True + + # Delta is in seconds + # Rate is in req per second + def generate_trace(self, target_arrival_rate=10, debug_verbose=False): + self.import_trace_timestamps() + self.import_prompt_data() + + microsec = 1000000 + avg_arrival_rate = len(self.req_times) / (self.req_times[-1]/float(microsec)) # Request per second. Computed that way to enforce working with numbers of reasonable orders of magnitude + if debug_verbose: + print("Avg arrival rate of original trace (req/s): ", avg_arrival_rate) + scale_factor = float(target_arrival_rate) / avg_arrival_rate + if debug_verbose: + print("Scale factor to obtain target arrival rate: ", scale_factor) + + # Buckets are 1 second timeframes + nb_buckets = ceil(self.req_times[-1] / microsec) + buckets = [] + j = 0 + k = 0 + for i in range(nb_buckets): + bucket_size = 0 + while(j < len(self.req_times) and self.req_times[j] >= i*microsec and self.req_times[j] < (i+1)*microsec): + bucket_size += 1 + j += 1 + bucket_size = bucket_size*scale_factor + prob = bucket_size - int(bucket_size) + bucket_size = int(bucket_size) + int(uniform(0, 1) <= prob) + + # If used all of the prompt data, loop back at the beggining and reuse some prompts + if k+bucket_size > len(self.prompt_data): + bucket = self.prompt_data[k:] + self.prompt_data[:(k+bucket_size)%len(self.prompt_data)] + else: + bucket = self.prompt_data[k:k+bucket_size] + k = (k+bucket_size) % len(self.prompt_data) + buckets.append(bucket) + + if debug_verbose: + print("Avg arrival rate obtained (req/s): ", sum([len(b) for b in buckets])/len(buckets)) + return buckets + +def generate_and_save_trace(arrival_rate, output_file): + builder = TraceBuilder() + trace = builder.generate_trace(target_arrival_rate=arrival_rate, debug_verbose=True) + with open(output_file, 'w+') as f: + json.dump(trace, f, indent=2) + +if __name__ == '__main__': + # Set up the argument parser + parser = argparse.ArgumentParser(description='Generate and save a trace.') + parser.add_argument('--arrival-rate', type=float, default=10.0, help='The target arrival rate for the trace.') + parser.add_argument('--output-file', type=str, default='sharegpt.json', help='The path to the output file to save the trace.') + + # Parse the command-line arguments + args = parser.parse_args() + + # Call the function with the user-provided arrival rate + generate_and_save_trace(args.arrival_rate, args.output_file) diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index eee13c4cc6..b1ca4d985a 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -771,6 +771,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, old_bc.requestsInfo[inference_batch_size].num_tokens_in_batch; request.processed_finetuning_tokens += old_bc.requestsInfo[inference_batch_size].num_tokens_in_batch; 
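+      // record how many finetuning tokens this batch contributed; the
+      // per-batch counts are reported as tokens_per_batch(...) in the
+      // profiling output below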
+ request.finetuning_tokens_per_batch.push_back( + old_bc.requestsInfo[inference_batch_size].num_tokens_in_batch); int dataset_entry = request.completed_training_steps % request.dataset.size(); if (old_bc.requestsInfo[inference_batch_size].first_token_depth_in_request + @@ -798,8 +800,9 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, total_request_run_time += profile_info.finish_time - profile_info.start_time; profiling_requests[request.guid] = profile_info; - log_req_mgr.print("[Finetuning] guid(%zu) completed_training_steps(%d) " + log_req_mgr.print("[%s] guid(%zu) completed_training_steps(%d) " "processed_finetuning_tokens(%lu) latency(%.1lf)", + request.warmup ? "Warmup" : "Finetuning", request.guid, request.completed_training_steps, request.processed_finetuning_tokens, @@ -807,14 +810,24 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, if (!output_filepath.empty()) { std::ofstream outputFile(output_filepath, std::ios::app); if (outputFile.is_open()) { - outputFile << "[Finetuning] guid(" << request.guid + std::string tokens_str = "["; + for (size_t i = 0; i < request.finetuning_tokens_per_batch.size(); i++) { + tokens_str += std::to_string(request.finetuning_tokens_per_batch[i]); + if (i != request.finetuning_tokens_per_batch.size() - 1) { + tokens_str += ", "; + } + } + tokens_str += "]"; + outputFile << "[" << (request.warmup ? "Warmup" : "Finetuning") + << "] guid(" << request.guid << ") completed_training_steps(" << request.completed_training_steps << ") processed_finetuning_tokens(" << request.processed_finetuning_tokens << ") latency(" << std::fixed << std::setprecision(3) << (profile_info.finish_time - profile_info.start_time) - << ")\n"; + << ") tokens_per_batch(" + << tokens_str << ")\n"; outputFile.close(); } else { std::cout << "Unable to open the output file: " << output_filepath From b33f10f4015db431b093adfbdae0dd35872242d3 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 25 Apr 2024 04:04:59 +0000 Subject: [PATCH 174/198] fix --- include/flexflow/request_manager.h | 2 +- inference/peft/peft.cc | 115 +++++++++----------------- inference/peft/peft_bwd_benchmark.cc | 34 ++++---- inference/peft/req_rate_benchmark.cc | 46 ++++++----- python/flexflow/core/flexflow_cffi.py | 17 ++-- src/runtime/request_manager.cc | 11 +-- tests/peft_test.sh | 13 ++- 7 files changed, 109 insertions(+), 129 deletions(-) diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index ba8a5833ee..729f1b480c 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -83,7 +83,7 @@ struct Request { int dataset_entry_processed_tokens = 0; int max_training_steps = 1; int benchmarking_tokens = -1; - std::vectorfinetuning_tokens_per_batch; + std::vector finetuning_tokens_per_batch; bool warmup = false; std::string dataset_filepath; std::vector, diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index ab2f9496bf..e3503d98ee 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -33,6 +33,7 @@ LegionRuntime::Logger::Category log_app("llama"); struct FilePaths { std::string cache_folder_path; std::string prompt_file_path; + std::string dataset_file_path; std::string output_file_path; }; @@ -50,7 +51,6 @@ void parse_input_args(char **argv, int &max_requests_per_batch, int &max_tokens_per_batch, int &max_sequence_length, - int &max_requests_to_run, bool &enable_peft_finetuning, bool &disable_peft_bwd) { for (int i = 1; i < argc; i++) { @@ -83,6 +83,11 @@ void 
parse_input_args(char **argv, paths.prompt_file_path = std::string(argv[++i]); continue; } + // dataset for finetuning + if (!strcmp(argv[i], "-finetuning-dataset")) { + paths.dataset_file_path = std::string(argv[++i]); + continue; + } // output file if (!strcmp(argv[i], "-output-file")) { paths.output_file_path = std::string(argv[++i]); @@ -121,10 +126,6 @@ void parse_input_args(char **argv, max_sequence_length = std::stoi(argv[++i]); continue; } - if (!strcmp(argv[i], "--max-requests-to-run")) { - max_requests_to_run = std::stoi(argv[++i]); - continue; - } if (!strcmp(argv[i], "-enable-peft-finetuning")) { enable_peft_finetuning = true; continue; @@ -160,11 +161,10 @@ void FlexFlow::top_level_task(Task const *task, bool enable_peft = false; float temperature = 0.0f; float topp = 0.0f; - int max_requests_per_batch = 8; + int max_requests_per_batch = 1; int max_tokens_per_batch = 128; int max_sequence_length = 256; - int max_requests_to_run = 1000000000; - bool enable_peft_finetuning = false; + bool enable_peft_finetuning = true; bool disable_peft_bwd = false; InputArgs const &command_args = HighLevelRuntime::get_input_args(); @@ -184,7 +184,6 @@ void FlexFlow::top_level_task(Task const *task, max_requests_per_batch, max_tokens_per_batch, max_sequence_length, - max_requests_to_run, enable_peft_finetuning, disable_peft_bwd); assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * @@ -314,81 +313,47 @@ void FlexFlow::top_level_task(Task const *task, // Start background server rm->start_background_server(&model); - // Warmup stage - { - std::vector requests; - for (int i = 0; i < 100; i++) { - Request inference_req; - inference_req.benchmarking_tokens = 128; - inference_req.max_sequence_length = 256; - inference_req.warmup = true; - inference_req.peft_model_id = - (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; - requests.push_back(inference_req); - } - Request fine_tuning_req; - fine_tuning_req.req_type = RequestType::REQ_FINETUNING; - fine_tuning_req.benchmarking_tokens = 1024; - fine_tuning_req.max_sequence_length = 1024; - fine_tuning_req.warmup = true; - fine_tuning_req.peft_model_id = - (peft_model_id != nullptr) ? 
*peft_model_id : PEFTModelID::NO_ID; - fine_tuning_req.max_training_steps = 1; - requests.push_back(fine_tuning_req); - std::vector result = model.generate(requests); - } - - rm->set_inference_finished(false); // reset inference finished flag - std::cout << "----------warmup finished--------------" << std::endl; - // Run workload { std::vector requests; // Add inference requests - using json = nlohmann::json; - std::ifstream file_handle(file_paths.prompt_file_path); - assert(file_handle.good() && "Prompt file does not exist."); - json prompt_json = json::parse(file_handle, - /*parser_callback_t */ nullptr, - /*allow_exceptions */ true, - /*ignore_comments */ true); - std::vector> prompts; - int index = 0; - for (auto &entry : prompt_json) { - if (index >= max_requests_to_run) { - break; + if (!file_paths.prompt_file_path.empty()) { + using json = nlohmann::json; + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + int total_num_requests = 0; + for (auto &prompt : prompt_json) { + std::string text = prompt.get(); + printf("Inference prompt[%d]: %s\n", total_num_requests, text.c_str()); + Request inference_req; + inference_req.prompt = text; + inference_req.max_sequence_length = 128; + inference_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + requests.push_back(inference_req); + total_num_requests++; } - int prompt_length = entry["human"]; - int sequence_length = entry["gpt"]; - assert(prompt_length + sequence_length <= max_sequence_length && - "Prompt + sequence length exceeds max sequence length"); - prompts.push_back(std::make_pair(prompt_length, sequence_length)); - index++; - } - printf("Total number of prompts: %d", prompts.size()); - for (auto &prompt : prompts) { - // printf("Prompt length: %d, sequence length: %d\n", prompt_length, - // sequence_length); - Request inference_req; - inference_req.benchmarking_tokens = prompt.first; - inference_req.max_sequence_length = prompt.second + prompt.first; - inference_req.peft_model_id = - (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; - requests.push_back(inference_req); } // Add fine-tuning request - Request fine_tuning_req; - fine_tuning_req.req_type = RequestType::REQ_FINETUNING; - fine_tuning_req.benchmarking_tokens = 1024; - fine_tuning_req.max_sequence_length = 1024; - fine_tuning_req.peft_model_id = - (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; - // fine_tuning_req.dataset_filepath = file_paths.prompt_file_path; - fine_tuning_req.max_training_steps = 1000000000; - requests.push_back(fine_tuning_req); - + if (enable_peft_finetuning) { + assert(!file_paths.dataset_file_path.empty() && + "Dataset file path is required for fine-tuning."); + printf("Finetuning request with dataset %s\n", + file_paths.dataset_file_path.c_str()); + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.peft_model_id = + (peft_model_id != nullptr) ? 
*peft_model_id : PEFTModelID::NO_ID; + fine_tuning_req.dataset_filepath = file_paths.dataset_file_path; + fine_tuning_req.max_training_steps = 1; + requests.push_back(fine_tuning_req); + } std::vector result = model.generate(requests); } diff --git a/inference/peft/peft_bwd_benchmark.cc b/inference/peft/peft_bwd_benchmark.cc index a5f451350e..72ebe87227 100644 --- a/inference/peft/peft_bwd_benchmark.cc +++ b/inference/peft/peft_bwd_benchmark.cc @@ -356,28 +356,28 @@ void FlexFlow::top_level_task(Task const *task, std::vector lengths; int index = 0; for (auto &entry : prompt_json) { - if (index == max_requests_to_run) { - break; - } - int prompt_length = entry.get(); - assert(prompt_length > 0 && "Prompt length must be greater than 0."); - assert(prompt_length <= 1024 && - "Prompt length must be less than or equal to 1024."); - lengths.push_back(prompt_length); - index++; + if (index == max_requests_to_run) { + break; + } + int prompt_length = entry.get(); + assert(prompt_length > 0 && "Prompt length must be greater than 0."); + assert(prompt_length <= 1024 && + "Prompt length must be less than or equal to 1024."); + lengths.push_back(prompt_length); + index++; } printf("Total number of finetuning requests: %d", lengths.size()); // Add fine-tuning requests for (int i = 0; i < lengths.size(); i++) { - Request fine_tuning_req; - fine_tuning_req.req_type = RequestType::REQ_FINETUNING; - fine_tuning_req.benchmarking_tokens = lengths[i]; - fine_tuning_req.max_sequence_length = lengths[i]; - fine_tuning_req.peft_model_id = - (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; - fine_tuning_req.max_training_steps = 1; - requests.push_back(fine_tuning_req); + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.benchmarking_tokens = lengths[i]; + fine_tuning_req.max_sequence_length = lengths[i]; + fine_tuning_req.peft_model_id = + (peft_model_id != nullptr) ? 
*peft_model_id : PEFTModelID::NO_ID; + fine_tuning_req.max_training_steps = 1; + requests.push_back(fine_tuning_req); } std::vector result = model.generate(requests); } diff --git a/inference/peft/req_rate_benchmark.cc b/inference/peft/req_rate_benchmark.cc index bc40de68f8..08b087faed 100644 --- a/inference/peft/req_rate_benchmark.cc +++ b/inference/peft/req_rate_benchmark.cc @@ -21,13 +21,12 @@ #include "inference/models/opt.h" #include "inference/models/starcoder.h" #include -#include #include +#include #include #include - using namespace FlexFlow; using namespace Legion; using json = nlohmann::json; @@ -58,7 +57,7 @@ void consume() { bool producer_is_finished = false; bool queue_is_empty = false; // int i=0; - while(!producer_is_finished || !queue_is_empty) { + while (!producer_is_finished || !queue_is_empty) { RequestManager::RequestGuid guid = RequestManager::INVALID_GUID; { const std::lock_guard lock(guids->request_queue_mutex); @@ -78,9 +77,10 @@ void consume() { // cout << "Iteration " << i; } rm->set_inference_finished(); - + while (guids->peft_queue.size() > 0) { - GenerationResult result = rm->get_generation_result(guids->peft_queue.front()); + GenerationResult result = + rm->get_generation_result(guids->peft_queue.front()); guids->peft_queue.pop(); } } @@ -422,21 +422,21 @@ void FlexFlow::top_level_task(Task const *task, /*parser_callback_t */ nullptr, /*allow_exceptions */ true, /*ignore_comments */ true); - - const auto& lists = prompt_json.get>>(); + + auto const &lists = prompt_json.get>>(); std::vector bucket_arrival_times_s; std::vector>> buckets; - size_t index=0; - for (const auto& list : lists) { + size_t index = 0; + for (auto const &list : lists) { if (!list.empty()) { bucket_arrival_times_s.push_back(index); std::vector> prompts; - for (const auto& dict : list) { + for (auto const &dict : list) { int prompt_length = dict["human"]; int sequence_length = dict["gpt"]; assert(prompt_length + sequence_length <= max_sequence_length && - "Prompt + sequence length exceeds max sequence length"); + "Prompt + sequence length exceeds max sequence length"); prompts.push_back(std::make_pair(prompt_length, sequence_length)); } buckets.push_back(prompts); @@ -446,9 +446,9 @@ void FlexFlow::top_level_task(Task const *task, assert(bucket_arrival_times_s.size() == buckets.size() && "Bucket arrival times and buckets are not the same size"); // for (int i=0; i<10; i++) { - // printf("bucket_arrival_times_s[%i]: %i\n", i, bucket_arrival_times_s[i]); - // printf("bucket[%i]: %i\n", i, buckets[i].size()); - // for (const auto& prompt : buckets[i]) { + // printf("bucket_arrival_times_s[%i]: %i\n", i, + // bucket_arrival_times_s[i]); printf("bucket[%i]: %i\n", i, + // buckets[i].size()); for (const auto& prompt : buckets[i]) { // printf("\tprompt: %i, %i\n", prompt.first, prompt.second); // } // } @@ -461,7 +461,8 @@ void FlexFlow::top_level_task(Task const *task, fine_tuning_req.peft_model_id = (peft_model_id != nullptr) ? 
*peft_model_id : PEFTModelID::NO_ID; fine_tuning_req.max_training_steps = 1000000000; - RequestManager::RequestGuid ft_guid = rm->register_new_peft_request(fine_tuning_req); + RequestManager::RequestGuid ft_guid = + rm->register_new_peft_request(fine_tuning_req); if (ft_guid != RequestManager::INVALID_GUID) { const std::lock_guard lock(guids->request_queue_mutex); guids->peft_queue.push(ft_guid); @@ -469,17 +470,19 @@ void FlexFlow::top_level_task(Task const *task, // Replay the trace of inference requests auto start_time = std::chrono::steady_clock::now(); - for (int i=0; i= max_buckets_to_run) { break; } // sleep until bucket arrives - auto bucket_arrival_time = start_time + std::chrono::milliseconds(bucket_arrival_times_s[i] * nb_millisecs); + auto bucket_arrival_time = + start_time + + std::chrono::milliseconds(bucket_arrival_times_s[i] * nb_millisecs); std::this_thread::sleep_until(bucket_arrival_time); // create inference requests for the bucket std::vector requests; - for (const auto& prompt : buckets[i]) { + for (auto const &prompt : buckets[i]) { // printf("Prompt length: %d, sequence length: %d\n", prompt_length, // sequence_length); Request inference_req; @@ -493,14 +496,15 @@ void FlexFlow::top_level_task(Task const *task, { const std::lock_guard lock(guids->request_queue_mutex); for (int i = 0; i < requests.size(); i++) { - RequestManager::RequestGuid guid = rm->register_new_request(requests.at(i)); + RequestManager::RequestGuid guid = + rm->register_new_request(requests.at(i)); if (guid != RequestManager::INVALID_GUID) { guids->inf_queue.push(guid); } } } } - + { // Notify the consumer that no more requests are incoming const std::lock_guard lock(guids->request_queue_mutex); guids->producer_finished = true; @@ -519,8 +523,6 @@ void FlexFlow::top_level_task(Task const *task, future.get_void_result(); } - - // float* data std::cout << "----------inference finished--------------" << std::endl; diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 82c3eb059c..b08fdba072 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -1599,18 +1599,19 @@ def register_ssm_model(self, model): def set_max_requests_per_batch(self, max_requests): return ffc().flexflow_request_manager_set_max_requests_per_batch( - self.handle, max_requests - ) - + self.handle, max_requests) + def set_max_tokens_per_batch(self, max_tokens): return ffc().flexflow_request_manager_set_max_tokens_per_batch( - self.handle, max_tokens - ) - + self.handle, max_tokens) + + def set_max_spec_tree_token_num(self, max_tokens): + return ffc().flexflow_request_manager_set_max_spec_tree_token_num( + self.handle, max_tokens) + def set_max_sequence_length(self, max_length): return ffc().flexflow_request_manager_set_max_sequence_length( - self.handle, max_length - ) + self.handle, max_length) def start_server(self, model): return ffc().flexflow_request_manager_start_background_server( diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index b1ca4d985a..6a4d9658e0 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -811,14 +811,16 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, std::ofstream outputFile(output_filepath, std::ios::app); if (outputFile.is_open()) { std::string tokens_str = "["; - for (size_t i = 0; i < request.finetuning_tokens_per_batch.size(); i++) { - tokens_str += std::to_string(request.finetuning_tokens_per_batch[i]); + for (size_t i = 0; i < 
request.finetuning_tokens_per_batch.size(); + i++) { + tokens_str += + std::to_string(request.finetuning_tokens_per_batch[i]); if (i != request.finetuning_tokens_per_batch.size() - 1) { tokens_str += ", "; } } tokens_str += "]"; - outputFile << "[" << (request.warmup ? "Warmup" : "Finetuning") + outputFile << "[" << (request.warmup ? "Warmup" : "Finetuning") << "] guid(" << request.guid << ") completed_training_steps(" << request.completed_training_steps @@ -826,8 +828,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, << request.processed_finetuning_tokens << ") latency(" << std::fixed << std::setprecision(3) << (profile_info.finish_time - profile_info.start_time) - << ") tokens_per_batch(" - << tokens_str << ")\n"; + << ") tokens_per_batch(" << tokens_str << ")\n"; outputFile.close(); } else { std::cout << "Unable to open the output file: " << output_filepath diff --git a/tests/peft_test.sh b/tests/peft_test.sh index 9b4a5204ac..b32b69cd82 100755 --- a/tests/peft_test.sh +++ b/tests/peft_test.sh @@ -14,6 +14,8 @@ fi # Create test prompt file mkdir -p ../inference/prompt echo '["Two things are infinite: "]' > ../inference/prompt/peft.json +echo "[\"“Two things are infinite: the universe and human stupidity; and I'm not sure about the universe.“\"]" > ../inference/prompt/peft_dataset.json + # Create output folder mkdir -p ../inference/output @@ -26,7 +28,16 @@ python ../inference/utils/download_peft_model.py goliaro/llama-160m-lora-full -- # if first time, add: --refresh-cache # CPP test -../build/inference/peft/peft -ll:gpu 1 -ll:cpu 4 -ll:fsize 8192 -ll:zsize 12000 -ll:util 4 -llm-model JackFram/llama-160m -prompt ../inference/prompt/peft.json -peft-model goliaro/llama-160m-lora-full --use-full-precision --inference-debugging --fusion -enable-peft +../build/inference/peft/peft \ + -ll:gpu 1 -ll:cpu 4 -ll:util 4 \ + -ll:fsize 8192 -ll:zsize 12000 \ + -llm-model JackFram/llama-160m \ + -finetuning-dataset ../inference/prompt/peft_dataset.json \ + -peft-model goliaro/llama-160m-lora-full \ + --use-full-precision \ + --inference-debugging \ + --fusion \ + -enable-peft # Python test python ../inference/python/ff_peft.py From 97562d6258c87d1bab39b9363b8348a3468d55c4 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 1 May 2024 22:47:55 +0000 Subject: [PATCH 175/198] fix --- include/flexflow/flexflow_c.h | 3 +++ include/flexflow/request_manager.h | 2 -- inference/peft/peft.cc | 18 ++---------------- inference/peft/peft_bwd_benchmark.cc | 20 +++----------------- inference/peft/peft_fwd_benchmark.cc | 20 +++----------------- inference/peft/req_rate_benchmark.cc | 14 -------------- inference/python/ff_peft.py | 1 + python/flexflow/core/flexflow_cffi.py | 4 ++++ python/flexflow/serve/serve.py | 14 ++++++++------ src/c/flexflow_c.cc | 8 ++++++++ src/runtime/request_manager.cc | 6 +----- tests/peft_test.sh | 4 ++-- 12 files changed, 35 insertions(+), 79 deletions(-) diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index d6cdb910c4..b651b31052 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -989,6 +989,9 @@ void flexflow_request_manager_set_max_spec_tree_token_num( void flexflow_request_manager_set_max_sequence_length( flexflow_request_manager_t handle_, int max_seq_length); +void flexflow_request_manager_set_enable_peft_finetuning( + flexflow_request_manager_t handle_, bool enable_peft_finetuning_); + void flexflow_request_manager_register_tokenizer( flexflow_request_manager_t handle_, enum ModelType 
model_type, diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 729f1b480c..fe0e4b2f9d 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -137,7 +137,6 @@ class RequestManager { void push_spec_infer_tree_width(int tree_width); int get_max_sequence_length(); void set_enable_peft_finetuning(bool enable_peft_finetuning_); - void set_disable_peft_bwd(bool disable_peft_bwd_); static void set_inference_finished(bool finished = true); int register_ssm_model(FFModel *model); void register_tokenizer(ModelType model_type, @@ -287,7 +286,6 @@ class RequestManager { // peft benchmarking bool enable_peft_finetuning = false; - bool disable_peft_bwd = false; static bool inference_finished; // tree width in each speculative step, if not specified 1 diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index e3503d98ee..f800b7f17c 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -50,9 +50,7 @@ void parse_input_args(char **argv, float &topp, int &max_requests_per_batch, int &max_tokens_per_batch, - int &max_sequence_length, - bool &enable_peft_finetuning, - bool &disable_peft_bwd) { + int &max_sequence_length) { for (int i = 1; i < argc; i++) { // llm model type if (!strcmp(argv[i], "-llm-model")) { @@ -126,14 +124,6 @@ void parse_input_args(char **argv, max_sequence_length = std::stoi(argv[++i]); continue; } - if (!strcmp(argv[i], "-enable-peft-finetuning")) { - enable_peft_finetuning = true; - continue; - } - if (!strcmp(argv[i], "-disable-peft-bwd")) { - disable_peft_bwd = true; - continue; - } } if (paths.cache_folder_path.empty()) { paths.cache_folder_path = "~/.cache/flexflow"; @@ -165,7 +155,6 @@ void FlexFlow::top_level_task(Task const *task, int max_tokens_per_batch = 128; int max_sequence_length = 256; bool enable_peft_finetuning = true; - bool disable_peft_bwd = false; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -183,9 +172,7 @@ void FlexFlow::top_level_task(Task const *task, topp, max_requests_per_batch, max_tokens_per_batch, - max_sequence_length, - enable_peft_finetuning, - disable_peft_bwd); + max_sequence_length); assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * ffconfig.pipeline_parallelism_degree == ffconfig.numNodes * ffconfig.workersPerNode); @@ -264,7 +251,6 @@ void FlexFlow::top_level_task(Task const *task, model_type, bos_token_id, eos_token_id, tokenizer_filepath); rm->register_output_filepath(file_paths.output_file_path); rm->set_enable_peft_finetuning(enable_peft_finetuning); - rm->set_disable_peft_bwd(disable_peft_bwd); FFModel model(ffconfig, ffconfig.cpu_offload); if (model_type == ModelType::LLAMA) { diff --git a/inference/peft/peft_bwd_benchmark.cc b/inference/peft/peft_bwd_benchmark.cc index 72ebe87227..c0d7d33ae4 100644 --- a/inference/peft/peft_bwd_benchmark.cc +++ b/inference/peft/peft_bwd_benchmark.cc @@ -50,9 +50,7 @@ void parse_input_args(char **argv, int &max_requests_per_batch, int &max_tokens_per_batch, int &max_sequence_length, - int &max_requests_to_run, - bool &enable_peft_finetuning, - bool &disable_peft_bwd) { + int &max_requests_to_run) { for (int i = 1; i < argc; i++) { // llm model type if (!strcmp(argv[i], "-llm-model")) { @@ -125,14 +123,6 @@ void parse_input_args(char **argv, max_requests_to_run = std::stoi(argv[++i]); continue; } - if (!strcmp(argv[i], "-enable-peft-finetuning")) { - enable_peft_finetuning = true; - continue; - } - if (!strcmp(argv[i], 
"-disable-peft-bwd")) { - disable_peft_bwd = true; - continue; - } } if (paths.cache_folder_path.empty()) { paths.cache_folder_path = "~/.cache/flexflow"; @@ -165,7 +155,6 @@ void FlexFlow::top_level_task(Task const *task, int max_sequence_length = 256; int max_requests_to_run = 1000000000; bool enable_peft_finetuning = false; - bool disable_peft_bwd = false; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -184,9 +173,7 @@ void FlexFlow::top_level_task(Task const *task, max_requests_per_batch, max_tokens_per_batch, max_sequence_length, - max_requests_to_run, - enable_peft_finetuning, - disable_peft_bwd); + max_requests_to_run); assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * ffconfig.pipeline_parallelism_degree == ffconfig.numNodes * ffconfig.workersPerNode); @@ -265,7 +252,6 @@ void FlexFlow::top_level_task(Task const *task, model_type, bos_token_id, eos_token_id, tokenizer_filepath); rm->register_output_filepath(file_paths.output_file_path); rm->set_enable_peft_finetuning(enable_peft_finetuning); - rm->set_disable_peft_bwd(disable_peft_bwd); FFModel model(ffconfig, ffconfig.cpu_offload); if (model_type == ModelType::LLAMA) { @@ -366,7 +352,7 @@ void FlexFlow::top_level_task(Task const *task, lengths.push_back(prompt_length); index++; } - printf("Total number of finetuning requests: %d", lengths.size()); + printf("Total number of finetuning requests: %ld", lengths.size()); // Add fine-tuning requests for (int i = 0; i < lengths.size(); i++) { diff --git a/inference/peft/peft_fwd_benchmark.cc b/inference/peft/peft_fwd_benchmark.cc index 215b2f80f4..7be90e083a 100644 --- a/inference/peft/peft_fwd_benchmark.cc +++ b/inference/peft/peft_fwd_benchmark.cc @@ -50,9 +50,7 @@ void parse_input_args(char **argv, int &max_requests_per_batch, int &max_tokens_per_batch, int &max_sequence_length, - int &max_requests_to_run, - bool &enable_peft_finetuning, - bool &disable_peft_bwd) { + int &max_requests_to_run) { for (int i = 1; i < argc; i++) { // llm model type if (!strcmp(argv[i], "-llm-model")) { @@ -125,14 +123,6 @@ void parse_input_args(char **argv, max_requests_to_run = std::stoi(argv[++i]); continue; } - if (!strcmp(argv[i], "-enable-peft-finetuning")) { - enable_peft_finetuning = true; - continue; - } - if (!strcmp(argv[i], "-disable-peft-bwd")) { - disable_peft_bwd = true; - continue; - } } if (paths.cache_folder_path.empty()) { paths.cache_folder_path = "~/.cache/flexflow"; @@ -165,7 +155,6 @@ void FlexFlow::top_level_task(Task const *task, int max_sequence_length = 256; int max_requests_to_run = 1000000000; bool enable_peft_finetuning = false; - bool disable_peft_bwd = false; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -184,9 +173,7 @@ void FlexFlow::top_level_task(Task const *task, max_requests_per_batch, max_tokens_per_batch, max_sequence_length, - max_requests_to_run, - enable_peft_finetuning, - disable_peft_bwd); + max_requests_to_run); assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * ffconfig.pipeline_parallelism_degree == ffconfig.numNodes * ffconfig.workersPerNode); @@ -265,7 +252,6 @@ void FlexFlow::top_level_task(Task const *task, model_type, bos_token_id, eos_token_id, tokenizer_filepath); rm->register_output_filepath(file_paths.output_file_path); rm->set_enable_peft_finetuning(enable_peft_finetuning); - rm->set_disable_peft_bwd(disable_peft_bwd); FFModel model(ffconfig, ffconfig.cpu_offload); if (model_type 
== ModelType::LLAMA) { @@ -339,7 +325,7 @@ void FlexFlow::top_level_task(Task const *task, prompts.push_back(std::make_pair(prompt_length, sequence_length)); index++; } - printf("Total number of prompts: %d", prompts.size()); + printf("Total number of prompts: %ld", prompts.size()); for (auto &prompt : prompts) { // printf("Prompt length: %d, sequence length: %d\n", prompt_length, // sequence_length); diff --git a/inference/peft/req_rate_benchmark.cc b/inference/peft/req_rate_benchmark.cc index 08b087faed..3824b93840 100644 --- a/inference/peft/req_rate_benchmark.cc +++ b/inference/peft/req_rate_benchmark.cc @@ -106,8 +106,6 @@ void parse_input_args(char **argv, int &max_tokens_per_batch, int &max_sequence_length, int &max_buckets_to_run, - bool &enable_peft_finetuning, - bool &disable_peft_bwd, int &bucket_timeframe) { for (int i = 1; i < argc; i++) { // llm model type @@ -181,14 +179,6 @@ void parse_input_args(char **argv, max_buckets_to_run = std::stoi(argv[++i]); continue; } - if (!strcmp(argv[i], "-enable-peft-finetuning")) { - enable_peft_finetuning = true; - continue; - } - if (!strcmp(argv[i], "-disable-peft-bwd")) { - disable_peft_bwd = true; - continue; - } if (!strcmp(argv[i], "--bucket-timeframe")) { bucket_timeframe = std::stoi(argv[++i]); continue; @@ -227,7 +217,6 @@ void FlexFlow::top_level_task(Task const *task, int max_sequence_length = 256; int max_buckets_to_run = 1000000000; bool enable_peft_finetuning = false; - bool disable_peft_bwd = false; int bucket_timespan = 1; InputArgs const &command_args = HighLevelRuntime::get_input_args(); @@ -248,8 +237,6 @@ void FlexFlow::top_level_task(Task const *task, max_tokens_per_batch, max_sequence_length, max_buckets_to_run, - enable_peft_finetuning, - disable_peft_bwd, bucket_timespan); assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * ffconfig.pipeline_parallelism_degree == @@ -329,7 +316,6 @@ void FlexFlow::top_level_task(Task const *task, model_type, bos_token_id, eos_token_id, tokenizer_filepath); rm->register_output_filepath(file_paths.output_file_path); rm->set_enable_peft_finetuning(enable_peft_finetuning); - rm->set_disable_peft_bwd(disable_peft_bwd); FFModel model(ffconfig, ffconfig.cpu_offload); if (model_type == ModelType::LLAMA) { diff --git a/inference/python/ff_peft.py b/inference/python/ff_peft.py index 38a25fb614..caf7ce1774 100644 --- a/inference/python/ff_peft.py +++ b/inference/python/ff_peft.py @@ -109,6 +109,7 @@ def main(): ) llm.compile( generation_config, + enable_peft_finetuning = (len(configs.finetuning_dataset) > 0), max_requests_per_batch=1, max_seq_length=256, max_tokens_per_batch=64, diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index b08fdba072..ec4cacfa6d 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -1612,6 +1612,10 @@ def set_max_spec_tree_token_num(self, max_tokens): def set_max_sequence_length(self, max_length): return ffc().flexflow_request_manager_set_max_sequence_length( self.handle, max_length) + + def set_enable_peft_finetuning(self, enable_peft_finetuning): + return ffc().flexflow_request_manager_set_enable_peft_finetuning( + self.handle, enable_peft_finetuning) def start_server(self, model): return ffc().flexflow_request_manager_start_background_server( diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 1956946380..248fe55d93 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -349,6 +349,7 @@ def 
compile( max_requests_per_batch: int = 1, max_seq_length: int = 256, max_tokens_per_batch: int = 64, + enable_peft_finetuning: bool = False, model_specific_data_parallelism_degree: int = None, model_specific_tensor_parallelism_degree: int = None, model_specific_pipeline_parallelism_degree: int = None, @@ -364,6 +365,8 @@ def compile( :type max_seq_length: int, optional :param max_tokens_per_batch: The maximum number of tokens (across requests) to allow per batch, defaults to 64 :type max_tokens_per_batch: int, optional + :param enable_peft_finetuning: Whether to enable support for PEFT fine-tuning, defaults to False + :type enable_peft_finetuning: bool, optional :param model_specific_data_parallelism_degree: Use this parameter if you want to give the LLM a different data parallelism degree than the one used to initialize the runtime, defaults to None :type model_specific_data_parallelism_degree: int, optional :param model_specific_tensor_parallelism_degree: Use this parameter if you want to give the LLM a different tensor parallelism degree than the one used to initialize the runtime, defaults to None @@ -373,9 +376,6 @@ def compile( :param ssms: The SSMs to use when operating in speculative inference mode, defaults to [] :type ssms: list, optional """ - # self.max_requests_per_batch = max_requests_per_batch - # self.max_seq_length = max_seq_length - # self.max_tokens_per_batch = max_tokens_per_batch self.ssms = ssms self.generation_config = GenerationConfig() self.ffconfig = FFConfig() @@ -407,6 +407,7 @@ def compile( self.rm.set_max_requests_per_batch(max_requests_per_batch) self.rm.set_max_tokens_per_batch(max_tokens_per_batch) self.rm.set_max_sequence_length(max_seq_length) + self.rm.set_enable_peft_finetuning(enable_peft_finetuning) # Instantiate the relevant model self.model = self.model_class( @@ -560,15 +561,13 @@ def compile( max_requests_per_batch: int = 16, max_seq_length: int = 256, max_tokens_per_batch: int = 128, + enable_peft_finetuning: bool = False, model_specific_data_parallelism_degree: int = 1, model_specific_tensor_parallelism_degree: int = 1, model_specific_pipeline_parallelism_degree: int = 1, ssms: list = [], ): """Compile the SSM for inference and load the weights into memory - - :param mode: The SSM inference mode (InferenceMode.INC_DECODING_MODE for incremental decoding, InferenceMode.BEAM_SEARCH_MODE for beam search, or InferenceMode.TREE_VERIFY_MODE for token tree verification), defaults to InferenceMode.INC_DECODING_MODE - :type mode: InferenceMode, optional :param generation_config: The GenerationConfig object with the configurations to use for sampling, defaults to GenerationConfig() :type generation_config: GenerationConfig, optional :param max_requests_per_batch: The maximum batch size to allow, defaults to 16 @@ -577,6 +576,8 @@ def compile( :type max_seq_length: int, optional :param max_tokens_per_batch: The maximum number of tokens (across requests) to allow per batch, defaults to 128 :type max_tokens_per_batch: int, optional + :param enable_peft_finetuning: Whether to enable support for PEFT fine-tuning, defaults to False + :type enable_peft_finetuning: bool, optional :param model_specific_data_parallelism_degree: Use this parameter if you want to give the SSM a different data parallelism degree than the default one, defaults to 1 :type model_specific_data_parallelism_degree: int, optional :param model_specific_tensor_parallelism_degree: Use this parameter if you want to give the SSM a different tensor parallelism degree than the default one, defaults 
to 1 @@ -591,6 +592,7 @@ def compile( max_requests_per_batch, max_seq_length, max_tokens_per_batch, + enable_peft_finetuning, model_specific_data_parallelism_degree, model_specific_tensor_parallelism_degree, model_specific_pipeline_parallelism_degree, diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 3a6c18aa7b..993d1b6a0d 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -2662,6 +2662,14 @@ void flexflow_request_manager_set_max_sequence_length( DEBUG_PRINT("[RequestManager] set max_sequence_length %d", max_seq_length); } +void flexflow_request_manager_set_enable_peft_finetuning( + flexflow_request_manager_t handle_, bool enable_peft_finetuning_) { + RequestManager *handle = FFCObjectWrapper::unwrap(handle_); + handle->set_enable_peft_finetuning(enable_peft_finetuning_); + DEBUG_PRINT("[RequestManager] set_enable_peft_finetuning %d", + enable_peft_finetuning_); +} + void flexflow_request_manager_register_tokenizer( flexflow_request_manager_t handle_, enum ModelType model_type, diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 6a4d9658e0..e3c6e7c6f3 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -166,10 +166,6 @@ void RequestManager::set_enable_peft_finetuning(bool enable_peft_finetuning_) { enable_peft_finetuning = enable_peft_finetuning_; } -void RequestManager::set_disable_peft_bwd(bool disable_peft_bwd_) { - disable_peft_bwd = disable_peft_bwd_; -} - void RequestManager::set_inference_finished(bool finished) { inference_finished = finished; } @@ -2846,7 +2842,7 @@ void RequestManager::serve_incr_decoding(FFModel *llm) { BatchConfigFuture bcf = prepare_next_batch(next_batch.first, next_batch.second, ctx, runtime); FutureMap fm = im->inference(llm, 0, bcf); - if (llm->config.enable_peft && !disable_peft_bwd) { + if (llm->config.enable_peft) { im->peft_bwd(llm, 0, bcf); } assert(fm.get_future_map_domain().get_volume() == 1); diff --git a/tests/peft_test.sh b/tests/peft_test.sh index b32b69cd82..a5892fd59d 100755 --- a/tests/peft_test.sh +++ b/tests/peft_test.sh @@ -29,13 +29,13 @@ python ../inference/utils/download_peft_model.py goliaro/llama-160m-lora-full -- # CPP test ../build/inference/peft/peft \ - -ll:gpu 1 -ll:cpu 4 -ll:util 4 \ + -ll:gpu 4 -ll:cpu 4 -ll:util 4 \ + -tensor-parallelism-degree 4 \ -ll:fsize 8192 -ll:zsize 12000 \ -llm-model JackFram/llama-160m \ -finetuning-dataset ../inference/prompt/peft_dataset.json \ -peft-model goliaro/llama-160m-lora-full \ --use-full-precision \ - --inference-debugging \ --fusion \ -enable-peft From 985c2548aef8f2257bf486307dd66909ba83d7de Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 1 May 2024 22:49:30 +0000 Subject: [PATCH 176/198] add peft tests to ci --- .github/workflows/gpu-ci.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 7bdb6805a8..b5260ead05 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -174,6 +174,9 @@ jobs: # Inference tests source ./build/set_python_envs.sh ./tests/inference_tests.sh + + # PEFT tests + ./tests/peft_tests.sh - name: Save inference output as an artifact if: always() From f033b4e1860dad8d904322be7aea8cac308d5a50 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 8 May 2024 19:07:30 +0000 Subject: [PATCH 177/198] shellcheck --- tests/peft_test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/peft_test.sh b/tests/peft_test.sh index a5892fd59d..6e6147bbb0 100755 --- 
a/tests/peft_test.sh +++ b/tests/peft_test.sh @@ -14,7 +14,7 @@ fi # Create test prompt file mkdir -p ../inference/prompt echo '["Two things are infinite: "]' > ../inference/prompt/peft.json -echo "[\"“Two things are infinite: the universe and human stupidity; and I'm not sure about the universe.“\"]" > ../inference/prompt/peft_dataset.json +echo '["“Two things are infinite: the universe and human stupidity; and I'\''m not sure about the universe.“"]' > ../inference/prompt/peft_dataset.json # Create output folder From 10119279e408d46c92c071bfee099d8035a7ea03 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 9 May 2024 22:19:57 +0000 Subject: [PATCH 178/198] fix --- src/runtime/file_loader.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index fd31f21b26..c373e0da9b 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -769,6 +769,10 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, if (weight_filename != "embed_tokens_weight_lm_head") { weight_filename += weight_idx == 0 ? ".weight" : ".bias"; } + std::cout << "Loading weight file " << weight_filename << std::endl; + std::string weight_filepath = + join_path({weights_folder, weight_filename}); + load_from_file(data, volume, weight_filepath); } } From 9064c2ba40cde8b98fd765a6d5cf58df2754cd90 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 9 May 2024 22:22:46 +0000 Subject: [PATCH 179/198] fix python requirements --- conda/flexflow.yml | 1 + docker/flexflow-environment/Dockerfile | 2 +- requirements.txt | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/conda/flexflow.yml b/conda/flexflow.yml index 89421db758..091ba929e4 100644 --- a/conda/flexflow.yml +++ b/conda/flexflow.yml @@ -30,4 +30,5 @@ dependencies: - datasets - accelerate - loralib + - triton - peft diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index 84ee157302..fb4ea0ef75 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -94,7 +94,7 @@ RUN conda install pytorch torchvision torchaudio -c pytorch RUN conda install -c conda-forge onnx transformers>=4.31.0 sentencepiece einops RUN pip3 install tensorflow notebook # PEFT-related -RUN pip3 install scipy bitsandbytes datasets accelerate loralib peft +RUN pip3 install scipy bitsandbytes datasets accelerate loralib triton peft # Install Rust RUN curl https://sh.rustup.rs -sSf | sh -s -- -y diff --git a/requirements.txt b/requirements.txt index 43df6a2975..f408ce7e06 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,4 +21,5 @@ bitsandbytes datasets accelerate loralib +triton peft From a125e86090ea09e3a10d607a6e485191eb5751eb Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 10 May 2024 00:19:58 +0000 Subject: [PATCH 180/198] fix --- src/ops/lora_linear.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 170e087226..95c60d2531 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -106,6 +106,10 @@ PEFTModelID *FFModel::add_lora_layer(LoraLinearConfig const peft_config) { 1 /*outputs*/, input, output); + // fix LoRA layer's transformer layer ID and model ID + peft_layer->layer_guid.transformer_layer_id = + target_module->layer_guid.transformer_layer_id; + peft_layer->layer_guid.model_id = target_module->layer_guid.model_id; { int numdims = output->num_dims; int dims[MAX_TENSOR_DIM]; From d74fe53ef66848003367ee5c7518875f96a77f80 
Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 11 May 2024 00:09:24 +0000 Subject: [PATCH 181/198] fix --- inference/python/ff_peft.py | 2 +- .../alignment/llama_alignment_tests.ipynb | 560 +++++++++++++++--- tests/peft/hf_finetune.py | 2 +- tests/peft_test.sh | 30 +- 4 files changed, 500 insertions(+), 94 deletions(-) diff --git a/inference/python/ff_peft.py b/inference/python/ff_peft.py index caf7ce1774..657748c6a9 100644 --- a/inference/python/ff_peft.py +++ b/inference/python/ff_peft.py @@ -65,7 +65,7 @@ def get_configs(): # required parameters "base_model": "JackFram/llama-160m", "peft_model_ids": [ - "goliaro/llama-160m-lora-full", + "goliaro/llama-160m-lora", ], # optional parameters "cache_path": "", diff --git a/tests/peft/alignment/llama_alignment_tests.ipynb b/tests/peft/alignment/llama_alignment_tests.ipynb index 414280cff5..868dad18e3 100644 --- a/tests/peft/alignment/llama_alignment_tests.ipynb +++ b/tests/peft/alignment/llama_alignment_tests.ipynb @@ -15,6 +15,30 @@ "cell_type": "code", "execution_count": 2, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/usr/FlexFlow/tests/peft/hf_peft_tensors /usr/FlexFlow/build/inference_tensors\n" + ] + } + ], + "source": [ + "print(hf_path, ff_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Check weights (semi-automatically)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -81,7 +105,427 @@ "Ok!\n", "Ok!\n", "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "files_list = os.listdir(hf_path)\n", + "num_layers=12\n", + "for f in sorted(files_list):\n", + " if f.endswith(\".weight\"):\n", + " if \"self_attn\" in f:\n", + " continue\n", + " if f.endswith(\".lm_head.weight\"):\n", + " f_version = f\"fwd_step_0_layers_{num_layers-1}_lm_head_shard_0_weight_0\"\n", + " elif f == \"norm.weight\":\n", + " f_version = f\"fwd_step_0_layers_{num_layers-1}_norm_shard_0_weight_0\"\n", + " else:\n", + " f_version = \"fwd_step_0_\"\n", + " if f.startswith(\"layers.\"):\n", + " layernum = f.split(\"layers.\")[1].split(\".\")[0]\n", + " f_version += f\"layers_{layernum}_\"\n", + " f_version += f.split(\".weight\")[0].replace(\".base_layer\", \"\").replace(\".default\", \"\")\n", + " weight_index=\"0\"\n", + " if \"lora_A\" in f_version:\n", + " weight_index=\"A\"\n", + " elif \"lora_B\" in f_version:\n", + " weight_index=\"B\"\n", + " f_version = f_version.replace(\"lora_A\", \"lora\").replace(\"lora_B\", \"lora\")\n", + " f_version += f\"_shard_0_weight_{weight_index}\"\n", + " # print(f, f_version)\n", + " hf_w_path = os.path.join(hf_path, f)\n", + " ff_w_path = os.path.join(ff_path, f_version)\n", + " assert(os.path.isfile(hf_w_path))\n", + " assert(os.path.isfile(ff_w_path))\n", + " # print(\"\\t\", os.path.isfile(hf_w_path), os.path.isfile(ff_w_path))\n", + " # print(\"\\t\", ff_w_path)\n", + "\n", + " # check equivalence\n", + " compare_tensors(hf_w_path, ff_w_path, tolerance=1e-5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load model for automatic check" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "/opt/conda/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "from transformers import AutoModelForCausalLM\n", + "from peft import PeftModel, PeftConfig\n", + "use_full_precision=True\n", + "peft_model_id=\"goliaro/llama-160m-lora\"\n", + "peft_config = PeftConfig.from_pretrained(peft_model_id)\n", + "if peft_config.peft_type != \"LORA\":\n", + " raise ValueError(f\"PEFT type {peft_config.peft_type} not supported yet\")\n", + "\n", + "peft_config.init_lora_weights = (\n", + " False\n", + ") # prevent HF from re-inizialing the weights randomly\n", + "model_name = peft_config.base_model_name_or_path\n", + "# Load base model, and apply the PEFT layer\n", + "model = AutoModelForCausalLM.from_pretrained(\n", + " model_name,\n", + " torch_dtype=torch.float32 if use_full_precision else torch.float16,\n", + " device_map=\"auto\",\n", + ")\n", + "model = PeftModel.from_pretrained(model, peft_model_id, config=peft_config)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "embed_tokens\n", + "layers\n", + "layers.0\n", + "layers.0.self_attn\n", + "layers.0.self_attn.q_proj\n", + "layers.0.self_attn.k_proj\n", + "layers.0.self_attn.v_proj\n", + "layers.0.self_attn.o_proj\n", + "layers.0.self_attn.rotary_emb\n", + "layers.0.mlp\n", + "layers.0.mlp.gate_proj\n", + "layers.0.mlp.up_proj\n", + "layers.0.mlp.down_proj\n", + "layers.0.mlp.down_proj.base_layer\n", + "layers.0.mlp.down_proj.lora_dropout\n", + "layers.0.mlp.down_proj.lora_dropout.default\n", + "layers.0.mlp.down_proj.lora_A\n", + "layers.0.mlp.down_proj.lora_A.default\n", + "layers.0.mlp.down_proj.lora_B\n", + "layers.0.mlp.down_proj.lora_B.default\n", + "layers.0.mlp.down_proj.lora_embedding_A\n", + "layers.0.mlp.down_proj.lora_embedding_B\n", + "layers.0.mlp.act_fn\n", + "layers.0.input_layernorm\n", + "layers.0.post_attention_layernorm\n", + "layers.1\n", + "layers.1.self_attn\n", + "layers.1.self_attn.q_proj\n", + "layers.1.self_attn.k_proj\n", + "layers.1.self_attn.v_proj\n", + "layers.1.self_attn.o_proj\n", + "layers.1.self_attn.rotary_emb\n", + "layers.1.mlp\n", + "layers.1.mlp.gate_proj\n", + "layers.1.mlp.up_proj\n", + "layers.1.mlp.down_proj\n", + "layers.1.mlp.down_proj.base_layer\n", + "layers.1.mlp.down_proj.lora_dropout\n", + "layers.1.mlp.down_proj.lora_dropout.default\n", + "layers.1.mlp.down_proj.lora_A\n", + "layers.1.mlp.down_proj.lora_A.default\n", + "layers.1.mlp.down_proj.lora_B\n", + "layers.1.mlp.down_proj.lora_B.default\n", + "layers.1.mlp.down_proj.lora_embedding_A\n", + "layers.1.mlp.down_proj.lora_embedding_B\n", + "layers.1.mlp.act_fn\n", + "layers.1.input_layernorm\n", + "layers.1.post_attention_layernorm\n", + "layers.2\n", + "layers.2.self_attn\n", + "layers.2.self_attn.q_proj\n", + "layers.2.self_attn.k_proj\n", + "layers.2.self_attn.v_proj\n", + "layers.2.self_attn.o_proj\n", + "layers.2.self_attn.rotary_emb\n", + "layers.2.mlp\n", + "layers.2.mlp.gate_proj\n", + "layers.2.mlp.up_proj\n", + "layers.2.mlp.down_proj\n", + "layers.2.mlp.down_proj.base_layer\n", + "layers.2.mlp.down_proj.lora_dropout\n", + 
"layers.2.mlp.down_proj.lora_dropout.default\n", + "layers.2.mlp.down_proj.lora_A\n", + "layers.2.mlp.down_proj.lora_A.default\n", + "layers.2.mlp.down_proj.lora_B\n", + "layers.2.mlp.down_proj.lora_B.default\n", + "layers.2.mlp.down_proj.lora_embedding_A\n", + "layers.2.mlp.down_proj.lora_embedding_B\n", + "layers.2.mlp.act_fn\n", + "layers.2.input_layernorm\n", + "layers.2.post_attention_layernorm\n", + "layers.3\n", + "layers.3.self_attn\n", + "layers.3.self_attn.q_proj\n", + "layers.3.self_attn.k_proj\n", + "layers.3.self_attn.v_proj\n", + "layers.3.self_attn.o_proj\n", + "layers.3.self_attn.rotary_emb\n", + "layers.3.mlp\n", + "layers.3.mlp.gate_proj\n", + "layers.3.mlp.up_proj\n", + "layers.3.mlp.down_proj\n", + "layers.3.mlp.down_proj.base_layer\n", + "layers.3.mlp.down_proj.lora_dropout\n", + "layers.3.mlp.down_proj.lora_dropout.default\n", + "layers.3.mlp.down_proj.lora_A\n", + "layers.3.mlp.down_proj.lora_A.default\n", + "layers.3.mlp.down_proj.lora_B\n", + "layers.3.mlp.down_proj.lora_B.default\n", + "layers.3.mlp.down_proj.lora_embedding_A\n", + "layers.3.mlp.down_proj.lora_embedding_B\n", + "layers.3.mlp.act_fn\n", + "layers.3.input_layernorm\n", + "layers.3.post_attention_layernorm\n", + "layers.4\n", + "layers.4.self_attn\n", + "layers.4.self_attn.q_proj\n", + "layers.4.self_attn.k_proj\n", + "layers.4.self_attn.v_proj\n", + "layers.4.self_attn.o_proj\n", + "layers.4.self_attn.rotary_emb\n", + "layers.4.mlp\n", + "layers.4.mlp.gate_proj\n", + "layers.4.mlp.up_proj\n", + "layers.4.mlp.down_proj\n", + "layers.4.mlp.down_proj.base_layer\n", + "layers.4.mlp.down_proj.lora_dropout\n", + "layers.4.mlp.down_proj.lora_dropout.default\n", + "layers.4.mlp.down_proj.lora_A\n", + "layers.4.mlp.down_proj.lora_A.default\n", + "layers.4.mlp.down_proj.lora_B\n", + "layers.4.mlp.down_proj.lora_B.default\n", + "layers.4.mlp.down_proj.lora_embedding_A\n", + "layers.4.mlp.down_proj.lora_embedding_B\n", + "layers.4.mlp.act_fn\n", + "layers.4.input_layernorm\n", + "layers.4.post_attention_layernorm\n", + "layers.5\n", + "layers.5.self_attn\n", + "layers.5.self_attn.q_proj\n", + "layers.5.self_attn.k_proj\n", + "layers.5.self_attn.v_proj\n", + "layers.5.self_attn.o_proj\n", + "layers.5.self_attn.rotary_emb\n", + "layers.5.mlp\n", + "layers.5.mlp.gate_proj\n", + "layers.5.mlp.up_proj\n", + "layers.5.mlp.down_proj\n", + "layers.5.mlp.down_proj.base_layer\n", + "layers.5.mlp.down_proj.lora_dropout\n", + "layers.5.mlp.down_proj.lora_dropout.default\n", + "layers.5.mlp.down_proj.lora_A\n", + "layers.5.mlp.down_proj.lora_A.default\n", + "layers.5.mlp.down_proj.lora_B\n", + "layers.5.mlp.down_proj.lora_B.default\n", + "layers.5.mlp.down_proj.lora_embedding_A\n", + "layers.5.mlp.down_proj.lora_embedding_B\n", + "layers.5.mlp.act_fn\n", + "layers.5.input_layernorm\n", + "layers.5.post_attention_layernorm\n", + "layers.6\n", + "layers.6.self_attn\n", + "layers.6.self_attn.q_proj\n", + "layers.6.self_attn.k_proj\n", + "layers.6.self_attn.v_proj\n", + "layers.6.self_attn.o_proj\n", + "layers.6.self_attn.rotary_emb\n", + "layers.6.mlp\n", + "layers.6.mlp.gate_proj\n", + "layers.6.mlp.up_proj\n", + "layers.6.mlp.down_proj\n", + "layers.6.mlp.down_proj.base_layer\n", + "layers.6.mlp.down_proj.lora_dropout\n", + "layers.6.mlp.down_proj.lora_dropout.default\n", + "layers.6.mlp.down_proj.lora_A\n", + "layers.6.mlp.down_proj.lora_A.default\n", + "layers.6.mlp.down_proj.lora_B\n", + "layers.6.mlp.down_proj.lora_B.default\n", + "layers.6.mlp.down_proj.lora_embedding_A\n", + "layers.6.mlp.down_proj.lora_embedding_B\n", 
+ "layers.6.mlp.act_fn\n", + "layers.6.input_layernorm\n", + "layers.6.post_attention_layernorm\n", + "layers.7\n", + "layers.7.self_attn\n", + "layers.7.self_attn.q_proj\n", + "layers.7.self_attn.k_proj\n", + "layers.7.self_attn.v_proj\n", + "layers.7.self_attn.o_proj\n", + "layers.7.self_attn.rotary_emb\n", + "layers.7.mlp\n", + "layers.7.mlp.gate_proj\n", + "layers.7.mlp.up_proj\n", + "layers.7.mlp.down_proj\n", + "layers.7.mlp.down_proj.base_layer\n", + "layers.7.mlp.down_proj.lora_dropout\n", + "layers.7.mlp.down_proj.lora_dropout.default\n", + "layers.7.mlp.down_proj.lora_A\n", + "layers.7.mlp.down_proj.lora_A.default\n", + "layers.7.mlp.down_proj.lora_B\n", + "layers.7.mlp.down_proj.lora_B.default\n", + "layers.7.mlp.down_proj.lora_embedding_A\n", + "layers.7.mlp.down_proj.lora_embedding_B\n", + "layers.7.mlp.act_fn\n", + "layers.7.input_layernorm\n", + "layers.7.post_attention_layernorm\n", + "layers.8\n", + "layers.8.self_attn\n", + "layers.8.self_attn.q_proj\n", + "layers.8.self_attn.k_proj\n", + "layers.8.self_attn.v_proj\n", + "layers.8.self_attn.o_proj\n", + "layers.8.self_attn.rotary_emb\n", + "layers.8.mlp\n", + "layers.8.mlp.gate_proj\n", + "layers.8.mlp.up_proj\n", + "layers.8.mlp.down_proj\n", + "layers.8.mlp.down_proj.base_layer\n", + "layers.8.mlp.down_proj.lora_dropout\n", + "layers.8.mlp.down_proj.lora_dropout.default\n", + "layers.8.mlp.down_proj.lora_A\n", + "layers.8.mlp.down_proj.lora_A.default\n", + "layers.8.mlp.down_proj.lora_B\n", + "layers.8.mlp.down_proj.lora_B.default\n", + "layers.8.mlp.down_proj.lora_embedding_A\n", + "layers.8.mlp.down_proj.lora_embedding_B\n", + "layers.8.mlp.act_fn\n", + "layers.8.input_layernorm\n", + "layers.8.post_attention_layernorm\n", + "layers.9\n", + "layers.9.self_attn\n", + "layers.9.self_attn.q_proj\n", + "layers.9.self_attn.k_proj\n", + "layers.9.self_attn.v_proj\n", + "layers.9.self_attn.o_proj\n", + "layers.9.self_attn.rotary_emb\n", + "layers.9.mlp\n", + "layers.9.mlp.gate_proj\n", + "layers.9.mlp.up_proj\n", + "layers.9.mlp.down_proj\n", + "layers.9.mlp.down_proj.base_layer\n", + "layers.9.mlp.down_proj.lora_dropout\n", + "layers.9.mlp.down_proj.lora_dropout.default\n", + "layers.9.mlp.down_proj.lora_A\n", + "layers.9.mlp.down_proj.lora_A.default\n", + "layers.9.mlp.down_proj.lora_B\n", + "layers.9.mlp.down_proj.lora_B.default\n", + "layers.9.mlp.down_proj.lora_embedding_A\n", + "layers.9.mlp.down_proj.lora_embedding_B\n", + "layers.9.mlp.act_fn\n", + "layers.9.input_layernorm\n", + "layers.9.post_attention_layernorm\n", + "layers.10\n", + "layers.10.self_attn\n", + "layers.10.self_attn.q_proj\n", + "layers.10.self_attn.k_proj\n", + "layers.10.self_attn.v_proj\n", + "layers.10.self_attn.o_proj\n", + "layers.10.self_attn.rotary_emb\n", + "layers.10.mlp\n", + "layers.10.mlp.gate_proj\n", + "layers.10.mlp.up_proj\n", + "layers.10.mlp.down_proj\n", + "layers.10.mlp.down_proj.base_layer\n", + "layers.10.mlp.down_proj.lora_dropout\n", + "layers.10.mlp.down_proj.lora_dropout.default\n", + "layers.10.mlp.down_proj.lora_A\n", + "layers.10.mlp.down_proj.lora_A.default\n", + "layers.10.mlp.down_proj.lora_B\n", + "layers.10.mlp.down_proj.lora_B.default\n", + "layers.10.mlp.down_proj.lora_embedding_A\n", + "layers.10.mlp.down_proj.lora_embedding_B\n", + "layers.10.mlp.act_fn\n", + "layers.10.input_layernorm\n", + "layers.10.post_attention_layernorm\n", + "layers.11\n", + "layers.11.self_attn\n", + "layers.11.self_attn.q_proj\n", + "layers.11.self_attn.k_proj\n", + "layers.11.self_attn.v_proj\n", + "layers.11.self_attn.o_proj\n", + 
"layers.11.self_attn.rotary_emb\n", + "layers.11.mlp\n", + "layers.11.mlp.gate_proj\n", + "layers.11.mlp.up_proj\n", + "layers.11.mlp.down_proj\n", + "layers.11.mlp.down_proj.base_layer\n", + "layers.11.mlp.down_proj.lora_dropout\n", + "layers.11.mlp.down_proj.lora_dropout.default\n", + "layers.11.mlp.down_proj.lora_A\n", + "layers.11.mlp.down_proj.lora_A.default\n", + "layers.11.mlp.down_proj.lora_B\n", + "layers.11.mlp.down_proj.lora_B.default\n", + "layers.11.mlp.down_proj.lora_embedding_A\n", + "layers.11.mlp.down_proj.lora_embedding_B\n", + "layers.11.mlp.act_fn\n", + "layers.11.input_layernorm\n", + "layers.11.post_attention_layernorm\n", + "norm\n" + ] + } + ], + "source": [ + "named_modules = [name.replace(\"base_model.model.model.\", \"\") for name, _ in model.named_modules() if \"base_model.model.model.\" in name]\n", + "for x in named_modules:\n", + " print(x)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Manual check" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "hf_embed_input= \"/usr/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_embed_tokens.input_0\"\n", + "ff_embed_input=\"/usr/FlexFlow/tests/peft/inference_tensors/fwd_step_0_layers_0_embed_tokens_shard_0_input_0\"\n", + "compare_tensors(hf_embed_input, ff_embed_input)\n", + "hf_embed_output=\"/usr/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_embed_tokens.output_0\"\n", + "ff_embed_output=\"/usr/FlexFlow/tests/peft/inference_tensors/fwd_step_0_layers_0_embed_tokens_shard_0_output_0\"\n", + "compare_tensors(hf_embed_output, ff_embed_output)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ "Ok!\n", "Ok!\n", "Ok!\n", @@ -91,116 +535,58 @@ "Ok!\n", "Ok!\n", "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n", - "Ok!\n" + "/usr/FlexFlow/tests/peft/hf_peft_tensors/layers.0.mlp.down_proj.lora_A.default.weight True\n", + "/usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers_0_feed_forward_w2_lora_shard_0_weight_A False\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[20], line 40\u001b[0m\n\u001b[1;32m 38\u001b[0m hf_lora_A_weight_fp \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/layers.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mi\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.mlp.down_proj.lora_A.default.weight\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 39\u001b[0m ff_lora_A_weight_fp \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mff_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/fwd_step_0_layers_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mi\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_layers_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mi\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_feed_forward_w2_lora_shard_0_weight_A\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 40\u001b[0m \u001b[43mcompare_tensors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhf_lora_A_weight_fp\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mff_lora_A_weight_fp\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 41\u001b[0m hf_lora_B_weight_fp \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/layers.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mi\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.mlp.down_proj.lora_B.default.weight\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 42\u001b[0m ff_lora_B_weight_fp \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mff_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/fwd_step_0_layers_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mi\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_layers_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mi\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_feed_forward_w2_lora_shard_0_weight_B\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "File \u001b[0;32m/usr/FlexFlow/tests/peft/alignment/align_test_utils.py:24\u001b[0m, in \u001b[0;36mcompare_tensors\u001b[0;34m(hf_tensor_filepath, ff_tensor_filepath, tolerance)\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28mprint\u001b[39m(hf_tensor_filepath, os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mexists(hf_tensor_filepath))\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28mprint\u001b[39m(ff_tensor_filepath, os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mexists(ff_tensor_filepath))\n\u001b[0;32m---> 24\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 25\u001b[0m hf_tensor \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mload(hf_tensor_filepath)\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mtype\u001b[39m(hf_tensor) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mtuple\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mtype\u001b[39m(hf_tensor) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mlist\u001b[39m:\n", + "\u001b[0;31mAssertionError\u001b[0m: " ] } ], "source": [ "tot_num_layers = 12\n", "for i in range(tot_num_layers):\n", + " hf_input_ln_in = f\"{hf_path}/fwd_step_0_layers.{i}.input_layernorm.input_0\"\n", + " ff_input_ln_in = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_input_0\"\n", + " compare_tensors(hf_input_ln_in, ff_input_ln_in)\n", " hf_input_ln_out = f\"{hf_path}/fwd_step_0_layers.{i}.input_layernorm.output_0\"\n", - " ff_input_ln_out = f\"{ff_path}/fwd_step_0_layers_{i}_RMSNorm_shard_0_output_0\"\n", 
+ " ff_input_ln_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_output_0\"\n", " if i > 0:\n", " ff_input_ln_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_attention_norm_shard_0_output_1\"\n", " compare_tensors(hf_input_ln_out, ff_input_ln_out)\n", " hf_attn_out = f\"{hf_path}/fwd_step_0_layers.{i}.self_attn.o_proj.output_0\"\n", - " ff_attn_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_attention_shard_0_output_0\"\n", + " ff_attn_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.self_attn_shard_0_output_0\"\n", " compare_tensors(hf_attn_out, ff_attn_out)\n", " hf_ffn_norm_out = f\"{hf_path}/fwd_step_0_layers.{i}.post_attention_layernorm.output_0\"\n", - " ff_ffn_norm_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_ffn_norm_shard_0_output_1\"\n", + " ff_ffn_norm_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.post_attention_layernorm_shard_0_output_1\"\n", " compare_tensors(hf_ffn_norm_out, ff_ffn_norm_out)\n", " # w1\n", " hf_gate_proj_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.gate_proj.output_0\"\n", - " ff_gate_proj_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w1_shard_0_output_0\"\n", + " ff_gate_proj_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.gate_proj_shard_0_output_0\"\n", " compare_tensors(hf_gate_proj_out, ff_gate_proj_out)\n", " # w3\n", " hf_up_proj_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.up_proj.output_0\" \n", - " ff_up_proj_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w3_shard_0_output_0\"\n", + " ff_up_proj_out = f\"{ff_path}/fwd_step_0_layers_0_layers.0.mlp.up_proj_shard_0_output_0\"\n", " compare_tensors(hf_up_proj_out, ff_up_proj_out)\n", " # w2\n", " hf_down_proj_in = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.input_0\"\n", " hf_down_proj_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.output_0\"\n", - " ff_down_proj_in = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_shard_0_input_0\"\n", - " ff_down_proj_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_shard_0_output_0\"\n", + " ff_down_proj_in = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj_shard_0_input_0\"\n", + " ff_down_proj_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj_shard_0_input_0\"\n", " compare_tensors(hf_down_proj_in, ff_down_proj_in)\n", " # compare_tensors(hf_down_proj_out, ff_down_proj_out)\n", " # LORA input\n", " hf_lora_A_in = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_A.default.input_0\"\n", - " ff_lora_A_in = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_input_0\"\n", + " ff_lora_A_in = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_input_0\"\n", " compare_hf_tensors(hf_down_proj_in, hf_lora_A_in)\n", " compare_tensors(hf_lora_A_in, ff_lora_A_in)\n", " # LORA weights\n", @@ -234,7 +620,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -292,7 +678,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -375,7 +761,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -2031,7 +2417,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.4" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py index 1e0e0bd167..cccb7cf11c 100644 --- 
a/tests/peft/hf_finetune.py +++ b/tests/peft/hf_finetune.py @@ -151,7 +151,7 @@ def peft_forward_hook(module, input, output): def main(): parser = argparse.ArgumentParser() parser.add_argument( - "--peft-model-id", type=str, default="goliaro/llama-160m-lora-full" + "--peft-model-id", type=str, default="goliaro/llama-160m-lora" ) parser.add_argument("--lora-alpha", type=int, default=16) parser.add_argument("--lora-dropout", type=float, default=0.0) diff --git a/tests/peft_test.sh b/tests/peft_test.sh index 6e6147bbb0..bf9ca816e7 100755 --- a/tests/peft_test.sh +++ b/tests/peft_test.sh @@ -14,17 +14,17 @@ fi # Create test prompt file mkdir -p ../inference/prompt echo '["Two things are infinite: "]' > ../inference/prompt/peft.json -echo '["“Two things are infinite: the universe and human stupidity; and I'\''m not sure about the universe.“"]' > ../inference/prompt/peft_dataset.json +echo '["“Two things are infinite: the universe and human stupidity; and I'\''m not sure about the universe.”"]' > ../inference/prompt/peft_dataset.json # Create output folder mkdir -p ../inference/output # Enable backtrace in case we run into a segfault or assertion failure -export LEGION_BACKTRACE=1 +# export LEGION_BACKTRACE=1 # Download test model -python ../inference/utils/download_peft_model.py goliaro/llama-160m-lora-full --base_model_name JackFram/llama-160m +python ../inference/utils/download_peft_model.py goliaro/llama-160m-lora --base_model_name JackFram/llama-160m # if first time, add: --refresh-cache # CPP test @@ -34,10 +34,30 @@ python ../inference/utils/download_peft_model.py goliaro/llama-160m-lora-full -- -ll:fsize 8192 -ll:zsize 12000 \ -llm-model JackFram/llama-160m \ -finetuning-dataset ../inference/prompt/peft_dataset.json \ - -peft-model goliaro/llama-160m-lora-full \ + -peft-model goliaro/llama-160m-lora \ --use-full-precision \ --fusion \ -enable-peft -# Python test +Python test python ../inference/python/ff_peft.py + +# cd ../build +# rm -rf inference_tensors || true +# ./inference/peft/peft \ +# -ll:gpu 1 -ll:cpu 4 -ll:util 4 \ +# -tensor-parallelism-degree 1 \ +# -ll:fsize 8192 -ll:zsize 12000 \ +# -llm-model JackFram/llama-160m \ +# -finetuning-dataset ../inference/prompt/peft_dataset.json \ +# -peft-model goliaro/llama-160m-lora \ +# -enable-peft \ +# --use-full-precision \ +# --inference-debugging +# rm -rf inference_tensors/bwd_* + +# cd ../tests/peft +# rm -rf hf_peft_tensors || true +# python hf_finetune.py --peft-model-id goliaro/llama-160m-lora --save-peft-tensors --use-full-precision +# rm -rf hf_peft_tensors/bwd_* + From 0c6ae097bf2d61e508c1d13e777271b292795a74 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 17 May 2024 20:30:14 +0000 Subject: [PATCH 182/198] update ci test --- .gitignore | 1 + tests/.gitignore | 1 - tests/peft/alignment/align_test_utils.py | 8 +- tests/peft/fine_tune.sh | 19 --- tests/peft/peft_alignment_test.py | 158 +++++++++++++++++++++++ tests/peft_test.sh | 52 ++++---- 6 files changed, 190 insertions(+), 49 deletions(-) delete mode 100644 tests/.gitignore delete mode 100755 tests/peft/fine_tune.sh create mode 100644 tests/peft/peft_alignment_test.py diff --git a/.gitignore b/.gitignore index 0642faa000..cc34c1a7b6 100644 --- a/.gitignore +++ b/.gitignore @@ -188,6 +188,7 @@ python/flexflow/version.txt inference_tensors hf_peft_tensors +lora_training_logs Untitled-1.ipynb Untitled-2.ipynb diff --git a/tests/.gitignore b/tests/.gitignore deleted file mode 100644 index f3732d54f4..0000000000 --- a/tests/.gitignore +++ /dev/null @@ -1 +0,0 @@ 
-inference/python_test_configs/*.json diff --git a/tests/peft/alignment/align_test_utils.py b/tests/peft/alignment/align_test_utils.py index dbe7a0be40..24da900fcb 100644 --- a/tests/peft/alignment/align_test_utils.py +++ b/tests/peft/alignment/align_test_utils.py @@ -18,10 +18,10 @@ def print_unique_files_list(dirname): files_list.remove(f) return sorted(files_list) def compare_tensors(hf_tensor_filepath, ff_tensor_filepath, tolerance=1e-2): - if not (os.path.exists(hf_tensor_filepath) and os.path.exists(ff_tensor_filepath)): - print(hf_tensor_filepath, os.path.exists(hf_tensor_filepath)) - print(ff_tensor_filepath, os.path.exists(ff_tensor_filepath)) - assert False + if not os.path.exists(hf_tensor_filepath): + raise FileNotFoundError(f"HF tensor file: {hf_tensor_filepath} not found") + if not os.path.exists(ff_tensor_filepath): + raise FileNotFoundError(f"FF tensor file {ff_tensor_filepath} not found") hf_tensor = torch.load(hf_tensor_filepath) if type(hf_tensor) == tuple or type(hf_tensor) == list: assert(len(hf_tensor) == 1) diff --git a/tests/peft/fine_tune.sh b/tests/peft/fine_tune.sh deleted file mode 100755 index 309d87130a..0000000000 --- a/tests/peft/fine_tune.sh +++ /dev/null @@ -1,19 +0,0 @@ -#! /usr/bin/env bash -set -e -set -x - -# Cd into directory holding this script -cd "${BASH_SOURCE[0]%/*}" - -python hf_train.py --model-name decapoda-research/llama-7b-hf --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-7b-lora-full -python hf_train.py --model-name decapoda-research/llama-7b-hf --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-7b-lora-half -python hf_train.py --model-name JackFram/llama-160m --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-160m-lora-full -python hf_train.py --model-name JackFram/llama-160m --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-160m-lora-half - -python hf_train.py --model-name meta-llama/Llama-2-7b-hf --lora-target-modules down_proj --use-full-precision --publish-peft-with-id goliaro/llama-2-7b-lora-full -python hf_train.py --model-name meta-llama/Llama-2-7b-hf --lora-target-modules down_proj --publish-peft-with-id goliaro/llama-2-7b-lora-half - -python hf_train.py --model-name facebook/opt-6.7b --lora-target-modules fc2 --use-full-precision --publish-peft-with-id goliaro/opt-6.7b-lora-full -python hf_train.py --model-name facebook/opt-6.7b --lora-target-modules fc2 --publish-peft-with-id goliaro/opt-6.7b-lora-half -python hf_train.py --model-name facebook/opt-125m --lora-target-modules fc2 --use-full-precision --publish-peft-with-id goliaro/opt-125m-lora-full -python hf_train.py --model-name facebook/opt-125m --lora-target-modules fc2 --publish-peft-with-id goliaro/opt-125m-lora-half diff --git a/tests/peft/peft_alignment_test.py b/tests/peft/peft_alignment_test.py new file mode 100644 index 0000000000..f07c65140b --- /dev/null +++ b/tests/peft/peft_alignment_test.py @@ -0,0 +1,158 @@ +import numpy as np +import os, torch +from alignment.align_test_utils import * + +def convert_hf_filename_to_ff_filename(f, num_layers=12): + if f.endswith(".lm_head.weight"): + f_version = f"fwd_step_0_layers_{num_layers-1}_lm_head_shard_0_weight_0" + elif f == "norm.weight": + f_version = f"fwd_step_0_layers_{num_layers-1}_norm_shard_0_weight_0" + else: + f_version = "fwd_step_0_" + if f.startswith("layers."): + layernum = f.split("layers.")[1].split(".")[0] + f_version += f"layers_{layernum}_" + f_version += 
f.split(".weight")[0].replace(".base_layer", "").replace(".default", "") + weight_index="0" + if "lora_A" in f_version: + weight_index="A" + elif "lora_B" in f_version: + weight_index="B" + f_version = f_version.replace("lora_A", "lora").replace("lora_B", "lora") + f_version += f"_shard_0_weight_{weight_index}" + return f_version + +def check_weights_alignment(): + print("-- Weights alignment --") + files_list = os.listdir(hf_path) + num_layers=12 + for f in sorted(files_list): + if f.endswith(".weight"): + if "self_attn" in f: + continue + f_version = convert_hf_filename_to_ff_filename(f, num_layers=num_layers) + # print(f, f_version) + hf_w_path = os.path.join(hf_path, f) + ff_w_path = os.path.join(ff_path, f_version) + assert(os.path.isfile(hf_w_path)) + assert(os.path.isfile(ff_w_path)) + # print("\t", os.path.isfile(hf_w_path), os.path.isfile(ff_w_path)) + # print("\t", ff_w_path) + + # check equivalence + compare_tensors(hf_w_path, ff_w_path, tolerance=1e-5) + +def check_fwd_pass(tot_num_layers = 12): + print("-- FWD pass --") + # Transfomer head + hf_embed_input= f"{hf_path}/fwd_step_0_embed_tokens.input_0" + ff_embed_input = f"{ff_path}/fwd_step_0_layers_0_embed_tokens_shard_0_input_0" + compare_tensors(hf_embed_input, ff_embed_input) + hf_embed_output = f"{hf_path}/fwd_step_0_embed_tokens.output_0" + ff_embed_output = f"{ff_path}/fwd_step_0_layers_0_embed_tokens_shard_0_output_0" + compare_tensors(hf_embed_output, ff_embed_output) + + # Transformers blocks + for i in range(tot_num_layers): + hf_input_ln_in = f"{hf_path}/fwd_step_0_layers.{i}.input_layernorm.input_0" + ff_input_ln_in = f"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_input_0" + if i > 0: + ff_input_ln_in = f"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_output_0" + compare_tensors(hf_input_ln_in, ff_input_ln_in, tolerance=1e-5) + hf_input_ln_out = f"{hf_path}/fwd_step_0_layers.{i}.input_layernorm.output_0" + ff_input_ln_out = f"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_output_0" + if i > 0: + ff_input_ln_out = f"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_output_1" + compare_tensors(hf_input_ln_out, ff_input_ln_out, tolerance=1e-5) + hf_attn_out = f"{hf_path}/fwd_step_0_layers.{i}.self_attn.o_proj.output_0" + ff_attn_out = f"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.self_attn_shard_0_output_0" + compare_tensors(hf_attn_out, ff_attn_out, tolerance=1e-5) + hf_ffn_norm_out = f"{hf_path}/fwd_step_0_layers.{i}.post_attention_layernorm.output_0" + ff_ffn_norm_out = f"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.post_attention_layernorm_shard_0_output_1" + compare_tensors(hf_ffn_norm_out, ff_ffn_norm_out, tolerance=1e-5) + # w1 + hf_gate_proj_out = f"{hf_path}/fwd_step_0_layers.{i}.mlp.gate_proj.output_0" + ff_gate_proj_out = f"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.gate_proj_shard_0_output_0" + compare_tensors(hf_gate_proj_out, ff_gate_proj_out, tolerance=1e-5) + # w3 + hf_up_proj_out = f"{hf_path}/fwd_step_0_layers.{i}.mlp.up_proj.output_0" + ff_up_proj_out = f"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.up_proj_shard_0_output_0" + compare_tensors(hf_up_proj_out, ff_up_proj_out, tolerance=1e-5) + # w2 + hf_down_proj_in = f"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.input_0" + hf_down_proj_out = f"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.output_0" + ff_down_proj_in = f"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj_shard_0_input_0" + ff_down_proj_out = 
f"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj_shard_0_output_0" + compare_tensors(hf_down_proj_in, ff_down_proj_in) + # compare_tensors(hf_down_proj_out, ff_down_proj_out) + # LORA input + hf_lora_A_in = f"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_A.default.input_0" + ff_lora_A_in = f"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_input_0" + compare_hf_tensors(hf_down_proj_in, hf_lora_A_in) + compare_tensors(hf_lora_A_in, ff_lora_A_in) + # LORA weights + hf_lora_A_weight_fp = f"{hf_path}/layers.{i}.mlp.down_proj.lora_A.default.weight" + ff_lora_A_weight_fp = f"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_weight_A" + compare_tensors(hf_lora_A_weight_fp, ff_lora_A_weight_fp) + hf_lora_B_weight_fp = f"{hf_path}/layers.{i}.mlp.down_proj.lora_B.default.weight" + ff_lora_B_weight_fp = f"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_weight_B" + compare_tensors(hf_lora_B_weight_fp, ff_lora_B_weight_fp) + # LORA intermediate hf + hf_lora_A_out = f"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_A.default.output_0" + hf_lora_B_in = f"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_B.default.input_0" + compare_hf_tensors(hf_lora_A_out, hf_lora_B_in) + # LORA output + hf_lora_out = f"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_B.default.output_0" + ff_lora_out = f"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_output_0" + # compare_tensors(hf_lora_out, ff_lora_out) + # compare_flexflow_tensors(ff_down_proj_out, ff_lora_out) + # compare_tensors(hf_down_proj_out, ff_lora_out) + compare_tensors_difference(hf_lora_out, ff_lora_out, ff_down_proj_out) + + + # After last layer only + hf_norm_out = f"{hf_path}/fwd_step_0_norm.output_0" + ff_norm_out = f"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_norm_shard_0_output_1" + compare_tensors(hf_norm_out, ff_norm_out, tolerance=1e-5) + hf_lm_head_out = f"{hf_path}/fwd_step_0_base_model.model.lm_head.output_0" + ff_lm_head_out = f"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_lm_head_shard_0_output_0" + compare_tensors(hf_lm_head_out, ff_lm_head_out, tolerance=1e-5) + +def check_bwd_pass(tot_num_layers = 12): + # ff_BWD_softmax_in = f"{ff_path}/model_0_bwd-step_0_layer-num_100_layer-name_Softmax_shard-id_0_input_0" + print("-- LM head --") + hf_BWD_lm_head_out = f"{hf_path}/bwd_step_0_base_model.model.lm_head.go_0" + ff_BWD_lm_head_out = f"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_output_shard_0_output_0" + compare_tensors(hf_BWD_lm_head_out, ff_BWD_lm_head_out, tolerance=1e-5) + # compare weights + hf_lm_head_weight = f"{hf_path}/base_model.model.lm_head.weight" + ff_lm_head_weight = f"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_output_shard_0_weight_0" + compare_tensors(hf_lm_head_weight, ff_lm_head_weight, tolerance=1e-5) + hf_BWD_lm_head_in = f"{hf_path}/bwd_step_0_base_model.model.lm_head.gi_0" + ff_BWD_lm_head_in = f"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_output_shard_0_input_0" + compare_tensors(hf_BWD_lm_head_in, ff_BWD_lm_head_in, tolerance=1e-5) + # # Manually check the matmul + # ff_tensor_out = np.loadtxt(ff_BWD_lm_head_out, delimiter=',') + # ff_weight = np.loadtxt(ff_lm_head_weight, delimiter=',').reshape((4096,32000), order='F') + # ff_tensor_out = ff_tensor_out[:32000*24].reshape((32000,24), order='F') + # print(ff_tensor_out.shape) + # print(ff_weight.shape) + # print(np.matmul(ff_weight, ff_tensor_out)) + # compare_tensors(hf_BWD_lm_head_in, ff_BWD_lm_head_in) + # ff_tensor = 
np.loadtxt(ff_tensor_filepath, delimiter=',') + print("-- Final Norm --") + hf_BWD_norm_out = f"{hf_path}/bwd_step_0_norm.go_0" + ff_BWD_norm_out = f"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_norm_shard_0_output_0" + compare_hf_tensors(hf_BWD_lm_head_in, hf_BWD_norm_out) + compare_tensors(hf_BWD_norm_out, ff_BWD_norm_out) + ff_BWD_norm_weight = f"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_norm_shard_0_weight_0" + hf_FWD_norm_weight = f"{hf_path}/norm.weight" + compare_tensors(hf_FWD_norm_weight, ff_BWD_norm_weight, tolerance=1e-5) + hf_BWD_norm_in = f"{hf_path}/bwd_step_0_norm.gi_0" + ff_BWD_norm_in = f"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_norm_shard_0_input_1" + compare_tensors(hf_BWD_norm_in, ff_BWD_norm_in, tolerance=1e-5) + +if __name__ == "__main__": + check_weights_alignment() + check_fwd_pass() + check_bwd_pass() diff --git a/tests/peft_test.sh b/tests/peft_test.sh index bf9ca816e7..219b82342a 100755 --- a/tests/peft_test.sh +++ b/tests/peft_test.sh @@ -27,37 +27,39 @@ mkdir -p ../inference/output python ../inference/utils/download_peft_model.py goliaro/llama-160m-lora --base_model_name JackFram/llama-160m # if first time, add: --refresh-cache -# CPP test -../build/inference/peft/peft \ - -ll:gpu 4 -ll:cpu 4 -ll:util 4 \ - -tensor-parallelism-degree 4 \ - -ll:fsize 8192 -ll:zsize 12000 \ - -llm-model JackFram/llama-160m \ - -finetuning-dataset ../inference/prompt/peft_dataset.json \ - -peft-model goliaro/llama-160m-lora \ - --use-full-precision \ - --fusion \ - -enable-peft - -Python test -python ../inference/python/ff_peft.py - -# cd ../build -# rm -rf inference_tensors || true -# ./inference/peft/peft \ -# -ll:gpu 1 -ll:cpu 4 -ll:util 4 \ -# -tensor-parallelism-degree 1 \ +# # CPP test +# ../build/inference/peft/peft \ +# -ll:gpu 4 -ll:cpu 4 -ll:util 4 \ +# -tensor-parallelism-degree 4 \ # -ll:fsize 8192 -ll:zsize 12000 \ # -llm-model JackFram/llama-160m \ # -finetuning-dataset ../inference/prompt/peft_dataset.json \ # -peft-model goliaro/llama-160m-lora \ -# -enable-peft \ # --use-full-precision \ -# --inference-debugging +# --fusion \ +# -enable-peft + +# # Python test +# python ../inference/python/ff_peft.py + +cd ../build +rm -rf inference_tensors || true +./inference/peft/peft \ + -ll:gpu 1 -ll:cpu 4 -ll:util 4 \ + -tensor-parallelism-degree 1 \ + -ll:fsize 8192 -ll:zsize 12000 \ + -llm-model JackFram/llama-160m \ + -finetuning-dataset ../inference/prompt/peft_dataset.json \ + -peft-model goliaro/llama-160m-lora \ + -enable-peft \ + --use-full-precision \ + --inference-debugging # rm -rf inference_tensors/bwd_* -# cd ../tests/peft -# rm -rf hf_peft_tensors || true -# python hf_finetune.py --peft-model-id goliaro/llama-160m-lora --save-peft-tensors --use-full-precision +cd ../tests/peft +rm -rf hf_peft_tensors || true +python hf_finetune.py --peft-model-id goliaro/llama-160m-lora --save-peft-tensors --use-full-precision # rm -rf hf_peft_tensors/bwd_* + +python peft_alignment_test.py From 93b6032b29f92e3be42aafea3f822722f6fbbeb4 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 17 May 2024 20:30:51 +0000 Subject: [PATCH 183/198] update alignment doc --- .../alignment/llama_alignment_tests.ipynb | 892 +++++++++++------- 1 file changed, 559 insertions(+), 333 deletions(-) diff --git a/tests/peft/alignment/llama_alignment_tests.ipynb b/tests/peft/alignment/llama_alignment_tests.ipynb index 868dad18e3..86a4ef76c4 100644 --- a/tests/peft/alignment/llama_alignment_tests.ipynb +++ b/tests/peft/alignment/llama_alignment_tests.ipynb @@ -37,7 +37,7 @@ }, { 
"cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -110,29 +110,33 @@ } ], "source": [ + "def convert_hf_filename_to_ff_filename(f, num_layers=12):\n", + " if f.endswith(\".lm_head.weight\"):\n", + " f_version = f\"fwd_step_0_layers_{num_layers-1}_lm_head_shard_0_weight_0\"\n", + " elif f == \"norm.weight\":\n", + " f_version = f\"fwd_step_0_layers_{num_layers-1}_norm_shard_0_weight_0\"\n", + " else:\n", + " f_version = \"fwd_step_0_\"\n", + " if f.startswith(\"layers.\"):\n", + " layernum = f.split(\"layers.\")[1].split(\".\")[0]\n", + " f_version += f\"layers_{layernum}_\"\n", + " f_version += f.split(\".weight\")[0].replace(\".base_layer\", \"\").replace(\".default\", \"\")\n", + " weight_index=\"0\"\n", + " if \"lora_A\" in f_version:\n", + " weight_index=\"A\"\n", + " elif \"lora_B\" in f_version:\n", + " weight_index=\"B\"\n", + " f_version = f_version.replace(\"lora_A\", \"lora\").replace(\"lora_B\", \"lora\")\n", + " f_version += f\"_shard_0_weight_{weight_index}\"\n", + " return f_version\n", + "\n", "files_list = os.listdir(hf_path)\n", "num_layers=12\n", "for f in sorted(files_list):\n", " if f.endswith(\".weight\"):\n", " if \"self_attn\" in f:\n", " continue\n", - " if f.endswith(\".lm_head.weight\"):\n", - " f_version = f\"fwd_step_0_layers_{num_layers-1}_lm_head_shard_0_weight_0\"\n", - " elif f == \"norm.weight\":\n", - " f_version = f\"fwd_step_0_layers_{num_layers-1}_norm_shard_0_weight_0\"\n", - " else:\n", - " f_version = \"fwd_step_0_\"\n", - " if f.startswith(\"layers.\"):\n", - " layernum = f.split(\"layers.\")[1].split(\".\")[0]\n", - " f_version += f\"layers_{layernum}_\"\n", - " f_version += f.split(\".weight\")[0].replace(\".base_layer\", \"\").replace(\".default\", \"\")\n", - " weight_index=\"0\"\n", - " if \"lora_A\" in f_version:\n", - " weight_index=\"A\"\n", - " elif \"lora_B\" in f_version:\n", - " weight_index=\"B\"\n", - " f_version = f_version.replace(\"lora_A\", \"lora\").replace(\"lora_B\", \"lora\")\n", - " f_version += f\"_shard_0_weight_{weight_index}\"\n", + " f_version = convert_hf_filename_to_ff_filename(f, num_layers=num_layers)\n", " # print(f, f_version)\n", " hf_w_path = os.path.join(hf_path, f)\n", " ff_w_path = os.path.join(ff_path, f_version)\n", @@ -154,7 +158,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -192,299 +196,369 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "embed_tokens True True\n", + "layers.0.self_attn.q_proj True True\n", + "layers.0.self_attn.k_proj True True\n", + "layers.0.self_attn.v_proj True True\n", + "layers.0.self_attn.o_proj True True\n", + "layers.0.self_attn.rotary_emb True True\n", + "layers.0.mlp.gate_proj True True\n", + "layers.0.mlp.up_proj True True\n", + "layers.0.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.0.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + 
"layers.0.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.0.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.0.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.0.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.0.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.act_fn_shard_0_output_0\n", + "layers.0.input_layernorm True True\n", + "layers.0.post_attention_layernorm True True\n", + "layers.1.self_attn.q_proj True True\n", + "layers.1.self_attn.k_proj True True\n", + "layers.1.self_attn.v_proj True True\n", + "layers.1.self_attn.o_proj True True\n", + "layers.1.self_attn.rotary_emb True True\n", + "layers.1.mlp.gate_proj True True\n", + "layers.1.mlp.up_proj True True\n", + "layers.1.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.1.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.1.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.1.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.1.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.1.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_embedding_B_shard_0_input_0 
/usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.1.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.act_fn_shard_0_output_0\n", + "layers.1.input_layernorm True True\n", + "layers.1.post_attention_layernorm True True\n", + "layers.2.self_attn.q_proj True True\n", + "layers.2.self_attn.k_proj True True\n", + "layers.2.self_attn.v_proj True True\n", + "layers.2.self_attn.o_proj True True\n", + "layers.2.self_attn.rotary_emb True True\n", + "layers.2.mlp.gate_proj True True\n", + "layers.2.mlp.up_proj True True\n", + "layers.2.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.2.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.2.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.2.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.2.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.2.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.2.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.act_fn_shard_0_output_0\n", + "layers.2.input_layernorm True True\n", + "layers.2.post_attention_layernorm True True\n", + "layers.3.self_attn.q_proj True True\n", + "layers.3.self_attn.k_proj True True\n", + "layers.3.self_attn.v_proj True True\n", + "layers.3.self_attn.o_proj True True\n", + "layers.3.self_attn.rotary_emb True True\n", + "layers.3.mlp.gate_proj True True\n", + "layers.3.mlp.up_proj True True\n", + "layers.3.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.3.mlp.down_proj.lora_dropout.default True False\n", + "\t 
/usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.3.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.3.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.3.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.3.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.3.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.act_fn_shard_0_output_0\n", + "layers.3.input_layernorm True True\n", + "layers.3.post_attention_layernorm True True\n", + "layers.4.self_attn.q_proj True True\n", + "layers.4.self_attn.k_proj True True\n", + "layers.4.self_attn.v_proj True True\n", + "layers.4.self_attn.o_proj True True\n", + "layers.4.self_attn.rotary_emb True True\n", + "layers.4.mlp.gate_proj True True\n", + "layers.4.mlp.up_proj True True\n", + "layers.4.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.4.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.4.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.4.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.4.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + 
"layers.4.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.4.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.act_fn_shard_0_output_0\n", + "layers.4.input_layernorm True True\n", + "layers.4.post_attention_layernorm True True\n", + "layers.5.self_attn.q_proj True True\n", + "layers.5.self_attn.k_proj True True\n", + "layers.5.self_attn.v_proj True True\n", + "layers.5.self_attn.o_proj True True\n", + "layers.5.self_attn.rotary_emb True True\n", + "layers.5.mlp.gate_proj True True\n", + "layers.5.mlp.up_proj True True\n", + "layers.5.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.5.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.5.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.5.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.5.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.5.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.5.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.act_fn_shard_0_output_0\n", + "layers.5.input_layernorm True True\n", + "layers.5.post_attention_layernorm True True\n", + "layers.6.self_attn.q_proj True True\n", + "layers.6.self_attn.k_proj True True\n", + "layers.6.self_attn.v_proj True True\n", + "layers.6.self_attn.o_proj True True\n", + "layers.6.self_attn.rotary_emb True True\n", + "layers.6.mlp.gate_proj True True\n", + "layers.6.mlp.up_proj True True\n", + "layers.6.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.base_layer_shard_0_input_0 
/usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.6.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.6.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.6.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.6.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.6.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.6.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.act_fn_shard_0_output_0\n", + "layers.6.input_layernorm True True\n", + "layers.6.post_attention_layernorm True True\n", + "layers.7.self_attn.q_proj True True\n", + "layers.7.self_attn.k_proj True True\n", + "layers.7.self_attn.v_proj True True\n", + "layers.7.self_attn.o_proj True True\n", + "layers.7.self_attn.rotary_emb True True\n", + "layers.7.mlp.gate_proj True True\n", + "layers.7.mlp.up_proj True True\n", + "layers.7.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.7.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.7.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.7.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.7.mlp.down_proj.lora_embedding_A False False\n", + "\t 
/usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.7.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.7.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.act_fn_shard_0_output_0\n", + "layers.7.input_layernorm True True\n", + "layers.7.post_attention_layernorm True True\n", + "layers.8.self_attn.q_proj True True\n", + "layers.8.self_attn.k_proj True True\n", + "layers.8.self_attn.v_proj True True\n", + "layers.8.self_attn.o_proj True True\n", + "layers.8.self_attn.rotary_emb True True\n", + "layers.8.mlp.gate_proj True True\n", + "layers.8.mlp.up_proj True True\n", + "layers.8.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.8.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.8.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.8.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.8.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.8.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.8.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.act_fn_shard_0_output_0\n", + "layers.8.input_layernorm True True\n", + "layers.8.post_attention_layernorm True True\n", + "layers.9.self_attn.q_proj True True\n", + "layers.9.self_attn.k_proj True True\n", + "layers.9.self_attn.v_proj True True\n", + "layers.9.self_attn.o_proj True True\n", + "layers.9.self_attn.rotary_emb True True\n", + "layers.9.mlp.gate_proj True True\n", + "layers.9.mlp.up_proj True True\n", + 
"layers.9.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.9.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.9.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.9.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.9.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.9.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.9.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.act_fn_shard_0_output_0\n", + "layers.9.input_layernorm True True\n", + "layers.9.post_attention_layernorm True True\n", + "layers.10.self_attn.q_proj True True\n", + "layers.10.self_attn.k_proj True True\n", + "layers.10.self_attn.v_proj True True\n", + "layers.10.self_attn.o_proj True True\n", + "layers.10.self_attn.rotary_emb True True\n", + "layers.10.mlp.gate_proj True True\n", + "layers.10.mlp.up_proj True True\n", + "layers.10.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.10.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.10.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.10.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_B.default_shard_0_input_0 
/usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.10.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.10.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.10.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.act_fn_shard_0_output_0\n", + "layers.10.input_layernorm True True\n", + "layers.10.post_attention_layernorm True True\n", + "layers.11.self_attn.q_proj True True\n", + "layers.11.self_attn.k_proj True True\n", + "layers.11.self_attn.v_proj True True\n", + "layers.11.self_attn.o_proj True True\n", + "layers.11.self_attn.rotary_emb True True\n", + "layers.11.mlp.gate_proj True True\n", + "layers.11.mlp.up_proj True True\n", + "layers.11.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.11.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.11.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.11.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.11.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.11.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.11.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.act_fn_shard_0_output_0\n", + "layers.11.input_layernorm True True\n", + "layers.11.post_attention_layernorm True True\n", + "norm True True\n", + "lm_head True True\n" + ] + } + 
], + "source": [ + "named_modules_ = [\n", + " name.replace(\"base_model.model.model.\", \"\").replace(\"base_model.model.model\", \"\").replace(\"base_model.model.\", \"\").replace(\"base_model.model\", \"\").replace(\"base_model.\", \"\").replace(\"base_model\", \"\")\n", + " for name, _ in model.named_modules()\n", + "]\n", + "\n", + "def remove_prefixes(named_modules):\n", + " i = 0\n", + " while i < len(named_modules) - 1:\n", + " if named_modules[i + 1].startswith(named_modules[i]):\n", + " named_modules.pop(i)\n", + " else:\n", + " i += 1\n", + " return named_modules\n", + "named_modules = remove_prefixes(named_modules_)\n", + "\n", + "def convert_hf_module_name_to_ff_filenames(n, num_layers=12):\n", + " if n == \"embed_tokens\":\n", + " ff_in_name = \"fwd_step_0_layers_0_embed_tokens_shard_0_input_0\"\n", + " ff_out_name = \"fwd_step_0_layers_0_embed_tokens_shard_0_output_0\"\n", + " elif n == \"lm_head\" or n == \"norm\":\n", + " ff_in_name = f\"fwd_step_0_layers_{num_layers-1}_{n}_shard_0_input_0\"\n", + " ff_out_name = f\"fwd_step_0_layers_{num_layers-1}_{n}_shard_0_output_0\"\n", + " elif n.startswith(\"layers.\"):\n", + " layernum = n.split(\"layers.\")[1].split(\".\")[0]\n", + " ff_in_name = f\"fwd_step_0_layers_{layernum}_{n}_shard_0_input_0\"\n", + " ff_out_name = f\"fwd_step_0_layers_{layernum}_{n}_shard_0_output_0\"\n", + " else:\n", + " assert False, f\"Module {n} not supported yet\"\n", + " return os.path.join(ff_path, ff_in_name), os.path.join(ff_path, ff_out_name)\n", + "\n", + "# Compute the hf path, check if the input and output are there\n", + "for n in named_modules:\n", + " in_name = f\"fwd_step_0_{n}.input_0\"\n", + " out_name = f\"fwd_step_0_{n}.output_0\"\n", + " if n == \"lm_head\":\n", + " in_name = f\"fwd_step_0_base_model.model.{n}.input_0\"\n", + " out_name = f\"fwd_step_0_base_model.model.{n}.output_0\"\n", + " hf_mod_in = os.path.join(hf_path, in_name)\n", + " hf_mod_out = os.path.join(hf_path, out_name)\n", + " check = os.path.exists(hf_mod_in) and os.path.exists(hf_mod_out)\n", + " \n", + " check2=True\n", + " if \"self_attn\" not in n:\n", + " ff_mod_in, ff_mod_out = convert_hf_module_name_to_ff_filenames(n, num_layers=num_layers)\n", + " check2 = os.path.exists(ff_mod_in) and os.path.exists(ff_mod_out)\n", + " print(n, check, check2)\n", + " if not check2:\n", + " print(\"\\t\", ff_mod_in, ff_mod_out)\n", + " # print(n, check)\n", + " # print(\"\\t\", )\n", + " \n", + "\n", + "# Compute the corresponding ff path, check if the input and output are there\n", + "\n", + "# for x in named_modules:\n", + "# print(x)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "embed_tokens\n", - "layers\n", - "layers.0\n", - "layers.0.self_attn\n", - "layers.0.self_attn.q_proj\n", - "layers.0.self_attn.k_proj\n", - "layers.0.self_attn.v_proj\n", - "layers.0.self_attn.o_proj\n", - "layers.0.self_attn.rotary_emb\n", - "layers.0.mlp\n", - "layers.0.mlp.gate_proj\n", - "layers.0.mlp.up_proj\n", - "layers.0.mlp.down_proj\n", - "layers.0.mlp.down_proj.base_layer\n", - "layers.0.mlp.down_proj.lora_dropout\n", - "layers.0.mlp.down_proj.lora_dropout.default\n", - "layers.0.mlp.down_proj.lora_A\n", - "layers.0.mlp.down_proj.lora_A.default\n", - "layers.0.mlp.down_proj.lora_B\n", - "layers.0.mlp.down_proj.lora_B.default\n", - "layers.0.mlp.down_proj.lora_embedding_A\n", - "layers.0.mlp.down_proj.lora_embedding_B\n", - "layers.0.mlp.act_fn\n", - "layers.0.input_layernorm\n", - 
"layers.0.post_attention_layernorm\n", - "layers.1\n", - "layers.1.self_attn\n", - "layers.1.self_attn.q_proj\n", - "layers.1.self_attn.k_proj\n", - "layers.1.self_attn.v_proj\n", - "layers.1.self_attn.o_proj\n", - "layers.1.self_attn.rotary_emb\n", - "layers.1.mlp\n", - "layers.1.mlp.gate_proj\n", - "layers.1.mlp.up_proj\n", - "layers.1.mlp.down_proj\n", - "layers.1.mlp.down_proj.base_layer\n", - "layers.1.mlp.down_proj.lora_dropout\n", - "layers.1.mlp.down_proj.lora_dropout.default\n", - "layers.1.mlp.down_proj.lora_A\n", - "layers.1.mlp.down_proj.lora_A.default\n", - "layers.1.mlp.down_proj.lora_B\n", - "layers.1.mlp.down_proj.lora_B.default\n", - "layers.1.mlp.down_proj.lora_embedding_A\n", - "layers.1.mlp.down_proj.lora_embedding_B\n", - "layers.1.mlp.act_fn\n", - "layers.1.input_layernorm\n", - "layers.1.post_attention_layernorm\n", - "layers.2\n", - "layers.2.self_attn\n", - "layers.2.self_attn.q_proj\n", - "layers.2.self_attn.k_proj\n", - "layers.2.self_attn.v_proj\n", - "layers.2.self_attn.o_proj\n", - "layers.2.self_attn.rotary_emb\n", - "layers.2.mlp\n", - "layers.2.mlp.gate_proj\n", - "layers.2.mlp.up_proj\n", - "layers.2.mlp.down_proj\n", - "layers.2.mlp.down_proj.base_layer\n", - "layers.2.mlp.down_proj.lora_dropout\n", - "layers.2.mlp.down_proj.lora_dropout.default\n", - "layers.2.mlp.down_proj.lora_A\n", - "layers.2.mlp.down_proj.lora_A.default\n", - "layers.2.mlp.down_proj.lora_B\n", - "layers.2.mlp.down_proj.lora_B.default\n", - "layers.2.mlp.down_proj.lora_embedding_A\n", - "layers.2.mlp.down_proj.lora_embedding_B\n", - "layers.2.mlp.act_fn\n", - "layers.2.input_layernorm\n", - "layers.2.post_attention_layernorm\n", - "layers.3\n", - "layers.3.self_attn\n", - "layers.3.self_attn.q_proj\n", - "layers.3.self_attn.k_proj\n", - "layers.3.self_attn.v_proj\n", - "layers.3.self_attn.o_proj\n", - "layers.3.self_attn.rotary_emb\n", - "layers.3.mlp\n", - "layers.3.mlp.gate_proj\n", - "layers.3.mlp.up_proj\n", - "layers.3.mlp.down_proj\n", - "layers.3.mlp.down_proj.base_layer\n", - "layers.3.mlp.down_proj.lora_dropout\n", - "layers.3.mlp.down_proj.lora_dropout.default\n", - "layers.3.mlp.down_proj.lora_A\n", - "layers.3.mlp.down_proj.lora_A.default\n", - "layers.3.mlp.down_proj.lora_B\n", - "layers.3.mlp.down_proj.lora_B.default\n", - "layers.3.mlp.down_proj.lora_embedding_A\n", - "layers.3.mlp.down_proj.lora_embedding_B\n", - "layers.3.mlp.act_fn\n", - "layers.3.input_layernorm\n", - "layers.3.post_attention_layernorm\n", - "layers.4\n", - "layers.4.self_attn\n", - "layers.4.self_attn.q_proj\n", - "layers.4.self_attn.k_proj\n", - "layers.4.self_attn.v_proj\n", - "layers.4.self_attn.o_proj\n", - "layers.4.self_attn.rotary_emb\n", - "layers.4.mlp\n", - "layers.4.mlp.gate_proj\n", - "layers.4.mlp.up_proj\n", - "layers.4.mlp.down_proj\n", - "layers.4.mlp.down_proj.base_layer\n", - "layers.4.mlp.down_proj.lora_dropout\n", - "layers.4.mlp.down_proj.lora_dropout.default\n", - "layers.4.mlp.down_proj.lora_A\n", - "layers.4.mlp.down_proj.lora_A.default\n", - "layers.4.mlp.down_proj.lora_B\n", - "layers.4.mlp.down_proj.lora_B.default\n", - "layers.4.mlp.down_proj.lora_embedding_A\n", - "layers.4.mlp.down_proj.lora_embedding_B\n", - "layers.4.mlp.act_fn\n", - "layers.4.input_layernorm\n", - "layers.4.post_attention_layernorm\n", - "layers.5\n", - "layers.5.self_attn\n", - "layers.5.self_attn.q_proj\n", - "layers.5.self_attn.k_proj\n", - "layers.5.self_attn.v_proj\n", - "layers.5.self_attn.o_proj\n", - "layers.5.self_attn.rotary_emb\n", - "layers.5.mlp\n", - "layers.5.mlp.gate_proj\n", - 
"layers.5.mlp.up_proj\n", - "layers.5.mlp.down_proj\n", - "layers.5.mlp.down_proj.base_layer\n", - "layers.5.mlp.down_proj.lora_dropout\n", - "layers.5.mlp.down_proj.lora_dropout.default\n", - "layers.5.mlp.down_proj.lora_A\n", - "layers.5.mlp.down_proj.lora_A.default\n", - "layers.5.mlp.down_proj.lora_B\n", - "layers.5.mlp.down_proj.lora_B.default\n", - "layers.5.mlp.down_proj.lora_embedding_A\n", - "layers.5.mlp.down_proj.lora_embedding_B\n", - "layers.5.mlp.act_fn\n", - "layers.5.input_layernorm\n", - "layers.5.post_attention_layernorm\n", - "layers.6\n", - "layers.6.self_attn\n", - "layers.6.self_attn.q_proj\n", - "layers.6.self_attn.k_proj\n", - "layers.6.self_attn.v_proj\n", - "layers.6.self_attn.o_proj\n", - "layers.6.self_attn.rotary_emb\n", - "layers.6.mlp\n", - "layers.6.mlp.gate_proj\n", - "layers.6.mlp.up_proj\n", - "layers.6.mlp.down_proj\n", - "layers.6.mlp.down_proj.base_layer\n", - "layers.6.mlp.down_proj.lora_dropout\n", - "layers.6.mlp.down_proj.lora_dropout.default\n", - "layers.6.mlp.down_proj.lora_A\n", - "layers.6.mlp.down_proj.lora_A.default\n", - "layers.6.mlp.down_proj.lora_B\n", - "layers.6.mlp.down_proj.lora_B.default\n", - "layers.6.mlp.down_proj.lora_embedding_A\n", - "layers.6.mlp.down_proj.lora_embedding_B\n", - "layers.6.mlp.act_fn\n", - "layers.6.input_layernorm\n", - "layers.6.post_attention_layernorm\n", - "layers.7\n", - "layers.7.self_attn\n", - "layers.7.self_attn.q_proj\n", - "layers.7.self_attn.k_proj\n", - "layers.7.self_attn.v_proj\n", - "layers.7.self_attn.o_proj\n", - "layers.7.self_attn.rotary_emb\n", - "layers.7.mlp\n", - "layers.7.mlp.gate_proj\n", - "layers.7.mlp.up_proj\n", - "layers.7.mlp.down_proj\n", - "layers.7.mlp.down_proj.base_layer\n", - "layers.7.mlp.down_proj.lora_dropout\n", - "layers.7.mlp.down_proj.lora_dropout.default\n", - "layers.7.mlp.down_proj.lora_A\n", - "layers.7.mlp.down_proj.lora_A.default\n", - "layers.7.mlp.down_proj.lora_B\n", - "layers.7.mlp.down_proj.lora_B.default\n", - "layers.7.mlp.down_proj.lora_embedding_A\n", - "layers.7.mlp.down_proj.lora_embedding_B\n", - "layers.7.mlp.act_fn\n", - "layers.7.input_layernorm\n", - "layers.7.post_attention_layernorm\n", - "layers.8\n", - "layers.8.self_attn\n", - "layers.8.self_attn.q_proj\n", - "layers.8.self_attn.k_proj\n", - "layers.8.self_attn.v_proj\n", - "layers.8.self_attn.o_proj\n", - "layers.8.self_attn.rotary_emb\n", - "layers.8.mlp\n", - "layers.8.mlp.gate_proj\n", - "layers.8.mlp.up_proj\n", - "layers.8.mlp.down_proj\n", - "layers.8.mlp.down_proj.base_layer\n", - "layers.8.mlp.down_proj.lora_dropout\n", - "layers.8.mlp.down_proj.lora_dropout.default\n", - "layers.8.mlp.down_proj.lora_A\n", - "layers.8.mlp.down_proj.lora_A.default\n", - "layers.8.mlp.down_proj.lora_B\n", - "layers.8.mlp.down_proj.lora_B.default\n", - "layers.8.mlp.down_proj.lora_embedding_A\n", - "layers.8.mlp.down_proj.lora_embedding_B\n", - "layers.8.mlp.act_fn\n", - "layers.8.input_layernorm\n", - "layers.8.post_attention_layernorm\n", - "layers.9\n", - "layers.9.self_attn\n", - "layers.9.self_attn.q_proj\n", - "layers.9.self_attn.k_proj\n", - "layers.9.self_attn.v_proj\n", - "layers.9.self_attn.o_proj\n", - "layers.9.self_attn.rotary_emb\n", - "layers.9.mlp\n", - "layers.9.mlp.gate_proj\n", - "layers.9.mlp.up_proj\n", - "layers.9.mlp.down_proj\n", - "layers.9.mlp.down_proj.base_layer\n", - "layers.9.mlp.down_proj.lora_dropout\n", - "layers.9.mlp.down_proj.lora_dropout.default\n", - "layers.9.mlp.down_proj.lora_A\n", - "layers.9.mlp.down_proj.lora_A.default\n", - 
"layers.9.mlp.down_proj.lora_B\n", - "layers.9.mlp.down_proj.lora_B.default\n", - "layers.9.mlp.down_proj.lora_embedding_A\n", - "layers.9.mlp.down_proj.lora_embedding_B\n", - "layers.9.mlp.act_fn\n", - "layers.9.input_layernorm\n", - "layers.9.post_attention_layernorm\n", - "layers.10\n", - "layers.10.self_attn\n", - "layers.10.self_attn.q_proj\n", - "layers.10.self_attn.k_proj\n", - "layers.10.self_attn.v_proj\n", - "layers.10.self_attn.o_proj\n", - "layers.10.self_attn.rotary_emb\n", - "layers.10.mlp\n", - "layers.10.mlp.gate_proj\n", - "layers.10.mlp.up_proj\n", - "layers.10.mlp.down_proj\n", - "layers.10.mlp.down_proj.base_layer\n", - "layers.10.mlp.down_proj.lora_dropout\n", - "layers.10.mlp.down_proj.lora_dropout.default\n", - "layers.10.mlp.down_proj.lora_A\n", - "layers.10.mlp.down_proj.lora_A.default\n", - "layers.10.mlp.down_proj.lora_B\n", - "layers.10.mlp.down_proj.lora_B.default\n", - "layers.10.mlp.down_proj.lora_embedding_A\n", - "layers.10.mlp.down_proj.lora_embedding_B\n", - "layers.10.mlp.act_fn\n", - "layers.10.input_layernorm\n", - "layers.10.post_attention_layernorm\n", - "layers.11\n", - "layers.11.self_attn\n", - "layers.11.self_attn.q_proj\n", - "layers.11.self_attn.k_proj\n", - "layers.11.self_attn.v_proj\n", - "layers.11.self_attn.o_proj\n", - "layers.11.self_attn.rotary_emb\n", - "layers.11.mlp\n", - "layers.11.mlp.gate_proj\n", - "layers.11.mlp.up_proj\n", - "layers.11.mlp.down_proj\n", - "layers.11.mlp.down_proj.base_layer\n", - "layers.11.mlp.down_proj.lora_dropout\n", - "layers.11.mlp.down_proj.lora_dropout.default\n", - "layers.11.mlp.down_proj.lora_A\n", - "layers.11.mlp.down_proj.lora_A.default\n", - "layers.11.mlp.down_proj.lora_B\n", - "layers.11.mlp.down_proj.lora_B.default\n", - "layers.11.mlp.down_proj.lora_embedding_A\n", - "layers.11.mlp.down_proj.lora_embedding_B\n", - "layers.11.mlp.act_fn\n", - "layers.11.input_layernorm\n", - "layers.11.post_attention_layernorm\n", - "norm\n" + "{'down_proj'}\n" ] } ], "source": [ - "named_modules = [name.replace(\"base_model.model.model.\", \"\") for name, _ in model.named_modules() if \"base_model.model.model.\" in name]\n", - "for x in named_modules:\n", - " print(x)" + "print(model.peft_config['default'].target_modules)" ] }, { @@ -496,7 +570,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -519,7 +593,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 46, "metadata": {}, "outputs": [ { @@ -535,20 +609,170 @@ "Ok!\n", "Ok!\n", "Ok!\n", - "/usr/FlexFlow/tests/peft/hf_peft_tensors/layers.0.mlp.down_proj.lora_A.default.weight True\n", - "/usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers_0_feed_forward_w2_lora_shard_0_weight_A False\n" - ] - }, - { - "ename": "AssertionError", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[20], line 40\u001b[0m\n\u001b[1;32m 38\u001b[0m hf_lora_A_weight_fp \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/layers.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mi\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.mlp.down_proj.lora_A.default.weight\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 39\u001b[0m ff_lora_A_weight_fp 
\u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mff_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/fwd_step_0_layers_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mi\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_layers_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mi\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_feed_forward_w2_lora_shard_0_weight_A\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 40\u001b[0m \u001b[43mcompare_tensors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhf_lora_A_weight_fp\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mff_lora_A_weight_fp\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 41\u001b[0m hf_lora_B_weight_fp \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/layers.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mi\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.mlp.down_proj.lora_B.default.weight\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 42\u001b[0m ff_lora_B_weight_fp \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mff_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/fwd_step_0_layers_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mi\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_layers_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mi\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_feed_forward_w2_lora_shard_0_weight_B\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", - "File \u001b[0;32m/usr/FlexFlow/tests/peft/alignment/align_test_utils.py:24\u001b[0m, in \u001b[0;36mcompare_tensors\u001b[0;34m(hf_tensor_filepath, ff_tensor_filepath, tolerance)\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28mprint\u001b[39m(hf_tensor_filepath, os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mexists(hf_tensor_filepath))\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28mprint\u001b[39m(ff_tensor_filepath, os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mexists(ff_tensor_filepath))\n\u001b[0;32m---> 24\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 25\u001b[0m hf_tensor \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mload(hf_tensor_filepath)\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mtype\u001b[39m(hf_tensor) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mtuple\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mtype\u001b[39m(hf_tensor) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mlist\u001b[39m:\n", - "\u001b[0;31mAssertionError\u001b[0m: " + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", 
+ "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_layers.10.input_layernorm.input_0 and /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.input_layernorm_shard_0_output_0\n", + "HF: [ 0. 0. 0. ... 0.06630182 6.3429456\n", + " -0.21220279]\n", + "FF:[ 0. 0. 0. ... 0.06630275 6.34293985\n", + " -0.21219885]\n", + "[ True True True ... True True True]\n", + "[15889]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_layers.11.input_layernorm.input_0 and /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.input_layernorm_shard_0_output_0\n", + "HF: [ 0. 0. 0. ... 0.14172177 9.79423\n", + " -6.2940273 ]\n", + "FF:[ 0. 0. 0. ... 0.14172006 9.79421902\n", + " -6.29402065]\n", + "[ True True True ... True True True]\n", + "[ 2878 3206 3367 3607 5183 5346 6257 6544 7466 7679 7805 8119\n", + " 8159 8911 9450 9897 13696 13938 14058 14599 15126 15839 16128 16195]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" ] } ], @@ -557,31 +781,33 @@ "for i in range(tot_num_layers):\n", " hf_input_ln_in = f\"{hf_path}/fwd_step_0_layers.{i}.input_layernorm.input_0\"\n", " ff_input_ln_in = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_input_0\"\n", - " compare_tensors(hf_input_ln_in, ff_input_ln_in)\n", + " if i > 0:\n", + " ff_input_ln_in = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_output_0\"\n", + " compare_tensors(hf_input_ln_in, ff_input_ln_in, tolerance=1e-5)\n", " hf_input_ln_out = f\"{hf_path}/fwd_step_0_layers.{i}.input_layernorm.output_0\"\n", " ff_input_ln_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_output_0\"\n", " if i > 0:\n", - " ff_input_ln_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_attention_norm_shard_0_output_1\"\n", - " compare_tensors(hf_input_ln_out, ff_input_ln_out)\n", + " ff_input_ln_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_output_1\"\n", + " compare_tensors(hf_input_ln_out, ff_input_ln_out, tolerance=1e-5)\n", " hf_attn_out = f\"{hf_path}/fwd_step_0_layers.{i}.self_attn.o_proj.output_0\"\n", " ff_attn_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.self_attn_shard_0_output_0\"\n", - " compare_tensors(hf_attn_out, ff_attn_out)\n", + " compare_tensors(hf_attn_out, ff_attn_out, tolerance=1e-5)\n", " hf_ffn_norm_out = f\"{hf_path}/fwd_step_0_layers.{i}.post_attention_layernorm.output_0\"\n", " ff_ffn_norm_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.post_attention_layernorm_shard_0_output_1\"\n", - " compare_tensors(hf_ffn_norm_out, ff_ffn_norm_out)\n", + " compare_tensors(hf_ffn_norm_out, ff_ffn_norm_out, tolerance=1e-5)\n", " # w1\n", " hf_gate_proj_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.gate_proj.output_0\"\n", " ff_gate_proj_out = 
f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.gate_proj_shard_0_output_0\"\n", - " compare_tensors(hf_gate_proj_out, ff_gate_proj_out)\n", + " compare_tensors(hf_gate_proj_out, ff_gate_proj_out, tolerance=1e-5)\n", " # w3\n", " hf_up_proj_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.up_proj.output_0\" \n", - " ff_up_proj_out = f\"{ff_path}/fwd_step_0_layers_0_layers.0.mlp.up_proj_shard_0_output_0\"\n", - " compare_tensors(hf_up_proj_out, ff_up_proj_out)\n", + " ff_up_proj_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.up_proj_shard_0_output_0\"\n", + " compare_tensors(hf_up_proj_out, ff_up_proj_out, tolerance=1e-5)\n", " # w2\n", " hf_down_proj_in = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.input_0\"\n", " hf_down_proj_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.output_0\"\n", " ff_down_proj_in = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj_shard_0_input_0\"\n", - " ff_down_proj_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj_shard_0_input_0\"\n", + " ff_down_proj_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj_shard_0_output_0\"\n", " compare_tensors(hf_down_proj_in, ff_down_proj_in)\n", " # compare_tensors(hf_down_proj_out, ff_down_proj_out)\n", " # LORA input\n", @@ -591,10 +817,10 @@ " compare_tensors(hf_lora_A_in, ff_lora_A_in)\n", " # LORA weights\n", " hf_lora_A_weight_fp = f\"{hf_path}/layers.{i}.mlp.down_proj.lora_A.default.weight\"\n", - " ff_lora_A_weight_fp = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_weight_A\"\n", + " ff_lora_A_weight_fp = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_weight_A\"\n", " compare_tensors(hf_lora_A_weight_fp, ff_lora_A_weight_fp)\n", " hf_lora_B_weight_fp = f\"{hf_path}/layers.{i}.mlp.down_proj.lora_B.default.weight\"\n", - " ff_lora_B_weight_fp = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_weight_B\"\n", + " ff_lora_B_weight_fp = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_weight_B\"\n", " compare_tensors(hf_lora_B_weight_fp, ff_lora_B_weight_fp)\n", " # LORA intermediate hf\n", " hf_lora_A_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_A.default.output_0\"\n", @@ -602,7 +828,7 @@ " compare_hf_tensors(hf_lora_A_out, hf_lora_B_in)\n", " # LORA output\n", " hf_lora_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_B.default.output_0\"\n", - " ff_lora_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_output_0\"\n", + " ff_lora_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_output_0\"\n", " # compare_tensors(hf_lora_out, ff_lora_out)\n", " # compare_flexflow_tensors(ff_down_proj_out, ff_lora_out)\n", " # compare_tensors(hf_down_proj_out, ff_lora_out)\n", @@ -612,10 +838,10 @@ "# After last layer only\n", "hf_norm_out = f\"{hf_path}/fwd_step_0_norm.output_0\"\n", "ff_norm_out = f\"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_norm_shard_0_output_1\"\n", - "compare_tensors(hf_norm_out, ff_norm_out)\n", + "compare_tensors(hf_norm_out, ff_norm_out, tolerance=1e-5)\n", "hf_lm_head_out = f\"{hf_path}/fwd_step_0_base_model.model.lm_head.output_0\"\n", - "ff_lm_head_out = f\"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_output_shard_0_output_0\"\n", - "compare_tensors(hf_lm_head_out, ff_lm_head_out)" + "ff_lm_head_out = f\"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_lm_head_shard_0_output_0\"\n", + "compare_tensors(hf_lm_head_out, ff_lm_head_out, tolerance=1e-5)" ] }, { 
From 95462392bbc58ee1110c4aa2c8f3b01526e5683a Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 19 May 2024 07:14:26 +0000 Subject: [PATCH 184/198] fix cross entropy loss bug --- src/ops/kernels/softmax.cu | 13 ++++++++----- tests/peft/hf_finetune.py | 18 ------------------ tests/peft/peft_alignment_test.py | 10 +++++----- 3 files changed, 13 insertions(+), 28 deletions(-) diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index c8bc242af0..16f1219bf6 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -295,9 +295,11 @@ __global__ void sparse_categorical_crossentropy_loss_peft_backward( int num_tokens, int num_classes) { CUDA_KERNEL_LOOP(i, num_tokens * num_classes) { + int class_idx = i % num_classes; + int token_idx = i / num_classes; input_grad[i] = output_grad[i]; - if (i % num_classes == token_ids[i / num_classes]) { - input_grad[i] -= 1.0f; + if (class_idx == token_ids[token_idx]) { + input_grad[i] = input_grad[i] - (DT)1.0f; } } } @@ -320,9 +322,10 @@ void peft_bwd_kernel(SoftmaxMeta const *m, tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; continue; } - int num_bwd_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int num_bwd_tokens = bc->requestsInfo[i].num_tokens_in_batch - 1; + // shift labels by 1 position to the left (ignore first token label) for (int j = 0; j < num_bwd_tokens; j++) { - token_ids[j] = bc->labelsInfo[j + tokens_previous_requests].token_id; + token_ids[j] = bc->tokensInfo[j + tokens_previous_requests + 1].token_id; } DT scale_factor = 1.0 / (bc->requestsInfo[i].num_tokens_in_batch - 1); @@ -359,7 +362,7 @@ void peft_bwd_kernel(SoftmaxMeta const *m, DT(0.0), scale_factor); - tokens_previous_requests += num_bwd_tokens; + tokens_previous_requests += num_bwd_tokens + 1; } assert(tokens_previous_requests == bc->num_active_tokens()); } diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py index cccb7cf11c..aef2bdb524 100644 --- a/tests/peft/hf_finetune.py +++ b/tests/peft/hf_finetune.py @@ -44,21 +44,6 @@ def print_trainable_parameters(model): ) -def lm_head_pre_backward_hook(module, grad_output): - # Fill grad input tensor with 0.5 to align other layers without having to align loss - assert len(grad_output) == 1 - assert "lm_head" in module.name - name = module.name.replace("base_model.model.model.", "") - print( - f"PRE-Backward Hook activated for module: {name}, bwd step: {module.bwd_step}" - ) - print(grad_output[0].shape) - dev = grad_output[0].device - new_grad_output = torch.full(grad_output[0].shape, 0.5).to(dev) - assert new_grad_output.shape == grad_output[0].shape - return (new_grad_output,) - - def peft_backward_hook(module, grad_input, grad_output): assert(type(grad_input) == tuple and type(grad_output) == tuple) if len(grad_input) == 0 or len(grad_output) == 0: @@ -247,9 +232,6 @@ def main(): print(f"Adding hooks to layer {layer.name}") layer.register_forward_hook(peft_forward_hook) layer.register_full_backward_hook(peft_backward_hook) - # TODO: remove hard-coding of lm head grad input after aligning the loss - if "lm_head" in name: - layer.register_full_backward_pre_hook(lm_head_pre_backward_hook) # Save any weights of interest for name, params in model.named_parameters(): simplified_name = name.replace("base_model.model.model.", "") diff --git a/tests/peft/peft_alignment_test.py b/tests/peft/peft_alignment_test.py index f07c65140b..c93fc0e0b0 100644 --- a/tests/peft/peft_alignment_test.py +++ b/tests/peft/peft_alignment_test.py @@ -122,14 +122,14 @@ def 
check_bwd_pass(tot_num_layers = 12): # ff_BWD_softmax_in = f"{ff_path}/model_0_bwd-step_0_layer-num_100_layer-name_Softmax_shard-id_0_input_0" print("-- LM head --") hf_BWD_lm_head_out = f"{hf_path}/bwd_step_0_base_model.model.lm_head.go_0" - ff_BWD_lm_head_out = f"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_output_shard_0_output_0" + ff_BWD_lm_head_out = f"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_lm_head_shard_0_output_0" compare_tensors(hf_BWD_lm_head_out, ff_BWD_lm_head_out, tolerance=1e-5) # compare weights - hf_lm_head_weight = f"{hf_path}/base_model.model.lm_head.weight" - ff_lm_head_weight = f"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_output_shard_0_weight_0" - compare_tensors(hf_lm_head_weight, ff_lm_head_weight, tolerance=1e-5) + # hf_lm_head_weight = f"{hf_path}/base_model.model.lm_head.weight" + # ff_lm_head_weight = f"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_output_shard_0_weight_0" + # compare_tensors(hf_lm_head_weight, ff_lm_head_weight, tolerance=1e-5) hf_BWD_lm_head_in = f"{hf_path}/bwd_step_0_base_model.model.lm_head.gi_0" - ff_BWD_lm_head_in = f"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_output_shard_0_input_0" + ff_BWD_lm_head_in = f"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_lm_head_shard_0_input_0" compare_tensors(hf_BWD_lm_head_in, ff_BWD_lm_head_in, tolerance=1e-5) # # Manually check the matmul # ff_tensor_out = np.loadtxt(ff_BWD_lm_head_out, delimiter=',') From ff4b703f5b37bd3932227b5e4df35d051d9810ff Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 19 May 2024 20:03:47 +0000 Subject: [PATCH 185/198] update alignment test --- tests/peft/peft_alignment_test.py | 114 ++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) diff --git a/tests/peft/peft_alignment_test.py b/tests/peft/peft_alignment_test.py index c93fc0e0b0..2d4a7cb353 100644 --- a/tests/peft/peft_alignment_test.py +++ b/tests/peft/peft_alignment_test.py @@ -152,6 +152,120 @@ def check_bwd_pass(tot_num_layers = 12): ff_BWD_norm_in = f"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_norm_shard_0_input_1" compare_tensors(hf_BWD_norm_in, ff_BWD_norm_in, tolerance=1e-5) + print("-- Transformers blocks --") + for i in range(tot_num_layers-1, -1, -1): + # HuggingFace filepaths + hf_BWD_norm_in = f"{hf_path}/bwd_step_0_norm.gi_0" + hf_BWD_loraB_out = f"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_B.default.go_0" + hf_BWD_loraB_in = f"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_B.default.gi_0" + hf_BWD_loraA_out = f"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_A.default.go_0" + hf_BWD_loraA_in = f"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_A.default.gi_0" + hf_loraA_weight = f"{hf_path}/layers.{i}.mlp.down_proj.lora_A.default.weight" + hf_loraB_weight = f"{hf_path}/layers.{i}.mlp.down_proj.lora_B.default.weight" + hf_BWD_w2_out = f"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.go_0" + hf_BWD_w2_in = f"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.gi_0" + hf_w2_weight = f"{hf_path}/layers.{i}.mlp.down_proj.base_layer.weight" + hf_BWD_w3_out = f"{hf_path}/bwd_step_0_layers.{i}.mlp.up_proj.go_0" + hf_BWD_w3_in = f"{hf_path}/bwd_step_0_layers.{i}.mlp.up_proj.gi_0" + hf_BWD_w1_out = f"{hf_path}/bwd_step_0_layers.{i}.mlp.gate_proj.go_0" + hf_BWD_w1_in = f"{hf_path}/bwd_step_0_layers.{i}.mlp.gate_proj.gi_0" + hf_BWD_act_fn_in = f"{hf_path}/bwd_step_0_layers.{i}.mlp.act_fn.gi_0" + hf_BWD_ffn_norm_out = f"{hf_path}/bwd_step_0_layers.{i}.post_attention_layernorm.go_0" + hf_BWD_ffn_norm_in = 
f"{hf_path}/bwd_step_0_layers.{i}.post_attention_layernorm.gi_0" + hf_BWD_attn_out_out = f"{hf_path}/bwd_step_0_layers.{i}.self_attn.o_proj.go_0" + + # FlexFlow filepaths + ff_BWD_w2_out = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.mlp.down_proj_shard_0_output_0" + ff_BWD_w2_in = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.mlp.down_proj_shard_0_input_0" + ff_BWD_w2_in_pre = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.mlp.down_proj_shard_0_pre_input_0" + ff_w2_weight = f"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj_shard_0_weight_0" + ff_BWD_ssm_out = f"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_output_0" + ff_BWD_ssm_in1 = f"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_input_0" + ff_BWD_ssm_in2 = f"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_input_1" + ff_BWD_w3_out = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.mlp.up_proj_shard_0_output_0" + ff_BWD_w3_in = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.mlp.up_proj_shard_0_input_0" + ff_BWD_lora_A_in = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_input_0" + ff_BWD_lora_B_out = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_output_0" + ff_lora_A_weight = f"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_weight_A" + ff_lora_B_weight = f"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_weight_B" + ff_BWD_w1_out = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.mlp.gate_proj_shard_0_output_0" + ff_BWD_w1_in = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.mlp.gate_proj_shard_0_input_0" + ff_BWD_w1_in_pre = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.mlp.gate_proj_shard_0_pre_input_0" + ff_BWD_ffn_norm_in1 = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.post_attention_layernorm_shard_0_input_0" + ff_BWD_ffn_norm_in2 = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.post_attention_layernorm_shard_0_input_1" + ff_BWD_ffn_norm_out = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.post_attention_layernorm_shard_0_output_0" + ff_BWD_attn_out = ff_path + f"/bwd_step_0_layers_{i}_layers.{i}.self_attn_shard_0_output_0" + + # HuggingFace checks + print("\nHuggingface checks:") + if i == tot_num_layers-1: + compare_hf_tensors(hf_BWD_norm_in, hf_BWD_loraB_out) + compare_hf_tensors(hf_BWD_norm_in, hf_BWD_w2_out) + compare_hf_tensors(hf_BWD_loraB_out, hf_BWD_w2_out) + compare_hf_tensors(hf_BWD_loraB_in, hf_BWD_loraA_out) + + compare_hf_tensors(hf_BWD_act_fn_in, hf_BWD_w1_out) + check_hf_sum_tensors(hf_BWD_ffn_norm_out, hf_BWD_w1_in, hf_BWD_w3_in) + if i == tot_num_layers-1: + check_hf_sum_tensors(hf_BWD_attn_out_out, hf_BWD_ffn_norm_in, hf_BWD_norm_in) + + # FlexFlow checks + print("\nFlexFlow checks:") + compare_flexflow_tensors(ff_BWD_w2_out, ff_BWD_lora_B_out) + compare_flexflow_tensors(ff_BWD_w2_in_pre, ff_BWD_lora_A_in) + compare_flexflow_tensors(ff_BWD_w2_in, ff_BWD_ssm_out) + compare_flexflow_tensors(ff_BWD_ssm_in2, ff_BWD_w3_out) + compare_flexflow_tensors(ff_BWD_ssm_in1, ff_BWD_w1_out) + # compare_flexflow_tensors(ff_BWD_w1_in, ff_BWD_ffn_norm_out) + # compare_flexflow_tensors(ff_BWD_w1_in_pre, ff_BWD_w3_in) + # compare_flexflow_tensors(ff_BWD_ffn_norm_in1, ff_BWD_ffn_norm_in2, max_len=24*768) + + # HF-FlexFlow checks + print("\nHuggingface-FlexFlow checks:") + print("-- W2 --") + compare_tensors(hf_BWD_w2_out, ff_BWD_w2_out, tolerance=1e-5) + compare_tensors(hf_w2_weight, ff_w2_weight, tolerance=1e-5) + + print("-- Lora --") + compare_tensors(hf_loraA_weight, ff_lora_A_weight, tolerance=1e-5) + 
compare_tensors(hf_loraB_weight, ff_lora_B_weight, tolerance=1e-5) + + compare_tensors(hf_BWD_loraB_out, ff_BWD_lora_B_out) + compare_tensors(hf_BWD_loraA_in, ff_BWD_lora_A_in) + + print("-- W2/W1/W3 --") + compare_tensors(hf_BWD_w2_in, ff_BWD_ssm_out) + compare_tensors(hf_BWD_w2_in, ff_BWD_w2_in) + compare_tensors(hf_BWD_w1_out, ff_BWD_w1_out) + compare_tensors_difference(hf_BWD_w1_in, ff_BWD_w1_in, ff_BWD_w1_in_pre) + compare_tensors(hf_BWD_w3_out, ff_BWD_w3_out) + compare_tensors(hf_BWD_w3_in, ff_BWD_w3_in) + compare_tensors(hf_BWD_w1_out, ff_BWD_w1_out) + + print("-- Attention --") + compare_tensors(hf_BWD_attn_out_out, ff_BWD_attn_out) + num_tokens = 24 + + hf_attn_in = f"{hf_path}/bwd_step_0_layers.{i}.input_layernorm.go_0" + hf_attn_in = torch.load(hf_attn_in) + hf_attn_in = hf_attn_in.squeeze().T + hf_attn_in = hf_attn_in.detach().cpu().numpy() + print("hf_attn_in: ", hf_attn_in.shape) + print(hf_attn_in) + + ff_attn_in = f"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_attn_final_grad_in" + ff_attn_in = np.loadtxt(ff_attn_in, delimiter=',').reshape((768,num_tokens), order = 'F') + print("ff_attn_in: ", ff_attn_in.shape) + print(ff_attn_in) + #assert(np.allclose(ff_attn_in, hf_attn_in, atol=1e-2)) + + mismatches = np.where(~np.isclose(ff_attn_in, hf_attn_in)) + mismatches = [(mismatches[0][i], mismatches[1][i]) for i in range(len(mismatches[0]))] + pct_mismatch = len(mismatches) / (hf_attn_in.shape[0] * hf_attn_in.shape[1]) + print(f"{pct_mismatch*100}% mismatch in attention input grads") + assert(pct_mismatch <= 0.1) + + if __name__ == "__main__": check_weights_alignment() check_fwd_pass() From b613666dc8e5603fb534fd8bf7897b7f010fa2b5 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 20 May 2024 01:21:36 +0000 Subject: [PATCH 186/198] update test --- tests/peft/alignment/align_test_utils.py | 155 ++++++++++++++++++++--- tests/peft/peft_alignment_test.py | 97 +++++++++++--- 2 files changed, 218 insertions(+), 34 deletions(-) diff --git a/tests/peft/alignment/align_test_utils.py b/tests/peft/alignment/align_test_utils.py index 24da900fcb..4d202a3cc5 100644 --- a/tests/peft/alignment/align_test_utils.py +++ b/tests/peft/alignment/align_test_utils.py @@ -1,5 +1,6 @@ import os, re, torch import numpy as np +from typing import List abs_dirname = os.path.dirname(os.path.abspath(__file__)) hf_path = os.path.join(os.path.dirname(abs_dirname), "hf_peft_tensors") ff_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(abs_dirname))), "build", "inference_tensors") @@ -17,7 +18,18 @@ def print_unique_files_list(dirname): if layer_num > 0 and layer_num != 100: files_list.remove(f) return sorted(files_list) -def compare_tensors(hf_tensor_filepath, ff_tensor_filepath, tolerance=1e-2): +def compare_tensors(hf_tensor_filepath: str, ff_tensor_filepath: str, tolerance=1e-2): + """Check whether a HuggingFace tensor and a FlexFlow tensor are equal + + Args: + hf_tensor_filepath (str): The file path of the HuggingFace tensor + ff_tensor_filepath (str): The file path of the FlexFlow tensor + tolerance (float, optional): Floating-point error tolerance for the checks. Defaults to 1e-2. 
+
+    Raises:
+        FileNotFoundError: If the HuggingFace tensor file does not exist
+        FileNotFoundError: If the FlexFlow tensor file does not exist
+    """
     if not os.path.exists(hf_tensor_filepath):
         raise FileNotFoundError(f"HF tensor file: {hf_tensor_filepath} not found")
     if not os.path.exists(ff_tensor_filepath):
@@ -46,7 +58,15 @@ def compare_tensors(hf_tensor_filepath, ff_tensor_filepath, tolerance=1e-2):
     #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))
     assert(len(mismatches) <= .05*len_hf_tensor)
     print("Ok!")
-def compare_tensors_difference(hf_tensor_filepath, ff_tensor1_filepath, ff_tensor2_filepath, tolerance=1e-2):
+def compare_tensors_difference(hf_tensor_filepath: str, ff_tensor1_filepath: str, ff_tensor2_filepath: str, tolerance: float = 1e-2):
+    """Check whether a HuggingFace tensor is equal to the difference between two FlexFlow tensors
+
+    Args:
+        hf_tensor_filepath (str): The file path of the HuggingFace tensor
+        ff_tensor1_filepath (str): The file path of the first FlexFlow tensor
+        ff_tensor2_filepath (str): The file path of the second FlexFlow tensor
+        tolerance (float, optional): The floating-point error tolerance for the equality check. Defaults to 1e-2.
+    """
     assert(os.path.exists(hf_tensor_filepath))
     assert(os.path.exists(ff_tensor1_filepath))
     assert(os.path.exists(ff_tensor2_filepath))
@@ -77,8 +97,17 @@ def compare_tensors_difference(hf_tensor_filepath, ff_tensor1_filepath, ff_tenso
     #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))
     assert(len(mismatches) <= .05*len_hf_tensor)
     print("Ok!")
-def compare_hf_tensors(tensor1_fp, tensor2_fp):
-    assert(os.path.exists(tensor1_fp) and os.path.exists(tensor2_fp))
+def compare_hf_tensors(tensor1_fp: str, tensor2_fp: str):
+    """Checks whether two HuggingFace tensors are equal
+
+    Args:
+        tensor1_fp (str): The file path of the first tensor
+        tensor2_fp (str): The file path of the second tensor
+    """
+    if not os.path.exists(tensor1_fp):
+        raise FileNotFoundError(f"HF tensor file: {tensor1_fp} not found")
+    if not os.path.exists(tensor2_fp):
+        raise FileNotFoundError(f"HF tensor file {tensor2_fp} not found")
     hf_tensor1 = torch.load(tensor1_fp)
     hf_tensor2 = torch.load(tensor2_fp)
     if type(hf_tensor1) == tuple or type(hf_tensor1) == list:
@@ -100,8 +129,20 @@ def compare_hf_tensors(tensor1_fp, tensor2_fp):
         assert(False)
     print("Ok!")
 
-def check_hf_sum_tensors(tensor_sum_fp, tensor1_fp, tensor2_fp):
-    assert(os.path.exists(tensor_sum_fp) and os.path.exists(tensor1_fp) and os.path.exists(tensor2_fp))
+def check_hf_sum_tensors(tensor_sum_fp: str, tensor1_fp: str, tensor2_fp: str):
+    """Checks whether a HuggingFace tensor is equal to the sum of two other HuggingFace tensors
+
+    Args:
+        tensor_sum_fp (str): The file path of the sum tensor
+        tensor1_fp (str): The file path of the first tensor
+        tensor2_fp (str): The file path of the second tensor
+    """
+    if not os.path.exists(tensor_sum_fp):
+        raise FileNotFoundError(f"HF tensor file: {tensor_sum_fp} not found")
+    if not os.path.exists(tensor1_fp):
+        raise FileNotFoundError(f"HF tensor file {tensor1_fp} not found")
+    if not os.path.exists(tensor2_fp):
+        raise FileNotFoundError(f"HF tensor file {tensor2_fp} not found")
     hf_tensor_sum = torch.load(tensor_sum_fp)
     hf_tensor1 = torch.load(tensor1_fp)
     hf_tensor2 = torch.load(tensor2_fp)
@@ -131,14 +172,27 @@ def check_hf_sum_tensors(tensor_sum_fp, tensor1_fp, tensor2_fp):
     print(mismatches)
     assert(False)
     print("Ok!")
-def check_hf_zero_tensor(hf_tensor_fp):
-    assert(os.path.exists(hf_tensor_fp))
+def check_hf_zero_tensor(hf_tensor_fp: str):
+    """Check whether a HuggingFace tensor is a zero tensor
+
+    Args:
+        hf_tensor_fp (str): The file path of the HuggingFace tensor
+    """
+    if not os.path.exists(hf_tensor_fp):
+        raise FileNotFoundError(f"HF tensor file: {hf_tensor_fp} not found")
     hf_tensor1 = torch.load(hf_tensor_fp)
     if type(hf_tensor1) == tuple or type(hf_tensor1) == list:
         assert(len(hf_tensor1) == 1)
         hf_tensor1 = hf_tensor1[0]
     assert(torch.count_nonzero(torch.nan_to_num(hf_tensor1)).sum() == 0)
-def print_tensors(hf_tensor_filepath, ff_tensor_filepath, txt=""):
+def print_tensors(hf_tensor_filepath: str, ff_tensor_filepath: str, txt: str = ""):
+    """Print the contents of a HuggingFace tensor and a FlexFlow tensor
+
+    Args:
+        hf_tensor_filepath (str): The file path of the HuggingFace tensor
+        ff_tensor_filepath (str): The file path of the FlexFlow tensor
+        txt (str, optional): Additional text to prepend to the tensors. Defaults to "".
+    """
     assert(os.path.exists(hf_tensor_filepath) and os.path.exists(ff_tensor_filepath))
     hf_tensor = torch.load(hf_tensor_filepath)
     if type(hf_tensor) == tuple or type(hf_tensor) == list:
@@ -155,7 +209,23 @@ def print_tensors(hf_tensor_filepath, ff_tensor_filepath, txt=""):
     print(hf_tensor)
     print(f"{txt} - FF tensor: ")
     print(ff_tensor)
-def compare_flexflow_tensors(ff_tensor1_fp, ff_tensor2_fp, tolerance=1e-5, max_len=-1):
+def compare_flexflow_tensors(ff_tensor1_fp: str, ff_tensor2_fp: str, tolerance: float = 1e-5, max_len: int = -1):
+    """Check whether two FlexFlow tensors are equal
+
+    Args:
+        ff_tensor1_fp (str): The file path of the first FlexFlow tensor
+        ff_tensor2_fp (str): The file path of the second FlexFlow tensor
+        tolerance (float, optional): Floating-point error tolerance for the check. Defaults to 1e-5.
+        max_len (int, optional): Maximum number of elements to check (if > 0). Defaults to -1.
+
+    Raises:
+        FileNotFoundError: If the first FlexFlow tensor file does not exist
+        FileNotFoundError: If the second FlexFlow tensor file does not exist
+    """
+    if not os.path.exists(ff_tensor1_fp):
+        raise FileNotFoundError(f"FF tensor file: {ff_tensor1_fp} not found")
+    if not os.path.exists(ff_tensor2_fp):
+        raise FileNotFoundError(f"FF tensor file {ff_tensor2_fp} not found")
     assert(os.path.exists(ff_tensor1_fp) and os.path.exists(ff_tensor2_fp))
     ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=',')
     ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=',')
@@ -178,8 +248,22 @@ def compare_flexflow_tensors(ff_tensor1_fp, ff_tensor2_fp, tolerance=1e-5, max_l
     #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))
     assert(len(mismatches) <= .05*len(ff_tensor1))
     print("Ok!")
-def compare_flexflow_tensors_shortest(ff_tensor1_fp, ff_tensor2_fp, tolerance=1e-5):
-    assert(os.path.exists(ff_tensor1_fp) and os.path.exists(ff_tensor2_fp))
+def compare_flexflow_tensors_shortest(ff_tensor1_fp: str, ff_tensor2_fp: str, tolerance: float = 1e-5):
+    """Compare two FlexFlow tensors up to the maximum length of the shortest tensor
+
+    Args:
+        ff_tensor1_fp (str): The file path of the first FlexFlow tensor
+        ff_tensor2_fp (str): The file path of the second FlexFlow tensor
+        tolerance (float, optional): Floating-point error tolerance for the check. Defaults to 1e-5.
+
+    Raises:
+        FileNotFoundError: If the first FlexFlow tensor file does not exist
+        FileNotFoundError: If the second FlexFlow tensor file does not exist
+    """
+    if not os.path.exists(ff_tensor1_fp):
+        raise FileNotFoundError(f"FF tensor file: {ff_tensor1_fp} not found")
+    if not os.path.exists(ff_tensor2_fp):
+        raise FileNotFoundError(f"FF tensor file {ff_tensor2_fp} not found")
     ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=',')
     ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=',')
     minlen = min(ff_tensor1.shape[0], ff_tensor2.shape[0])
@@ -195,8 +279,23 @@ def compare_flexflow_tensors_shortest(ff_tensor1_fp, ff_tensor2_fp, tolerance=1e
     #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))
     assert(len(mismatches) <= .05*len(ff_tensor1))
     print("Ok!")
-def check_flexflow_tensors_sum(ff_tensor_sum_fp, ff_tensor1_fp, ff_tensor2_fp, tolerance=1e-5):
-    assert(os.path.exists(ff_tensor1_fp) and os.path.exists(ff_tensor2_fp))
+def check_flexflow_tensors_sum(ff_tensor_sum_fp: str, ff_tensor1_fp: str, ff_tensor2_fp: str, tolerance=1e-5):
+    """Check whether a FlexFlow tensor is equal to the sum of two other FlexFlow tensors
+
+    Args:
+        ff_tensor_sum_fp (str): The file path of the FlexFlow sum tensor
+        ff_tensor1_fp (str): The file path of the first FlexFlow tensor
+        ff_tensor2_fp (str): The file path of the second FlexFlow tensor
+        tolerance (float, optional): Floating-point error tolerance for the check. Defaults to 1e-5.
+
+    Raises:
+        FileNotFoundError: If the first FlexFlow tensor file does not exist
+        FileNotFoundError: If the second FlexFlow tensor file does not exist
+    """
+    if not os.path.exists(ff_tensor1_fp):
+        raise FileNotFoundError(f"FF tensor file: {ff_tensor1_fp} not found")
+    if not os.path.exists(ff_tensor2_fp):
+        raise FileNotFoundError(f"FF tensor file {ff_tensor2_fp} not found")
     ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=',')
     ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=',')
     ff_tensor_sum = np.loadtxt(ff_tensor_sum_fp, delimiter=',')
@@ -215,18 +314,42 @@ def check_flexflow_tensors_sum(ff_tensor_sum_fp, ff_tensor1_fp, ff_tensor2_fp, t
     #assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))
     assert(len(mismatches) <= .05*len(ff_tensor1))
     print("Ok!")
-def load_ff_tensor(filename, shape):
+def load_ff_tensor(filename: str, shape: List[int]):
+    """Load a FlexFlow tensor from a file as a numpy array
+
+    Args:
+        filename (str): The file path of the FF tensor
+        shape (List[int]): The shape of the FF tensor
+
+    Returns:
+        np.ndarray: The FF tensor as a numpy array
+    """
     if ff_path not in filename:
         filename = os.path.join(ff_path, filename)
     ff_tensor = np.loadtxt(filename, delimiter=',').reshape(shape, order = 'F')
     return ff_tensor
-def load_hf_tensor(filename):
+def load_hf_tensor(filename: str):
+    """Load a HuggingFace tensor from a file as a numpy array
+
+    Args:
+        filename (str): The file path of the HF tensor
+
+    Returns:
+        np.ndarray: The HF tensor as a numpy array
+    """
     if hf_path not in filename:
         filename = os.path.join(hf_path, filename)
     hf_tensor = torch.load(filename)
     hf_tensor = hf_tensor.detach().cpu().numpy()
     return hf_tensor
 def compare_loaded_tensors(hf_tensor, ff_tensor, tolerance=1e-2):
+    """Check whether a HuggingFace tensor and a FlexFlow tensor, both already loaded into memory as numpy arrays, are equal
+
+    Args:
+        hf_tensor (np.ndarray): The HuggingFace tensor (in numpy array form)
+        ff_tensor (np.ndarray): The FlexFlow tensor (in numpy array form)
+        tolerance (float, optional): The floating-point error tolerance for the check. Defaults to 1e-2.
+ """ assert(hf_tensor.shape == ff_tensor.shape) mismatches = [] if not np.allclose(hf_tensor, ff_tensor, atol=tolerance): diff --git a/tests/peft/peft_alignment_test.py b/tests/peft/peft_alignment_test.py index 2d4a7cb353..c75f6f7d3f 100644 --- a/tests/peft/peft_alignment_test.py +++ b/tests/peft/peft_alignment_test.py @@ -173,6 +173,15 @@ def check_bwd_pass(tot_num_layers = 12): hf_BWD_ffn_norm_out = f"{hf_path}/bwd_step_0_layers.{i}.post_attention_layernorm.go_0" hf_BWD_ffn_norm_in = f"{hf_path}/bwd_step_0_layers.{i}.post_attention_layernorm.gi_0" hf_BWD_attn_out_out = f"{hf_path}/bwd_step_0_layers.{i}.self_attn.o_proj.go_0" + hf_BWD_attn_q_in = f"{hf_path}/bwd_step_0_layers.11.self_attn.q_proj.gi_0" + hf_FWD_w1_out = f"{hf_path}/fwd_step_0_layers.{i}.mlp.gate_proj.output_0" + hf_FWD_w3_out = f"{hf_path}/fwd_step_0_layers.{i}.mlp.up_proj.output_0" + hf_FWD_act_fn_out = f"{hf_path}/fwd_step_0_layers.{i}.mlp.act_fn.output_0" + hf_BWD_attn_oproj_in = f"{hf_path}/bwd_step_0_layers.{i}.self_attn.o_proj.gi_0" + hf_attn_qproj_weight = f"{hf_path}/layers.{i}.self_attn.q_proj.weight" + hf_attn_kproj_weight = f"{hf_path}/layers.{i}.self_attn.k_proj.weight" + hf_attn_vproj_weight = f"{hf_path}/layers.{i}.self_attn.v_proj.weight" + hf_attn_oproj_weight = f"{hf_path}/layers.{i}.self_attn.o_proj.weight" # FlexFlow filepaths ff_BWD_w2_out = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.mlp.down_proj_shard_0_output_0" @@ -195,7 +204,9 @@ def check_bwd_pass(tot_num_layers = 12): ff_BWD_ffn_norm_in2 = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.post_attention_layernorm_shard_0_input_1" ff_BWD_ffn_norm_out = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.post_attention_layernorm_shard_0_output_0" ff_BWD_attn_out = ff_path + f"/bwd_step_0_layers_{i}_layers.{i}.self_attn_shard_0_output_0" - + ff_BWD_attn_o_proj_in = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.self_attn_shard_0_o_proj_in_grad" + ff_attn_oproj_weight = f"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.self_attn_shard_0_weight_0" + # HuggingFace checks print("\nHuggingface checks:") if i == tot_num_layers-1: @@ -217,7 +228,7 @@ def check_bwd_pass(tot_num_layers = 12): compare_flexflow_tensors(ff_BWD_ssm_in2, ff_BWD_w3_out) compare_flexflow_tensors(ff_BWD_ssm_in1, ff_BWD_w1_out) # compare_flexflow_tensors(ff_BWD_w1_in, ff_BWD_ffn_norm_out) - # compare_flexflow_tensors(ff_BWD_w1_in_pre, ff_BWD_w3_in) + compare_flexflow_tensors(ff_BWD_w1_in_pre, ff_BWD_w3_in) # compare_flexflow_tensors(ff_BWD_ffn_norm_in1, ff_BWD_ffn_norm_in2, max_len=24*768) # HF-FlexFlow checks @@ -243,27 +254,77 @@ def check_bwd_pass(tot_num_layers = 12): compare_tensors(hf_BWD_w1_out, ff_BWD_w1_out) print("-- Attention --") - compare_tensors(hf_BWD_attn_out_out, ff_BWD_attn_out) num_tokens = 24 + hidden_size = 768 + qProjSize = 64 + num_heads = 12 + # Check output + compare_tensors(hf_BWD_attn_out_out, ff_BWD_attn_out) + + # Check weights + ff_attn_weight_tensor = np.loadtxt(ff_attn_oproj_weight, delimiter=',') + ff_attn_qproj_weight_tensor = ff_attn_weight_tensor[:hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F') + ff_attn_kproj_weight_tensor = ff_attn_weight_tensor[hidden_size*qProjSize*num_heads:2*hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F') + ff_attn_vproj_weight_tensor = ff_attn_weight_tensor[2*hidden_size*qProjSize*num_heads:3*hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F') + ff_attn_oproj_weight_tensor = 
ff_attn_weight_tensor[3*hidden_size*qProjSize*num_heads:].reshape((qProjSize*num_heads,hidden_size), order='F') + + hf_attn_qproj_weight_tensor = torch.load(hf_attn_qproj_weight).T.detach().cpu().numpy() + hf_attn_kproj_weight_tensor = torch.load(hf_attn_kproj_weight).T.detach().cpu().numpy() + hf_attn_vproj_weight_tensor = torch.load(hf_attn_vproj_weight).T.detach().cpu().numpy() + hf_attn_oproj_weight_tensor = torch.load(hf_attn_oproj_weight).T.detach().cpu().numpy() + + assert(np.allclose(ff_attn_qproj_weight_tensor, hf_attn_qproj_weight_tensor, atol=1e-5)) + assert(np.allclose(ff_attn_kproj_weight_tensor, hf_attn_kproj_weight_tensor, atol=1e-5)) + assert(np.allclose(ff_attn_vproj_weight_tensor, hf_attn_vproj_weight_tensor, atol=1e-5)) + assert(np.allclose(ff_attn_oproj_weight_tensor, hf_attn_oproj_weight_tensor, atol=1e-5)) + # Compare attn outproj grad in tensors + compare_tensors(hf_BWD_attn_oproj_in, ff_BWD_attn_o_proj_in) + + # Compare vproj grads + hf_vproj_grads = f"{hf_path}/bwd_step_0_layers.{i}.self_attn.v_proj.go_0" + ff_vproj_grads = ff_path + f"/bwd_step_0_layers_{i}_layers.{i}.self_attn_shard_0_v_proj_in_grad" + hf_vproj_grads = torch.load(hf_vproj_grads).squeeze().detach().cpu().numpy() + ff_vproj_grads = np.loadtxt(ff_vproj_grads, delimiter=',').reshape((num_tokens, qProjSize*num_heads), order='F') + compare_loaded_tensors(hf_vproj_grads, ff_vproj_grads) + + # Compare kproj grads + ff_kproj = ff_path + f"/bwd_step_0_layers_{i}_layers.{i}.self_attn_shard_0_devkproj" + ff_kproj = np.loadtxt(ff_kproj, delimiter=',').reshape((num_tokens, qProjSize, num_heads), order = 'F') + hf_kproj_grads = f"{hf_path}/bwd_step_0_layers.{i}.self_attn.k_proj.go_0" + hf_kproj_grads = torch.load(hf_kproj_grads).squeeze() + reshaped_tensor = hf_kproj_grads.view(24, 12, 64).transpose(1, 2).contiguous().detach().cpu().numpy() + assert(np.allclose(ff_kproj, reshaped_tensor, atol=1e-2)) + print("Ok!") + + # Compare qproj grads + hf_qproj_grads = f"{hf_path}/bwd_step_0_layers.{i}.self_attn.q_proj.go_0" + hf_qproj_grads = torch.load(hf_qproj_grads).squeeze() + reshaped_tensor = hf_qproj_grads.view(24, 12, 64).transpose(1, 2).contiguous().detach().cpu().numpy() + ff_qproj = ff_path + f"/bwd_step_0_layers_{i}_layers.{i}.self_attn_shard_0_devQKVPRojArray" + ff_qproj = np.loadtxt(ff_qproj, delimiter=',').reshape((num_tokens, qProjSize, num_heads, 3), order = 'F')[:,:,:,0] + assert(np.allclose(ff_qproj, reshaped_tensor, atol=1e-2)) + print("Ok!") + + # Compare attn grad input hf_attn_in = f"{hf_path}/bwd_step_0_layers.{i}.input_layernorm.go_0" - hf_attn_in = torch.load(hf_attn_in) - hf_attn_in = hf_attn_in.squeeze().T - hf_attn_in = hf_attn_in.detach().cpu().numpy() - print("hf_attn_in: ", hf_attn_in.shape) - print(hf_attn_in) + ff_attn_in = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.self_attn_shard_0_attn_final_grad_in" + compare_tensors(hf_attn_in, ff_attn_in) - ff_attn_in = f"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_attn_final_grad_in" - ff_attn_in = np.loadtxt(ff_attn_in, delimiter=',').reshape((768,num_tokens), order = 'F') - print("ff_attn_in: ", ff_attn_in.shape) - print(ff_attn_in) - #assert(np.allclose(ff_attn_in, hf_attn_in, atol=1e-2)) + # compare input layernorm + print("-- Input LayerNorm --") + if i > 0: + ff_input_ln_out = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_output_1" + ff_attn_operator_in = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.self_attn_shard_0_input_0" + compare_flexflow_tensors(ff_attn_operator_in, ff_input_ln_out) + 
hf_input_ln_in = f"{hf_path}/bwd_step_0_layers.{i}.input_layernorm.gi_0" + ff_input_ln_in0 = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_input_0" + ff_input_ln_in1 = f"{ff_path}/bwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_input_1" + compare_flexflow_tensors(ff_input_ln_in0, ff_input_ln_in1) + if i > 1: + compare_tensors(hf_input_ln_in, ff_input_ln_in0) + - mismatches = np.where(~np.isclose(ff_attn_in, hf_attn_in)) - mismatches = [(mismatches[0][i], mismatches[1][i]) for i in range(len(mismatches[0]))] - pct_mismatch = len(mismatches) / (hf_attn_in.shape[0] * hf_attn_in.shape[1]) - print(f"{pct_mismatch*100}% mismatch in attention input grads") - assert(pct_mismatch <= 0.1) if __name__ == "__main__": From dde0b61d28e3b7f5dd00ab7236e2d12d5cc20c74 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 20 May 2024 01:24:49 +0000 Subject: [PATCH 187/198] add llama peft alignment test to ci --- .github/workflows/gpu-ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index b5260ead05..b78df90a5f 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -177,6 +177,7 @@ jobs: # PEFT tests ./tests/peft_tests.sh + python ./tests/peft/peft_alignment_test.py - name: Save inference output as an artifact if: always() From 1a31b65e5cb8de00c251d3bd53c8858d4a0e71cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Delacourt?= Date: Fri, 24 May 2024 21:41:19 +0000 Subject: [PATCH 188/198] Fix values for unused params in incr_decoding --- python/flexflow/core/flexflow_cffi.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index ec4cacfa6d..fdbab8eb89 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -4406,8 +4406,8 @@ def generate_inf_only(self, prompt_list: List[str], max_sequence_length: int = 1 for prompt in prompt_list ] max_sequence_lengths = [max_sequence_length for prompt in prompt_list] - peft_model_ids = [None for prompt in prompt_list] - dataset_filepaths = [None for prompt in prompt_list] + peft_model_ids = [PEFTModelID().handle for prompt in prompt_list] # Assign Dummy model ids + dataset_filepaths = [ffi.NULL for prompt in prompt_list] training_steps = [0 for prompt in prompt_list] ffc().flexflow_model_generate( self.handle, From 7e3d1111e35f3be7f3a091e1bb00edbfba5195cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Delacourt?= Date: Fri, 24 May 2024 23:30:50 +0000 Subject: [PATCH 189/198] Add PEFTModelID NO_ID singleton instead of None --- include/flexflow/flexflow_c.h | 2 ++ python/flexflow/core/flexflow_cffi.py | 14 ++++++++++++-- src/c/flexflow_c.cc | 6 ++++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index b651b31052..97a382ee8b 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -1068,6 +1068,8 @@ flexflow_peft_model_id_t flexflow_peft_model_id_create(); flexflow_peft_model_id_t flexflow_peft_model_id_create_id(unsigned long id); +flexflow_peft_model_id_t flexflow_peft_model_id_no_id(); + void flexflow_peft_model_id_destroy(flexflow_peft_model_id_t handle_); #ifdef __cplusplus diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index fdbab8eb89..aa414f74d7 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ 
-1766,6 +1766,8 @@ def __init__( class PEFTModelID(object): __slots__ = ["handle", "_handle"] + __no_id_h = None + def __init__(self, id=None): if id is None: self.handle = ffc().flexflow_peft_model_id_create() @@ -1773,6 +1775,11 @@ def __init__(self, id=None): self.handle = ffc().flexflow_peft_model_id_create_id(id) self._handle = ffi.gc(self.handle, ffc().flexflow_peft_model_id_destroy) + @staticmethod + def no_id_handle(): + if PEFTModelID.__no_id_h is None: + PEFTModelID.__no_id_h = ffc().flexflow_peft_model_id_no_id() + return PEFTModelID.__no_id_h # ----------------------------------------------------------------------- # Request @@ -4406,7 +4413,7 @@ def generate_inf_only(self, prompt_list: List[str], max_sequence_length: int = 1 for prompt in prompt_list ] max_sequence_lengths = [max_sequence_length for prompt in prompt_list] - peft_model_ids = [PEFTModelID().handle for prompt in prompt_list] # Assign Dummy model ids + peft_model_ids = [PEFTModelID.no_id_handle() for prompt in prompt_list] dataset_filepaths = [ffi.NULL for prompt in prompt_list] training_steps = [0 for prompt in prompt_list] ffc().flexflow_model_generate( @@ -4451,7 +4458,10 @@ def generate(self, requests_list: List[Request]): max_sequence_lengths = [ request.max_sequence_length for request in requests_list ] - peft_model_ids = [request.peft_model_id for request in requests_list] + peft_model_ids = [ + (request.peft_model_id + if request.peft_model_id is not None else PEFTModelID.no_id_handle()) + for request in requests_list] dataset_filepaths = [ get_c_name(request.dataset_filepath) for request in requests_list ] diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 993d1b6a0d..e5f42c7df8 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -2845,6 +2845,12 @@ flexflow_peft_model_id_t flexflow_peft_model_id_create_id(size_t id) { return FFCObjectWrapper::wrap(handle); } +flexflow_peft_model_id_t flexflow_peft_model_id_no_id() { + PEFTModelID handle = PEFTModelID::NO_ID; + DEBUG_PRINT("[PEFTModelID] new %p", &handle); + return FFCObjectWrapper::wrap(&handle); +} + void flexflow_peft_model_id_destroy(flexflow_peft_model_id_t handle_) { PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(handle_); DEBUG_PRINT("[PEFTModelID] delete %p", peft_model_id); From 079ba5932360158928ad73db5a20e5ea7515c6aa Mon Sep 17 00:00:00 2001 From: Remi <54138269+Flechman@users.noreply.github.com> Date: Fri, 24 May 2024 22:50:46 -0400 Subject: [PATCH 190/198] Fix PEFTModelID::NO_ID reference --- src/c/flexflow_c.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index e5f42c7df8..43fcd55a02 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -2846,9 +2846,9 @@ flexflow_peft_model_id_t flexflow_peft_model_id_create_id(size_t id) { } flexflow_peft_model_id_t flexflow_peft_model_id_no_id() { - PEFTModelID handle = PEFTModelID::NO_ID; - DEBUG_PRINT("[PEFTModelID] new %p", &handle); - return FFCObjectWrapper::wrap(&handle); + PEFTModelID *handle = const_cast(&PEFTModelID::NO_ID); + DEBUG_PRINT("[PEFTModelID] new %p", handle); + return FFCObjectWrapper::wrap(handle); } void flexflow_peft_model_id_destroy(flexflow_peft_model_id_t handle_) { From f464eb8911f2b845ea72ef3bf6e985bac07405e2 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 25 May 2024 20:27:09 +0000 Subject: [PATCH 191/198] reduce logging --- src/runtime/graph.cc | 19 +++++++------- src/runtime/inference_manager.cc | 43 ++++++++++++++++---------------- src/runtime/model.cc | 2 +- 3 
files changed, 33 insertions(+), 31 deletions(-) diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index dae0021bb6..e5b1eb3631 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -2480,6 +2480,7 @@ namespace FlexFlow { using PCG::Edge; using PCG::Graph; using PCG::GraphCostResult; +using PCG::log_graph; using PCG::Node; void FFModel::register_all_machine_views( @@ -3158,20 +3159,20 @@ void FFModel::deserialize_graph_optimal_view( optimal_views[guid_to_nodes[guid]] = view; } assert(dez.get_remaining_bytes() == 0); - printf("Deserialized Views...\n"); + log_graph.debug("Deserialized Views...\n"); for (auto const &it : optimal_views) { - printf("node[%zu]: type(%s) view(%d %d %d) ", - it.first.guid, - it.first.to_string().c_str(), - it.second.ndims, - it.second.dim[0], - it.second.start_device_id); + log_graph.debug("node[%zu]: type(%s) view(%d %d %d) ", + it.first.guid, + it.first.to_string().c_str(), + it.second.ndims, + it.second.dim[0], + it.second.start_device_id); auto const &list = graph->inEdges.at(it.first); for (auto const &it2 : list) { Edge e = it2; - printf(" inEdge(node(%zu) idx(%d))", e.srcOp.guid, e.srcIdx); + log_graph.debug(" inEdge(node(%zu) idx(%d))", e.srcOp.guid, e.srcIdx); } - printf("\n"); + log_graph.debug("\n"); } } diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 212d0ebf6b..cc967b0cfe 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -217,7 +217,7 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { } } if (!found_parallel_tensor) { - log_offload.print( + log_offload.debug( "Cannot find a previous tensor for operator(%d) output_idx(%d)", op_idx, i); @@ -308,34 +308,35 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { if (op->op_type == OP_INPUT || op->op_type == OP_WEIGHT) { continue; } - printf("operator[%zu]: type(%s) guid(%lu)\n", - i, - get_operator_type_name(model->operators[i]->op_type).c_str(), - model->operators[i]->op_guid); + log_inf_mgr.debug( + "operator[%zu]: type(%s) guid(%lu)\n", + i, + get_operator_type_name(model->operators[i]->op_type).c_str(), + model->operators[i]->op_guid); for (int j = 0; j < op->numInputs; j++) { assert(tensor_buffer.find(op->inputs[j]) != tensor_buffer.end()); LogicalRegion handle = tensor_buffer[op->inputs[j]][0]->region; - printf("\tinputs[%d] mapped_region(%d,%d,%d)\n", - j, - handle.get_index_space().get_id(), - handle.get_field_space().get_id(), - handle.get_tree_id()); + log_inf_mgr.debug("\tinputs[%d] mapped_region(%d,%d,%d)\n", + j, + handle.get_index_space().get_id(), + handle.get_field_space().get_id(), + handle.get_tree_id()); } for (int j = 0; j < op->numOutputs; j++) { LogicalRegion handle = tensor_buffer[op->outputs[j]][0]->region; - printf("\toutputs[%d] mapped_region(%d,%d,%d)\n", - j, - handle.get_index_space().get_id(), - handle.get_field_space().get_id(), - handle.get_tree_id()); + log_inf_mgr.debug("\toutputs[%d] mapped_region(%d,%d,%d)\n", + j, + handle.get_index_space().get_id(), + handle.get_field_space().get_id(), + handle.get_tree_id()); } for (int j = 0; j < op->numWeights; j++) { LogicalRegion handle = op->weights[j]->region; - printf("\tweights[%d] mapped_region(%d,%d,%d)\n", - j, - handle.get_index_space().get_id(), - handle.get_field_space().get_id(), - handle.get_tree_id()); + log_inf_mgr.debug("\tweights[%d] mapped_region(%d,%d,%d)\n", + j, + handle.get_index_space().get_id(), + handle.get_field_space().get_id(), + handle.get_tree_id()); } } } @@ 
-665,7 +666,7 @@ void FFModel::compile_inference() { deserialize_graph_optimal_view(dez, best_graph, optimal_views); operators.clear(); convert_graph_to_operators(best_graph, optimal_views); - best_graph->print_dot(); + // best_graph->print_dot(); delete best_graph; for (auto const &layer : layers) { // map inputs to parallel tensor diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 2b6994c7b2..b28d3d7701 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3478,7 +3478,7 @@ void FFModel::compile(LossType loss_type, deserialize_graph_optimal_view(dez, best_graph, optimal_views); operators.clear(); convert_graph_to_operators(best_graph, optimal_views); - best_graph->print_dot(); + // best_graph->print_dot(); delete best_graph; for (auto const &layer : layers) { // map inputs to parallel tensor From 8d89acdeee834e4b70caed7505937c5dbc07a121 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 26 May 2024 22:18:03 +0000 Subject: [PATCH 192/198] fix --- inference/python/spec_infer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inference/python/spec_infer.py b/inference/python/spec_infer.py index e8ef68b240..39529abda3 100644 --- a/inference/python/spec_infer.py +++ b/inference/python/spec_infer.py @@ -79,7 +79,7 @@ def get_configs(): "full_precision": False, } ], - # "prompt": "", + "prompt": "", "output_file": "", } # Merge dictionaries From 33c0fefc9c3b2dbbe4e29158e249a29d9747812c Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 29 May 2024 08:02:52 +0000 Subject: [PATCH 193/198] fix --- python/flexflow/serve/models/starcoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py index 0cbb6d976c..2d4471201f 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -225,7 +225,7 @@ def build_model(self, max_tokens_per_batch): def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): - name = name.replace("transformer.h", "layers").replace("transformer", "") + name = name.replace("transformer.h", "layers").replace("transformer.", "") if "attn.c_attn.weight" in name: name_q = name.replace("attn.c_attn", "attn.c_attn.q_proj") name_k = name.replace("attn.c_attn", "attn.c_attn.k_proj") From 6727d3a8fde6d98aca1c657373f0c311e617c78d Mon Sep 17 00:00:00 2001 From: Remi Delacourt Date: Tue, 11 Jun 2024 20:48:13 +0000 Subject: [PATCH 194/198] Add peft demo --- inference/python/peft_demo/demo.py | 117 ++++++++++++++++++++ inference/python/peft_demo/demo_config.json | 29 +++++ 2 files changed, 146 insertions(+) create mode 100644 inference/python/peft_demo/demo.py create mode 100644 inference/python/peft_demo/demo_config.json diff --git a/inference/python/peft_demo/demo.py b/inference/python/peft_demo/demo.py new file mode 100644 index 0000000000..651d281241 --- /dev/null +++ b/inference/python/peft_demo/demo.py @@ -0,0 +1,117 @@ +import flexflow.serve as ff +import argparse, json, os +from types import SimpleNamespace +from datasets import load_dataset +import random + + +def get_configs(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-config-file", + help="The path to a JSON file with the configs.", + type=str, + default="", + required=True, + ) + args = parser.parse_args() + + # Load configs from JSON file + if not os.path.isfile(args.config_file): + raise FileNotFoundError(f"Config file {args.config_file} not 
found.") + try: + with open(args.config_file) as f: + return json.load(f) + except json.JSONDecodeError as e: + print("JSON format error:") + print(e) + +def init_llm_co_serving(configs_dict, configs): + # Initialize the FlexFlow runtime. ff.init() takes a dictionary or the path to a JSON file with the configs + ff.init(configs_dict) + + # Create the FlexFlow LLM + ff_data_type = ( + ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + ) + llm = ff.LLM( + configs.base_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + output_file=configs.output_file, + ) + + # Add the different PEFT models to finetune + for peft_model_id in configs.peft_model_ids: + llm.add_peft(peft_model_id) + + # Compile the LLM for inference and load the weights into memory + generation_config = ff.GenerationConfig( + do_sample=False, temperature=0.9, topp=0.8, topk=1 + ) + llm.compile( + generation_config, + enable_peft_finetuning = (len(configs.finetuning_dataset) > 0), + max_requests_per_batch=1, + max_seq_length=256, + max_tokens_per_batch=64, + ) + +# Data comes from https://huggingface.co/datasets/databricks/databricks-dolly-15k +def import_dataset(): + inference_percentage = 0.6 + dataset = load_dataset("databricks/databricks-dolly-15k", split="train") + data = [] + for i,row in enumerate(dataset): + if len(row['context']) == 0: + data.append((row['instruction'],row['response'])) + inference_prompts = [] + finetuning_prompts = [] + for d in data: + if random.random() <= inference_percentage: + inference_prompts.append(d[0]) + else: + finetuning_prompts.append(d) + return inference_prompts, finetuning_prompts + + +if __name__ == "__main__": + print("Co-Serving Demo") + # Import config parameters + configs_dict = get_configs() + configs = SimpleNamespace(**configs_dict) + random.seed(configs.seed) + # Import inference dataset + # Import finetuning dataset + inference_prompts, finetuning_prompts = import_dataset() + # Initialize Llama2 lora model + llm = init_llm_co_serving(configs_dict, configs) + llm.start_server() + requests = [] + # Prepare inference requests + inference_requests = [ + ff.Request( + ff.RequestType.REQ_INFERENCE, + prompt=prompt, + max_sequence_length=configs.max_sequence_length + ) + for prompt in inference_prompts + ] + requests += inference_requests + # Prepare finetuning requests + for peft_model_id in configs.peft_model_ids: + finetuning_request = ff.Request( + ff.RequestType.REQ_FINETUNING, + max_sequence_length=configs.max_sequence_length, + peft_model_id=llm.get_ff_peft_id(peft_model_id), + dataset=finetuning_prompts, + ) + requests.append(finetuning_request) + # Jointly serve inference and finetuning requests + llm.generate(requests, max_length=configs.max_sequence_length) + llm.stop_server() + # Show statistics and metrics of the system + ## Show difference in loss on test dataset with finetuned and non-finetuned to prove that it works + ## Show compute resources utilized + other metrics + ## Compare with compute resources utilized without co-serving \ No newline at end of file diff --git a/inference/python/peft_demo/demo_config.json b/inference/python/peft_demo/demo_config.json new file mode 100644 index 0000000000..aca759e681 --- /dev/null +++ b/inference/python/peft_demo/demo_config.json @@ -0,0 +1,29 @@ +{ + "seed": 42, + "num_gpus": 4, + "memory_per_gpu": 14000, + "zero_copy_memory_per_node": 40000, + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + 
"tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 4, + "offload": false, + "offload_reserve_space_size": 8192, + "use_4bit_quantization": false, + "use_8bit_quantization": false, + "enable_peft": true, + "peft_activation_reserve_space_size": 1024, + "peft_weight_reserve_space_size": 1024, + "profiling": false, + "benchmarking": false, + "inference_debugging": false, + "fusion": true, + "base_model": "meta-llama/Llama-2-7b-hf", + "peft_model_ids": ["goliaro/llama-2-7b-lora-full"], + "max_sequence_length": 128, + "cache_path": "", + "refresh_cache": false, + "full_precision": true, + "output_file": "../output/peft.txt" +} \ No newline at end of file From 6d7c245c51037c430328c45b9d9485e0c12f1e6a Mon Sep 17 00:00:00 2001 From: Remi Delacourt Date: Tue, 11 Jun 2024 21:05:45 +0000 Subject: [PATCH 195/198] Add readme for demo --- inference/python/peft_demo/INSTRUCTIONS.md | 25 +++++++++++++++++++++ inference/python/peft_demo/demo_config.json | 2 +- 2 files changed, 26 insertions(+), 1 deletion(-) create mode 100644 inference/python/peft_demo/INSTRUCTIONS.md diff --git a/inference/python/peft_demo/INSTRUCTIONS.md b/inference/python/peft_demo/INSTRUCTIONS.md new file mode 100644 index 0000000000..9b2a7a53b2 --- /dev/null +++ b/inference/python/peft_demo/INSTRUCTIONS.md @@ -0,0 +1,25 @@ +## Peft Demo +* `git clone -b peft --recursive https://github.com/flexflow/FlexFlow.git` +* `cd FlexFlow/` + +* If you wish to run the demo by installing FlexFlow + * `conda env create -f conda/flexflow.yml` + * `conda activate flexflow` + +* If you wish to run the demo using a Docker container + * `export FF_CUDA_ARCH=all && export cuda_version=12.0 && ./docker/build.sh flexflow && ./docker/run.sh flexflow` + +* Then, install the Llama2 model (the `meta-llama/Llama-2-7b-hf` model is gated, so make sure to add your HF access token) + + * `export HUGGINGFACE_TOKEN="[Your token]"` + * `huggingface-cli login --token "$HUGGINGFACE_TOKEN"` + * `python3 inference/utils/download_peft_model.py "goliaro/llama-2-7b-lora-full" --base_model_name "meta-llama/Llama-2-7b-hf"` + +* Run the demo + ``` + mkdir inference/output + cd inference/python/peft_demo/ + python3 demo.py -config-file demo_config.json + ``` + + diff --git a/inference/python/peft_demo/demo_config.json b/inference/python/peft_demo/demo_config.json index aca759e681..fa8f577e04 100644 --- a/inference/python/peft_demo/demo_config.json +++ b/inference/python/peft_demo/demo_config.json @@ -25,5 +25,5 @@ "cache_path": "", "refresh_cache": false, "full_precision": true, - "output_file": "../output/peft.txt" + "output_file": "../../output/peft_demo.txt" } \ No newline at end of file From 511fd649da6ec3587d11da12cf0389b746dfd569 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 20 Jun 2024 02:06:31 +0000 Subject: [PATCH 196/198] fix alignment issue --- include/flexflow/config.h | 27 +++-- include/flexflow/utils/cuda_helper.h | 1 + inference/MODEL_WEIGHTS.md | 28 ----- inference/README.md | 27 +++++ src/c/flexflow_c.cc | 2 +- src/ops/inc_multihead_self_attention.cu | 9 +- src/ops/spec_inc_multihead_self_attention.cu | 43 ++------ src/ops/tree_inc_multihead_self_attention.cu | 20 ++-- src/runtime/cuda_helper.cu | 17 +++ src/runtime/model.cpp | 4 +- src/runtime/model.cu | 4 +- src/runtime/request_manager.cpp | 45 ++++---- src/runtime/request_manager.cu | 108 ++++++++----------- 13 files changed, 151 insertions(+), 184 deletions(-) delete mode 100644 inference/MODEL_WEIGHTS.md create mode 100644 inference/README.md diff --git a/include/flexflow/config.h 
b/include/flexflow/config.h index 3cf985f279..dd9d657117 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -68,6 +68,23 @@ class FFConfig; class MemoryAllocator; class PEFTWeightAllocator; +struct CombinedBatchConfigMetaStruct { + BatchConfig::PerTokenInfo tokens_info[BatchConfig::MAX_NUM_TOKENS]; + BatchConfig::PerRequestInfo requestsInfo[BatchConfig::MAX_NUM_REQUESTS]; + BatchConfig::BitMask causalMask[BatchConfig::MAX_NUM_REQUESTS]; + bool request_completed[BatchConfig::MAX_NUM_REQUESTS]; + + BeamSearchBatchConfig::BeamSearchPerTokenInfo + beamTokenInfo[BeamSearchBatchConfig::MAX_NUM_TOKENS + + BeamSearchBatchConfig::MAX_SPEC_TREE_TOKEN_NUM * + BeamSearchBatchConfig::MAX_NUM_REQUESTS]; + BeamSearchBatchConfig::BeamSearchPerRequestInfo + beamRequestsInfo[BeamSearchBatchConfig::MAX_NUM_REQUESTS]; + + TreeVerifyBatchConfig::CommittedTokensInfo + committed_tokens[TreeVerifyBatchConfig::MAX_NUM_TOKENS]; +}; + struct FFHandler { #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) cudnnHandle_t dnn; @@ -78,16 +95,10 @@ struct FFHandler { #endif void *workSpace; size_t workSpaceSize; - void *batch_config_metadata; + CombinedBatchConfigMetaStruct *batch_config_metadata; // request info + token info + topolopgy mask info - size_t batch_config_metadata_size = - sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo) + - sizeof(BeamSearchBatchConfig::beamRequestsInfo) + - sizeof(BatchConfig::causalMask) + - sizeof(TreeVerifyBatchConfig::committed_tokens) + - sizeof(BatchConfig::request_completed); + size_t batch_config_metadata_size = sizeof(CombinedBatchConfigMetaStruct); void *offload_reserve_space; size_t offload_reserve_space_size; // PEFT related fields diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index caaa54683a..486a65eb3d 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ -183,3 +183,4 @@ cudaDataType_t cudnn_to_cuda_datatype(cudnnDataType_t type); cudnnDataType_t cuda_to_cudnn_datatype(cudaDataType_t type); #endif void check_device_vs_host_ptr(void const *maybe_devicePtr); +void check_ptr_alignment(void const *ptr); diff --git a/inference/MODEL_WEIGHTS.md b/inference/MODEL_WEIGHTS.md deleted file mode 100644 index d78fb37be9..0000000000 --- a/inference/MODEL_WEIGHTS.md +++ /dev/null @@ -1,28 +0,0 @@ -To convert the weights of a HuggingFace LLM to SpecInfer's weight format, we first load the model and modify the tensor names to match SpecInfer's convention, and then convert these tensors to numpy arrays to store them in binary files. 
- -```python -from transformers import AutoModelForCausalLM -model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf") - -for name, params in model.named_parameters(): - for name, params in model.named_parameters(): - name = ( - name.replace(".", "_") - .replace("self_attn", "attention") - .replace("q_proj", "wq") - .replace("k_proj", "wk") - .replace("v_proj", "wv") - .replace("o_proj", "wo") - .replace("mlp", "feed_forward") - .replace("gate_proj", "w1") - .replace("down_proj", "w2") - .replace("up_proj", "w3") - .replace("input_layernorm", "attention_norm") - .replace("post_attention_layernorm", "ffn_norm") - .replace("embed_tokens", "tok_embeddings") - .replace("lm_head", "output") - .replace("model_", "") - ) - params.detach().cpu().numpy().tofile('weights/llama_7B_weights/' + name) -``` - diff --git a/inference/README.md b/inference/README.md new file mode 100644 index 0000000000..7ddf118715 --- /dev/null +++ b/inference/README.md @@ -0,0 +1,27 @@ +# Inference Examples +This folder contains the code to run inference examples in FlexFlow + +To create a sample prompt, call (from the `build` folder): + +```bash +mkdir -p ../inference/prompt +echo '["San Francisco is a "]' > ../inference/prompt/test.json +``` + +To download a model for use in C++, call: +```bash +huggingface-cli login # if needed +python ../inference/utils/download_hf_model.py meta-llama/Llama-2-7b-hf --half-precision-only +``` + +To run the incremental decoding example in C++, call: + +```bash +./inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../inference/prompt/test.json -tensor-parallelism-degree 4 +``` + +To run the speculative inference example in C++, call: + +```bash +./inference/spec_infer/spec_infer -ll:cpu 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../inference/prompt/test.json -tensor-parallelism-degree 4 +``` diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 43fcd55a02..76ca5053d6 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -2846,7 +2846,7 @@ flexflow_peft_model_id_t flexflow_peft_model_id_create_id(size_t id) { } flexflow_peft_model_id_t flexflow_peft_model_id_no_id() { - PEFTModelID *handle = const_cast(&PEFTModelID::NO_ID); + PEFTModelID *handle = const_cast(&PEFTModelID::NO_ID); DEBUG_PRINT("[PEFTModelID] new %p", handle); return FFCObjectWrapper::wrap(handle); } diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index d1b93cb206..aa98dc4964 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -2088,11 +2088,10 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( valueCache = gpu_mem_allocator.allocate_instance_untyped(value_cache_size * size_of_dt); - token_infos = - static_cast(handler.batch_config_metadata); - request_infos = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo)); + token_infos = static_cast( + handler.batch_config_metadata->tokens_info); + request_infos = static_cast( + handler.batch_config_metadata->requestsInfo); if (offload) { // token_infos = diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 6c3ef9895b..4688a8233c 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -542,20 +542,9 @@ void 
compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, DT const *A = static_cast<DT *>(m->devQKVProjArray) + bc->requestsInfo[i].first_token_offset_in_batch * m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; - // To get B, skip over K entries from previous requests (all heads + - // padding) - - // print_tensor((float*)A, 32, "A"); DT const *B = static_cast<DT *>(m->keyCache) + i * kt_req_block_size; + DT *C = static_cast<DT *>(m->qk_prods); - // if (i == 0 && sub_req_id == 0 && - // bc->beam_slots.at(0).current_depth == 1) { - // int offset = (float *)B - m->keyCache; - // printf("key cache offset %d\n", kt_req_block_size); - // } - // To get C, skip over QK^T products from previous requests - DT *C = static_cast<DT *>
(m->qk_prods) + - m->num_q_heads * tokens_prev_requests_squares; checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, @@ -855,29 +844,15 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { beam_token_infos = - reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo)); - + static_cast( + handler.batch_config_metadata->beamTokenInfo); beam_request_infos = - reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo)); - causalMask = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo) + - sizeof(BeamSearchBatchConfig::beamRequestsInfo)); - - request_completed = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo) + - sizeof(BeamSearchBatchConfig::beamRequestsInfo) + - sizeof(BatchConfig::causalMask)); + static_cast( + handler.batch_config_metadata->beamRequestsInfo); + causalMask = static_cast( + handler.batch_config_metadata->causalMask); + request_completed = + static_cast(handler.batch_config_metadata->request_completed); } cudaStreamSynchronize(stream); diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 909b34aa5f..02f39192df 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -1061,21 +1061,13 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - causalMask = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo)); + causalMask = static_cast( + handler.batch_config_metadata->causalMask); committed_token_infos = - reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo) + - sizeof(BatchConfig::causalMask)); - - request_completed = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + - sizeof(BatchConfig::causalMask) + - sizeof(TreeVerifyBatchConfig::committed_tokens)); + static_cast( + handler.batch_config_metadata->committed_tokens); + request_completed = + static_cast(handler.batch_config_metadata->request_completed); } cudaStreamSynchronize(stream); diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index 56294c5e35..880a570b0c 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -646,6 +646,23 @@ void check_device_vs_host_ptr(void const *maybe_devicePtr) { } } +void check_ptr_alignment(void const *ptr) { + if (!ptr) { + printf("Pointer is NULL\n"); + return; + } + bool aligned2 = ((uintptr_t)ptr % 2 == 0); + bool aligned4 = ((uintptr_t)ptr % 4 == 0); + bool aligned8 = ((uintptr_t)ptr % 8 == 0); + bool aligned16 = ((uintptr_t)ptr % 16 == 0); + printf("Pointer %p is aligned as follows: 2=%s, 4=%s, 8=%s, 16=%s\n", + ptr, + (aligned2 ? "yes" : "no"), + (aligned4 ? "yes" : "no"), + (aligned8 ? 
"yes" : "no"), + (aligned16 ? "yes" : "no")); +} + template __global__ void assign_kernel(half *ptr, coord_t size, half value); template __global__ void diff --git a/src/runtime/model.cpp b/src/runtime/model.cpp index ad2b781567..0a8253dd2f 100644 --- a/src/runtime/model.cpp +++ b/src/runtime/model.cpp @@ -174,8 +174,8 @@ FFHandler 0, Realm::ProfilingRequestSet()) .wait(); - handle.batch_config_metadata = - workspaceInst.pointer_untyped(0, sizeof(char)); + handle.batch_config_metadata = static_cast( + workspaceInst.pointer_untyped(0, sizeof(char))); } else { handle.batch_config_metadata = nullptr; } diff --git a/src/runtime/model.cu b/src/runtime/model.cu index 5e07ae0894..56b1e2a6a5 100644 --- a/src/runtime/model.cu +++ b/src/runtime/model.cu @@ -172,8 +172,8 @@ FFHandler 0, Realm::ProfilingRequestSet()) .wait(); - handle.batch_config_metadata = - workspaceInst.pointer_untyped(0, sizeof(char)); + handle.batch_config_metadata = static_cast( + workspaceInst.pointer_untyped(0, sizeof(char))); } else { handle.batch_config_metadata = nullptr; } diff --git a/src/runtime/request_manager.cpp b/src/runtime/request_manager.cpp index fadbf80d6d..8e5f302466 100644 --- a/src/runtime/request_manager.cpp +++ b/src/runtime/request_manager.cpp @@ -73,74 +73,69 @@ void RequestManager::load_batch_config_task( // copy meta data to workSpace FFHandler handle = *((FFHandler const *)task->local_args); - size_t total_copy_size = 0; - checkCUDA(hipMemcpyAsync(handle.batch_config_metadata, + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->tokens_info, &(batch_config->tokensInfo), sizeof(BatchConfig::tokensInfo), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::tokensInfo); - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->requestsInfo, &(batch_config->requestsInfo), sizeof(BatchConfig::requestsInfo), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::requestsInfo); // load speculative metadata if (batch_config->get_mode() == BEAM_SEARCH_MODE) { BeamSearchBatchConfig const *beam_batch_config = static_cast(batch_config); - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->beamTokenInfo, &(beam_batch_config->beamTokenInfo), sizeof(BeamSearchBatchConfig::beamTokenInfo), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BeamSearchBatchConfig::beamTokenInfo); - - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->beamRequestsInfo, &(beam_batch_config->beamRequestsInfo), sizeof(BeamSearchBatchConfig::beamRequestsInfo), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BeamSearchBatchConfig::beamRequestsInfo); - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->causalMask, &(beam_batch_config->causalMask), sizeof(BatchConfig::causalMask), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::causalMask); + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->request_completed, + &(batch_config->request_completed), + sizeof(BatchConfig::request_completed), + hipMemcpyHostToDevice, + stream)); + } else if (batch_config->get_mode() == TREE_VERIFY_MODE) { TreeVerifyBatchConfig const *tree_batch_config = static_cast(batch_config); - 
checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->causalMask, &(tree_batch_config->causalMask), sizeof(BatchConfig::causalMask), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::causalMask); - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->committed_tokens, &(tree_batch_config->committed_tokens), sizeof(TreeVerifyBatchConfig::committed_tokens), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(TreeVerifyBatchConfig::committed_tokens); - } - // add a size check - assert(total_copy_size <= handle.batch_config_metadata_size); + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->request_completed, + &(batch_config->request_completed), + sizeof(BatchConfig::request_completed), + hipMemcpyHostToDevice, + stream)); + } } void RequestManager::load_positions_task( diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu index 235d435580..343f1dd6e6 100644 --- a/src/runtime/request_manager.cu +++ b/src/runtime/request_manager.cu @@ -93,91 +93,69 @@ void RequestManager::load_batch_config_task( // copy meta data to workSpace FFHandler handle = *((FFHandler const *)task->local_args); - size_t total_copy_size = 0; - checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata, + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->tokens_info, &(batch_config->tokensInfo), sizeof(BatchConfig::tokensInfo), cudaMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::tokensInfo); - checkCUDA(cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->requestsInfo, &(batch_config->requestsInfo), sizeof(BatchConfig::requestsInfo), cudaMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::requestsInfo); // load speculative metadata if (batch_config->get_mode() == BEAM_SEARCH_MODE) { BeamSearchBatchConfig const *beam_batch_config = static_cast(batch_config); - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(beam_batch_config->beamTokenInfo), - sizeof(BeamSearchBatchConfig::beamTokenInfo), - cudaMemcpyHostToDevice, - stream)); - - total_copy_size += sizeof(BeamSearchBatchConfig::beamTokenInfo); - - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(beam_batch_config->beamRequestsInfo), - sizeof(BeamSearchBatchConfig::beamRequestsInfo), - cudaMemcpyHostToDevice, - stream)); - total_copy_size += sizeof(BeamSearchBatchConfig::beamRequestsInfo); - - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(beam_batch_config->causalMask), - sizeof(BatchConfig::causalMask), - cudaMemcpyHostToDevice, - stream)); - total_copy_size += sizeof(BatchConfig::causalMask); - - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(batch_config->request_completed), - sizeof(BatchConfig::request_completed), - cudaMemcpyHostToDevice, - stream)); - - total_copy_size += sizeof(BatchConfig::request_completed); + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->beamTokenInfo, + &(beam_batch_config->beamTokenInfo), + sizeof(BeamSearchBatchConfig::beamTokenInfo), + cudaMemcpyHostToDevice, + stream)); + + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->beamRequestsInfo, + 
&(beam_batch_config->beamRequestsInfo), + sizeof(BeamSearchBatchConfig::beamRequestsInfo), + cudaMemcpyHostToDevice, + stream)); + + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->causalMask, + &(beam_batch_config->causalMask), + sizeof(BatchConfig::causalMask), + cudaMemcpyHostToDevice, + stream)); + + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->request_completed, + &(batch_config->request_completed), + sizeof(BatchConfig::request_completed), + cudaMemcpyHostToDevice, + stream)); + } else if (batch_config->get_mode() == TREE_VERIFY_MODE) { TreeVerifyBatchConfig const *tree_batch_config = static_cast(batch_config); - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(tree_batch_config->causalMask), - sizeof(BatchConfig::causalMask), - cudaMemcpyHostToDevice, - stream)); - total_copy_size += sizeof(BatchConfig::causalMask); - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(tree_batch_config->committed_tokens), - sizeof(TreeVerifyBatchConfig::committed_tokens), - cudaMemcpyHostToDevice, - stream)); - total_copy_size += sizeof(TreeVerifyBatchConfig::committed_tokens); - - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(batch_config->request_completed), - sizeof(BatchConfig::request_completed), - cudaMemcpyHostToDevice, - stream)); - - total_copy_size += sizeof(BatchConfig::request_completed); + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->causalMask, + &(tree_batch_config->causalMask), + sizeof(BatchConfig::causalMask), + cudaMemcpyHostToDevice, + stream)); + + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->committed_tokens, + &(tree_batch_config->committed_tokens), + sizeof(TreeVerifyBatchConfig::committed_tokens), + cudaMemcpyHostToDevice, + stream)); + + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->request_completed, + &(batch_config->request_completed), + sizeof(BatchConfig::request_completed), + cudaMemcpyHostToDevice, + stream)); } - - // add a size check - assert(total_copy_size <= handle.batch_config_metadata_size); } void RequestManager::load_positions_task( From 2899ba29e92ec42787a45adcd0e798a7a89d4ab0 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 9 Jul 2024 01:52:30 +0000 Subject: [PATCH 197/198] Initial implemention of disaggregated attention and qkv projection --- .../inc_multihead_self_attention_kernels.h | 10 + inference/models/llama.cc | 35 ++- src/ops/inc_multihead_self_attention.cc | 2 +- src/ops/inc_multihead_self_attention.cpp | 187 +++++++------ src/ops/inc_multihead_self_attention.cu | 163 +++++++++++- src/ops/spec_inc_multihead_self_attention.cpp | 89 ++++--- src/ops/spec_inc_multihead_self_attention.cu | 23 +- src/ops/tree_inc_multihead_self_attention.cpp | 89 ++++--- src/ops/tree_inc_multihead_self_attention.cu | 36 ++- src/runtime/file_loader.cc | 248 ++++++++++++++++++ 10 files changed, 707 insertions(+), 175 deletions(-) diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h index 9bf2f581e2..552d5e3496 100644 --- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h @@ -100,6 +100,16 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, DT const *bias_ptr, ffStream_t stream); +template +void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int 
shard_id, + // DT const *input_ptr, + DT const *weight_ptr, + DT *output_ptr, + DT const *bias_ptr, + ffStream_t stream); + template void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, GenericTensorAccessorR const weight, diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 4be232e81b..e4b2e5a537 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -92,11 +92,26 @@ void LLAMA::create_llama_model(FFModel &ff, att_norm = token_att_norm[1]; } + Tensor qkv_proj = ff.dense( + att_norm, + llama_config.hidden_size * 3, // q, k, v. need to change if want to remove replication. (q_heads + 2 * kv_heads) * proj_size + AC_MODE_NONE, + false, // seems like llama does not use bias + DT_NONE, // what is this + nullptr, // ? + nullptr, // ? + nullptr, // ? + REG_MODE_NONE, // no regularization + 0.0f, // no dropout + std::string("layers_" + std::to_string(i) + "_attn_qkv_proj") + .c_str() + ); + Tensor mha; switch (mode) { case BEAM_SEARCH_MODE: { mha = ff.spec_inc_multihead_self_attention( - att_norm, + qkv_proj, llama_config.hidden_size, llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, @@ -119,7 +134,7 @@ void LLAMA::create_llama_model(FFModel &ff, } case TREE_VERIFY_MODE: { mha = ff.inc_multihead_self_attention_verify( - att_norm, + qkv_proj, llama_config.hidden_size, llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, @@ -142,7 +157,7 @@ void LLAMA::create_llama_model(FFModel &ff, } case INC_DECODING_MODE: { mha = ff.inc_multihead_self_attention( - att_norm, + qkv_proj, llama_config.hidden_size, llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, @@ -168,6 +183,20 @@ void LLAMA::create_llama_model(FFModel &ff, } } + Tensor mha_input = mha; + mha = ff.dense(mha_input, + llama_config.hidden_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers_" + std::to_string(i) + "_attn_o_proj") + .c_str()); + // step 2: SILU activaion Tensor token_ff_norm[2] = {nullptr, nullptr}; ff.residual_rms_norm( diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 5d52034575..1d5528f759 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -142,7 +142,7 @@ Tensor FFModel::inc_multiquery_self_attention(const Tensor input, for (int i = 0; i < numdims; i++) { dims[i] = input->dims[i]; } - dims[0] = embed_dim; + dims[0] = vdim * num_kv_heads; // we now output o_proj_dim * o_heads li->outputs[0] = create_tensor_legion_ordering( numdims, dims, data_type, li, 0, true /*create_grad*/); } diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index d38f93558e..1fd2564013 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -246,7 +246,7 @@ template void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, int shard_id, - DT const *input_ptr, + // DT const *input_ptr, DT const *weight_ptr, DT *output_ptr, DT const *bias_ptr, @@ -277,25 +277,26 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, int k = m->qSize; int m_ = m_q * QKV_WEIGHT_NUM; int lda = k, ldb = k, ldc = m_; - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_N, - m_, - n, - k, - &alpha, - weight_ptr, - hipblas_data_type, - lda, - input_ptr, - hipblas_data_type, - ldb, 
- &beta, - output_ptr, - hipblas_data_type, - ldc, - compute_type, - HIPBLAS_GEMM_DEFAULT)); + // this projection is done in dense layer, no need for gemm here + // checkCUDA(hipblasGemmEx(m->handle.blas, + // HIPBLAS_OP_T, + // HIPBLAS_OP_N, + // m_, + // n, + // k, + // &alpha, + // weight_ptr, + // hipblas_data_type, + // lda, + // input_ptr, + // hipblas_data_type, + // ldb, + // &beta, + // output_ptr, + // hipblas_data_type, + // ldc, + // compute_type, + // HIPBLAS_GEMM_DEFAULT)); // apply rotary emmmbedding for q and k // step1 change the k, v to complex tensor @@ -303,25 +304,38 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, int parallelism = m->kProjSize * num_tokens * m->num_q_heads; size_t q_array_size = m->qProjSize * num_tokens * m->num_q_heads; // apply bias for q, k, v - if (*m->qkv_bias) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_qkv
<DT>), - GET_BLOCKS(parallelism), - min(CUDA_NUM_THREADS, parallelism), - 0, - stream, - output_ptr, - bias_ptr, - shard_id, - num_tokens, - m->qProjSize, - m->kProjSize, - m->vProjSize, - m->global_num_q_heads, - m->num_q_heads, - *m->scaling_query, - m->scaling_factor, - m->hidden_size); - } else if (m->scaling_query) { + // if (*m->qkv_bias) { + // hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_qkv<DT>), + // GET_BLOCKS(parallelism), + // min(CUDA_NUM_THREADS, parallelism), + // 0, + // stream, + // output_ptr, + // bias_ptr, + // shard_id, + // num_tokens, + // m->qProjSize, + // m->kProjSize, + // m->vProjSize, + // m->global_num_q_heads, + // m->num_q_heads, + // *m->scaling_query, + // m->scaling_factor, + // m->hidden_size); + // } else if (m->scaling_query) { + // hipLaunchKernelGGL(HIP_KERNEL_NAME(scaling_query_kernel<DT>), + // GET_BLOCKS(parallelism), + // min(CUDA_NUM_THREADS, parallelism), + // 0, + // stream, + // output_ptr, + // num_tokens, + // m->num_q_heads, + // m->qProjSize, + // m->scaling_factor, + // m->hidden_size); + // } + if (m->scaling_query) { hipLaunchKernelGGL(HIP_KERNEL_NAME(scaling_query_kernel<DT>
), GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), @@ -439,7 +453,7 @@ template void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, int shard_id, - DT const *input_ptr, + DT const *qkv_ptr, DT const *weight_ptr, DT *output_ptr, DT const *bias_ptr, @@ -457,11 +471,22 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, sizeof(BatchConfig::PerTokenInfo), hipMemcpyHostToDevice, stream)); + + // phase 0: copy calculated qkv into devQKVProjArray + // [qProjSize, num_heads, 3, num_new_tokens] + size_t qkv_proj_size = m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens(); + + cudaMemcpyAsync(m->devQKVProjArray, + qkv_ptr, + qkv_proj_size * sizeof(DT), // is this right, do we need layers etc here + cudaMemcpyDeviceToDevice, + stream); + // phase 1: Implement kernel to compute KQV for input tokens compute_qkv_kernel(m, bc, shard_id, - input_ptr, + // input_ptr, weight_ptr, static_cast
<DT *>(m->devQKVProjArray), bias_ptr, @@ -703,47 +728,51 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, m->kProjSize * m->num_q_heads + m->vProjSize * m->num_q_heads); B = C; - C = static_cast<DT *>(output_ptr) + tokens_previous_requests * m->oProjSize; - - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - B, - hipblas_data_type, - ldb, - &beta, - C, - hipblas_data_type, - ldc, - compute_type, - HIPBLAS_GEMM_DEFAULT)); + C = static_cast<DT *>
(output_ptr) + tokens_previous_requests * m->oProjSize; // what is the shape here? + + // checkCUDA(hipblasGemmEx(m->handle.blas, + // HIPBLAS_OP_T, + // HIPBLAS_OP_T, + // m_, + // n, + // k, + // &alpha, + // A, + // hipblas_data_type, + // lda, + // B, + // hipblas_data_type, + // ldb, + // &beta, + // C, + // hipblas_data_type, + // ldc, + // compute_type, + // HIPBLAS_GEMM_DEFAULT)); tokens_previous_requests += num_new_tokens; } - if (*m->final_bias && shard_id == 0) { - int parallelism = m->oProjSize * num_tokens; - int qkv_weight_size = m->qProjSize * m->global_num_q_heads + - m->kProjSize * m->global_num_q_heads + - m->vProjSize * m->global_num_q_heads; - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_w
), - GET_BLOCKS(parallelism), - min(CUDA_NUM_THREADS, parallelism), - 0, - stream, - output_ptr, - bias_ptr, - num_tokens, - qkv_weight_size, - m->oProjSize); - } - + // if (*m->final_bias && shard_id == 0) { + // int parallelism = m->oProjSize * num_tokens; + // int qkv_weight_size = m->qProjSize * m->global_num_q_heads + + // m->kProjSize * m->global_num_q_heads + + // m->vProjSize * m->global_num_q_heads; + // hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_w
), + // GET_BLOCKS(parallelism), + // min(CUDA_NUM_THREADS, parallelism), + // 0, + // stream, + // output_ptr, + // bias_ptr, + // num_tokens, + // qkv_weight_size, + // m->oProjSize); + // } + cudaMemcpyAsync(output_ptr, + m->attn_heads, + m->oProjSize * num_tokens * sizeof(DT), + cudaMemcpyDeviceToDevice, + stream); assert(tokens_previous_requests == num_tokens); } diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index aa98dc4964..8334dc0636 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -640,6 +640,139 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, m->hidden_size); } + + // Step 3: apply rotary embedding if needed + if (*m->apply_rotary_embedding) { + /*q&k*/ + parallelism = num_tokens * m->hidden_size; + apply_rotary_embedding_hf<<>>(output_ptr, + m->complex_input, + m->token_infos, + m->qProjSize, + m->kProjSize, + num_tokens, + q_array_size, + m->hidden_size); + } +} + +template +void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + // DT const *input_ptr, we no longer use the raw input + DT const *weight_ptr, + DT *output_ptr, + DT const *bias_ptr, + cudaStream_t stream) { + + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + assert(m->qSize == m->vSize && m->qSize == m->kSize); + cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + cudaDataType_t compute_type = cublas_data_type; +#else + // For best performance, set the default cublas compute type to + // CUBLAS_COMPUTE_16F for half precision and to + // CUBLAS_COMPUTE_32F_FAST_16F for full precision + cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + if (m->output_type[0] == DT_FLOAT) { + compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + } +#endif + + // this block is deleted so that dense operator are done in model, + // which allows for peft on qkv projection + // // Step 1: Compute QKV projections + // { + // DT alpha = 1.0f, beta = 0.0f; + // // after transpositions + // int m_q = m->qProjSize * m->num_q_heads; + // int m_k = m->kProjSize * m->num_q_heads; + // int m_v = m->vProjSize * m->num_q_heads; + // assert(m_q == m_k && m_k == m_v); // keep things simple for now + // int n = bc->num_active_tokens(); + // int k = m->qSize; + // int m_ = m_q * QKV_WEIGHT_NUM; + // // before transpositions + // int lda = k, ldb = k, ldc = m_; + // // matrix A: QKV weights + // // matrix A's layout: [qSize (hidden_dim), qProjSize, num_heads, 3] + // // matrix B: input + // // matrix B's layout: [qSize (hidden_dim), num_new_tokens] + // // matrix C: devQKVProjArray + // // matrix B's layout: [qProjSize, num_heads, 3, num_new_tokens] + // checkCUDA(cublasGemmEx(m->handle.blas, + // CUBLAS_OP_T, + // CUBLAS_OP_N, + // m_, + // n, + // k, + // &alpha, + // weight_ptr, + // cublas_data_type, + // lda, + // input_ptr, + // cublas_data_type, + // ldb, + // &beta, + // output_ptr, + // cublas_data_type, + // ldc, + // compute_type, + // CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // } + + int num_tokens = bc->num_active_tokens(); + int parallelism = m->kProjSize * num_tokens * m->num_q_heads; + size_t q_array_size = m->qProjSize * num_tokens * m->num_q_heads; + + // Step 2: apply bias for QKV, or scale the query + // this are handled in the dense layer with bias, but we still need to handle scaling + // if (*m->qkv_bias) { + // 
apply_proj_bias_qkv<<>>(output_ptr, + // bias_ptr, + // shard_id, + // num_tokens, + // m->qProjSize, + // m->kProjSize, + // m->vProjSize, + // m->global_num_q_heads, + // m->num_q_heads, + // *m->scaling_query, + // m->scaling_factor, + // m->hidden_size); + // } else if (m->scaling_query) { + // scaling_query_kernel<<>>(output_ptr, + // num_tokens, + // m->num_q_heads, + // m->qProjSize, + // m->scaling_factor, + // m->hidden_size); + // } + + if (m->scaling_query) { + scaling_query_kernel<<>>(output_ptr, + num_tokens, + m->num_q_heads, + m->qProjSize, + m->scaling_factor, + m->hidden_size); + } + // Step 3: apply rotary embedding if needed if (*m->apply_rotary_embedding) { /*q&k*/ @@ -860,7 +993,7 @@ template void inference_kernel(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, - DT const *input_ptr, + DT const *qkv_ptr, DT const *weight_ptr, DT *output_ptr, DT const *bias_ptr, @@ -872,11 +1005,23 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m, bias_ptr = static_cast
(m->bias_ptr); } + // phase 0: copy calculated qkv into devQKVProjArray + // [qProjSize, num_heads, 3, num_new_tokens] + size_t qkv_proj_size = m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens(); + + cudaMemcpyAsync(m->devQKVProjArray, + qkv_ptr, + qkv_proj_size * sizeof(DT), // is this right, do we need layers etc here + cudaMemcpyDeviceToDevice, + stream); + // phase 1: Implement kernel to compute KQV for input tokens + + compute_qkv_kernel(m, bc, shard_id, - input_ptr, + // input_ptr, weight_ptr, static_cast
(m->devQKVProjArray), bias_ptr, @@ -897,8 +1042,18 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m, // compute output production and bias together for all tokens int num_tokens = bc->num_active_tokens(); - compute_o_prod_bias( - m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); + + // this dense layer (with bias) is done in the model by seperate dense layer + // compute_o_prod_bias( + // m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); + + // simply copy the result to output_ptr + // TODO: change the meta for output, maybe transpose here? + cudaMemcpyAsync(output_ptr, + m->attn_heads, + m->oProjSize * num_tokens * sizeof(DT), + cudaMemcpyDeviceToDevice, + stream); } std::string get_peft_dbg_folder(IncMultiHeadSelfAttentionMeta const *m, diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index aebd5e8892..5c5cade645 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -414,45 +414,50 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, C = static_cast
(output_ptr) + tokens_previous_requests * m->oProjSize; - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - B, - hipblas_data_type, - ldb, - &beta, - C, - hipblas_data_type, - ldc, - compute_type, - HIPBLAS_GEMM_DEFAULT)); + // checkCUDA(hipblasGemmEx(m->handle.blas, + // HIPBLAS_OP_T, + // HIPBLAS_OP_T, + // m_, + // n, + // k, + // &alpha, + // A, + // hipblas_data_type, + // lda, + // B, + // hipblas_data_type, + // ldb, + // &beta, + // C, + // hipblas_data_type, + // ldc, + // compute_type, + // HIPBLAS_GEMM_DEFAULT)); tokens_previous_requests += num_new_tokens; tokens_prev_requests_squares += num_new_tokens * total_tokens; } } - if (*m->final_bias && shard_id == 0) { - int parallelism = m->oProjSize * num_tokens; - int qkv_weight_size = m->qProjSize * m->global_num_q_heads + - m->kProjSize * m->global_num_q_heads + - m->vProjSize * m->global_num_q_heads; - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_w
), - GET_BLOCKS(parallelism), - min(CUDA_NUM_THREADS, parallelism), - 0, - stream, - output_ptr, - bias_ptr, - num_tokens, - qkv_weight_size, - m->oProjSize); - } + // if (*m->final_bias && shard_id == 0) { + // int parallelism = m->oProjSize * num_tokens; + // int qkv_weight_size = m->qProjSize * m->global_num_q_heads + + // m->kProjSize * m->global_num_q_heads + + // m->vProjSize * m->global_num_q_heads; + // hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_w
), + // GET_BLOCKS(parallelism), + // min(CUDA_NUM_THREADS, parallelism), + // 0, + // stream, + // output_ptr, + // bias_ptr, + // num_tokens, + // qkv_weight_size, + // m->oProjSize); + // } + cudaMemcpyAsync(output_ptr, + m->attn_heads, + m->oProjSize * num_tokens * sizeof(DT), + cudaMemcpyDeviceToDevice, + stream); assert(tokens_previous_requests == num_tokens); } @@ -461,7 +466,7 @@ template void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, BeamSearchBatchConfig const *bc, int shard_id, - DT const *input_ptr, + DT const *qkv_ptr, DT const *weight_ptr, DT *output_ptr, DT const *bias_ptr, @@ -494,11 +499,21 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo), hipMemcpyHostToDevice, stream)); + // phase 0: copy calculated qkv into devQKVProjArray + // [qProjSize, num_heads, 3, num_new_tokens] + size_t qkv_proj_size = m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens(); + + cudaMemcpyAsync(m->devQKVProjArray, + qkv_ptr, + qkv_proj_size * sizeof(DT), // is this right, do we need layers etc here + cudaMemcpyDeviceToDevice, + stream); + // phase 1: Implement kernel to compute KQV for input tokens compute_qkv_kernel(m, bc, shard_id, - input_ptr, + // input_ptr, weight_ptr, static_cast
(m->devQKVProjArray), bias_ptr, diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 4688a8233c..ce8055286e 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -698,17 +698,27 @@ template void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, BeamSearchBatchConfig const *bc, int shard_id, - DT const *input_ptr, + DT const *qkv_ptr, DT const *weight_ptr, DT *output_ptr, DT const *bias_ptr, cudaStream_t stream) { + + // phase 0: copy calculated qkv into devQKVProjArray + // [qProjSize, num_heads, 3, num_new_tokens] + size_t qkv_proj_size = m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens(); + + cudaMemcpyAsync(m->devQKVProjArray, + qkv_ptr, + qkv_proj_size * sizeof(DT), // is this right, do we need layers etc here + cudaMemcpyDeviceToDevice, + stream); // phase 1: Implement kernel to compute KQV for input tokens compute_qkv_kernel(m, bc, shard_id, - input_ptr, + // input_ptr, weight_ptr, static_cast
(m->devQKVProjArray), bias_ptr, @@ -728,8 +738,13 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // compute output production and bias together for all tokens int num_tokens = bc->num_active_tokens(); - compute_o_prod_bias( - m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); + // compute_o_prod_bias( + // m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); + cudaMemcpyAsync(output_ptr, + m->attn_heads, + m->oProjSize * num_tokens * sizeof(DT), + cudaMemcpyDeviceToDevice, + stream); } } // namespace SpecIncMultiHeadSelfAttention diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 03e0ac6441..24476063b2 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -391,47 +391,52 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, C = static_cast
(output_ptr) + processed_tokens_in_batch * m->oProjSize; - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - B, - hipblas_data_type, - ldb, - &beta, - C, - hipblas_data_type, - ldc, - compute_type, - HIPBLAS_GEMM_DEFAULT)); + // checkCUDA(hipblasGemmEx(m->handle.blas, + // HIPBLAS_OP_T, + // HIPBLAS_OP_T, + // m_, + // n, + // k, + // &alpha, + // A, + // hipblas_data_type, + // lda, + // B, + // hipblas_data_type, + // ldb, + // &beta, + // C, + // hipblas_data_type, + // ldc, + // compute_type, + // HIPBLAS_GEMM_DEFAULT)); processed_tokens_in_batch += num_new_tokens; } // Before moving to the next request // check that we have finished all tokens of the request assert(last_token_idx_of_the_request + 1 == processed_tokens_in_batch); } - if (*m->final_bias && shard_id == 0) { - int parallelism = m->oProjSize * processed_tokens_in_batch; - int qkv_weight_size = m->qProjSize * m->global_num_q_heads + - m->kProjSize * m->global_num_q_heads + - m->vProjSize * m->global_num_q_heads; - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_w
), - GET_BLOCKS(parallelism), - min(CUDA_NUM_THREADS, parallelism), - 0, - stream, - output_ptr, - bias_ptr, - processed_tokens_in_batch, - qkv_weight_size, - m->oProjSize); - } + cudaMemcpyAsync(output_ptr, + m->attn_heads, + m->oProjSize * processed_tokens_in_batch * sizeof(DT), + cudaMemcpyDeviceToDevice, + stream); + // if (*m->final_bias && shard_id == 0) { + // int parallelism = m->oProjSize * processed_tokens_in_batch; + // int qkv_weight_size = m->qProjSize * m->global_num_q_heads + + // m->kProjSize * m->global_num_q_heads + + // m->vProjSize * m->global_num_q_heads; + // hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_w
), + // GET_BLOCKS(parallelism), + // min(CUDA_NUM_THREADS, parallelism), + // 0, + // stream, + // output_ptr, + // bias_ptr, + // processed_tokens_in_batch, + // qkv_weight_size, + // m->oProjSize); + // } assert(processed_tokens_in_batch == bc->num_active_infr_tokens()); } @@ -440,7 +445,7 @@ template void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, TreeVerifyBatchConfig const *bc, int shard_id, - DT const *input_ptr, + DT const *qkv_ptr, DT const *weight_ptr, DT *output_ptr, DT const *bias_ptr, @@ -490,11 +495,21 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, sizeof(TreeVerifyBatchConfig::PerTokenInfo), hipMemcpyHostToDevice, stream)); + // phase 0: copy calculated qkv into devQKVProjArray + // [qProjSize, num_heads, 3, num_new_tokens] + size_t qkv_proj_size = m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens(); + + cudaMemcpyAsync(m->devQKVProjArray, + qkv_ptr, + qkv_proj_size * sizeof(DT), // is this right, do we need layers etc here + cudaMemcpyDeviceToDevice, + stream); + // phase 1: Implement kernel to compute KQV for input tokens compute_qkv_kernel(m, bc, shard_id, - input_ptr, + // input_ptr, weight_ptr, static_cast
<DT *>(m->devQKVProjArray),
                     bias_ptr,
diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu
index 02f39192df..1dd5773ef4 100644
--- a/src/ops/tree_inc_multihead_self_attention.cu
+++ b/src/ops/tree_inc_multihead_self_attention.cu
@@ -874,7 +874,7 @@ template <typename DT>
 void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
                       TreeVerifyBatchConfig const *bc,
                       int shard_id,
-                      DT const *input_ptr,
+                      DT const *qkv_ptr,
                       DT const *weight_ptr,
                       DT *output_ptr,
                       DT const *bias_ptr,
@@ -915,11 +915,21 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m,
         m->bias_ptr, bias_ptr, m->biasSize, cudaMemcpyHostToDevice, stream);
     bias_ptr = static_cast<DT *>(m->bias_ptr);
   }
+  // phase 0: copy calculated qkv into devQKVProjArray
+  // [qProjSize, num_heads, 3, num_new_tokens]
+  size_t qkv_proj_size = m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens();
+
+  cudaMemcpyAsync(m->devQKVProjArray,
+                  qkv_ptr,
+                  qkv_proj_size * sizeof(DT), // is this right, do we need layers etc here
+                  cudaMemcpyDeviceToDevice,
+                  stream);
+
   // phase 1: Implement kernel to compute KQV for input tokens
   compute_qkv_kernel(m,
                      bc,
                      shard_id,
-                     input_ptr,
+                     // input_ptr,
                      weight_ptr,
                      static_cast<DT *>
(m->devQKVProjArray), bias_ptr, @@ -934,14 +944,20 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, int processed_tokens_in_batch = bc->num_active_tokens(); - compute_o_prod_bias(m, - bc, - shard_id, - output_ptr, - weight_ptr, - bias_ptr, - processed_tokens_in_batch, - stream); + // compute_o_prod_bias(m, + // bc, + // shard_id, + // output_ptr, + // weight_ptr, + // bias_ptr, + // processed_tokens_in_batch, + // stream); + int num_tokens = bc->num_active_tokens(); + cudaMemcpyAsync(output_ptr, + m->attn_heads, + m->oProjSize * num_tokens * sizeof(DT), + cudaMemcpyDeviceToDevice, + stream); } } // namespace TreeIncMultiHeadAttention diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index c373e0da9b..a66a6097ea 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -127,6 +127,59 @@ void load_attention_weights_multi_query(DT *ptr, } } +template +void load_attention_o_proj_bias_to_dense_v2(DT *ptr, + int num_heads, + int num_kv_heads, + size_t hidden_dim, + size_t qkv_inner_dim, + std::string layer_name, + std::string weights_folder) { + std::string filename = layer_name + "_wo_bias"; + + int file_index = 0; + + // now only opt use this. + // assert(num_heads == num_kv_heads); + int idx = 0; + + std::cout << "Loading weight file " << filename << std::endl; + std::string weight_filepath = join_path({weights_folder, filename}); + + int n_heads = num_heads; + + int replicate_num = num_heads / num_kv_heads; + + size_t out_partial_size = hidden_dim; + size_t partial_size = out_partial_size; + std::ifstream in(weight_filepath, std::ios::in | std::ios::binary); + assert(in.good() && "incorrect bias file path"); + std::vector
host_array(partial_size); + size_t loaded_data_size = sizeof(DT) * partial_size; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + size_t in_get_size = in.gcount(); + + if (in_get_size != loaded_data_size) { + printf( + "load bias data error: in_get_size (%lu) != loaded_data_size (%lu)\n", + in_get_size, + loaded_data_size); + assert(false); + } + assert(partial_size == host_array.size()); + + size_t data_index = 0; + + for (int i = 0; i < partial_size; i++) { + ptr[i] = host_array.at(data_index); + data_index++; + } + + in.close(); +} + template void load_attention_bias_v2(DT *ptr, int num_heads, @@ -207,6 +260,140 @@ void load_attention_bias_v2(DT *ptr, } } +template +void load_attention_weights_to_dense_v2(DT *ptr, + int num_heads, + int num_kv_heads, + size_t hidden_dim, + size_t qkv_inner_dim, + std::string layer_name, + std::string weights_folder, + size_t volume, + int tensor_parallelism_degree, + bool load_o_proj) { + // layers_0_attention_wq_weight + // layers_0_self_attn_q_proj_weight + std::string q_file = layer_name + "_wq_weight"; + std::string k_file = layer_name + "_wk_weight"; + std::string v_file = layer_name + "_wv_weight"; + std::string o_file = layer_name + "_wo_weight"; + std::vector weight_filenames = {q_file, k_file, v_file}; + int file_index = 0; + + int base_index = 0; + size_t single_proj_size = + hidden_dim * + qkv_inner_dim; // size of each of Q,K,V,O weights for a single head + size_t one_weight_file_size = + num_heads * single_proj_size; // size of each of Q/K/V/O for all heads + + size_t q_size = one_weight_file_size, o_size = one_weight_file_size; + size_t k_size = single_proj_size * num_kv_heads, + v_size = single_proj_size * num_kv_heads; + + size_t k_replicate_size = one_weight_file_size; + size_t v_replicate_size = one_weight_file_size; + + int replicate_num = num_heads / num_kv_heads; + + // stride for q, k, v, o + size_t stride_size = (q_size + v_replicate_size + k_replicate_size + o_size) / + tensor_parallelism_degree; + if(!load_o_proj) { + for (auto filename : weight_filenames) { + std::cout << "Loading weight file " << filename << std::endl; + std::string weight_filepath = join_path({weights_folder, filename}); + + int data_index = 0; + size_t partial_size = (file_index == 0 || file_index == 3) + ? one_weight_file_size + : single_proj_size * num_kv_heads; + size_t one_partition_size = + one_weight_file_size / tensor_parallelism_degree; + + std::ifstream in(weight_filepath, std::ios::in | std::ios::binary); + if (!in.good()) { + std::cout << "Could not open file: " << weight_filepath << std::endl; + } + assert(in.good() && "incorrect weight file path"); + std::vector
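// Aside: load_attention_o_proj_bias_to_dense_v2 above and the weight loaders
// below repeat the same open/read/verify sequence on a raw binary file. A
// compact sketch of that pattern as a reusable helper -- the helper name is
// illustrative, not part of the patch:
#include <cassert>
#include <fstream>
#include <string>
#include <vector>

template <typename DT>
std::vector<DT> read_binary_weights(std::string const &filepath, size_t num_elements) {
  std::ifstream in(filepath, std::ios::in | std::ios::binary);
  assert(in.good() && "incorrect weight file path");
  std::vector<DT> host_array(num_elements);
  size_t expected_bytes = sizeof(DT) * num_elements;
  in.read(reinterpret_cast<char *>(host_array.data()), expected_bytes);
  // gcount() reports how many bytes were actually read; mismatch means a bad file.
  assert(static_cast<size_t>(in.gcount()) == expected_bytes && "weight file size mismatch");
  return host_array;
}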
host_array(partial_size); + size_t loaded_data_size = sizeof(DT) * partial_size; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + size_t in_get_size = in.gcount(); + + if (in_get_size != loaded_data_size) { + std::cout << "load attention data error " << in_get_size << ", " + << loaded_data_size << ", " << file_index << ", " + << weight_filepath << "\n"; + assert(false && "data size mismatch"); + } + // wq, wk, wo + if (file_index == 0) { + for (int i = 0; i < tensor_parallelism_degree; i++) { + for (int j = 0; j < one_partition_size; j++) { + ptr[base_index + i * stride_size + j] = host_array.at(data_index++); + } + } + } else { + for (int i = 0; i < num_heads; i++) { + int kv_idx = i / (num_heads / num_kv_heads); + int head_idx = i % (num_heads / tensor_parallelism_degree); + int tp_idx = (i / (num_heads / tensor_parallelism_degree)); + for (int j = 0; j < single_proj_size; j++) { + ptr[base_index + tp_idx * stride_size + single_proj_size * head_idx + + j] = host_array.at(kv_idx * single_proj_size + j); + } + } + } + + // assert(data_index == partial_size); + base_index += one_partition_size; + file_index++; + } + assert(base_index == (q_size + k_replicate_size + v_replicate_size) / + tensor_parallelism_degree); + } else { + std::cout << "Loading weight file " << o_file << std::endl; + std::string weight_filepath = join_path({weights_folder, o_file}); + + std::ifstream in(weight_filepath, std::ios::in | std::ios::binary); + if (!in.good()) { + std::cout << "Could not open file: " << weight_filepath << std::endl; + } + assert(in.good() && "incorrect weight file path"); + std::vector
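// Aside: in the K/V branch above, each key/value head is replicated across the
// query heads that share it (grouped-query attention) and the heads are striped
// across tensor-parallel shards, stride_size elements apart. A small worked
// example of the index arithmetic, using illustrative sizes (32 query heads,
// 8 KV heads, tensor_parallelism_degree 4 -- not values taken from the patch):
#include <cstdio>
#include <initializer_list>

int main() {
  int num_heads = 32, num_kv_heads = 8, tensor_parallelism_degree = 4;
  for (int i : {0, 5, 13, 31}) {
    int kv_idx = i / (num_heads / num_kv_heads);                // source KV head
    int head_idx = i % (num_heads / tensor_parallelism_degree); // slot inside the shard
    int tp_idx = i / (num_heads / tensor_parallelism_degree);   // which TP shard
    std::printf("q head %2d <- kv head %d, shard %d, slot %d\n",
                i, kv_idx, tp_idx, head_idx);
  }
  return 0;
}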
host_array(one_weight_file_size); + size_t loaded_data_size = sizeof(DT) * one_weight_file_size; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + size_t in_get_size = in.gcount(); + + if (in_get_size != loaded_data_size) { + std::cout << "load data error" << std::endl; + assert(false); + } + assert(one_weight_file_size == host_array.size()); + int data_index = 0; + + int one_partition_size = + qkv_inner_dim * (num_heads / tensor_parallelism_degree); + for (int i = 0; i < one_weight_file_size; i++) { + int part_idx = (i / one_partition_size) % tensor_parallelism_degree; + int block_num = (i / one_partition_size); + int offset = block_num / tensor_parallelism_degree * one_partition_size + + (i % one_partition_size); + ptr[part_idx * stride_size + offset] = + host_array.at(data_index++); + } + + in.close(); + + assert(data_index == one_weight_file_size); + } +} + template void load_attention_weights_v2(DT *ptr, int num_heads, @@ -720,6 +907,21 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, DT *data = (DT *)malloc(sizeof(DT) * volume); std::string weight_filename = removeGuidOperatorName(std::string(l->name)); + bool is_attn_proj = false, is_o_proj = false; + + if (weight_filename.find("_proj") != std::string::npos) { + size_t pos = weight_filename.find("_attn_o_proj"); + if (pos != std::string::npos) { + weight_filename.replace(pos, std::string("_attn_o_proj").length(), "_attention"); + is_o_proj = true; + } else { + pos = weight_filename.find("_attn_qkv_proj"); + assert(pos != std::string::npos); + weight_filename.replace(pos, std::string("_attn_qkv_proj").length(), "_attention"); + } + is_attn_proj = true; + } + if (ff->config.benchmarking) { std::cout << "Initializing weight " << weight_filename @@ -753,6 +955,52 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, weight_filename, weights_folder); } + } else if(is_attn_proj) { + if(is_o_proj) { + if(weight_idx == 0) { + load_attention_weights_to_dense_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + weight_filename, + weights_folder, + volume, + tensor_parallelism_degree, + true); + } else { + load_attention_o_proj_bias_to_dense_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + weight_filename, + weights_folder); + + } + } else { + if(weight_idx == 0) { + load_attention_weights_to_dense_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + weight_filename, + weights_folder, + volume, + tensor_parallelism_degree, + false); + } else { + load_attention_bias_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + false, // do not load o_proj bias + weight_filename, + weights_folder); + } + } } else if (l->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM) { assert(weight_idx >= 0 || weight_idx <= 2); weight_filename += (weight_idx == 0) From 94e156321c9a8c517e24feb09ea7137eb3171962 Mon Sep 17 00:00:00 2001 From: root Date: Wed, 10 Jul 2024 05:31:46 +0000 Subject: [PATCH 198/198] fixed filename problem from renaming weight file --- inference/models/llama.cc | 4 ++-- src/runtime/file_loader.cc | 23 +++++++++++++---------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/inference/models/llama.cc b/inference/models/llama.cc index e4b2e5a537..d3319a8a5d 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -103,7 +103,7 @@ void LLAMA::create_llama_model(FFModel &ff, nullptr, // ? 
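// Aside: in the o_proj branch of load_attention_weights_to_dense_v2 above, the
// flat wo weight is split into blocks of one_partition_size elements and dealt
// out round-robin across tensor-parallel shards; each shard keeps its blocks
// contiguous and consecutive shards sit stride_size elements apart. A sketch of
// the destination index as a pure function (same arithmetic as the loop body;
// the function name is illustrative only):
#include <cstddef>

inline size_t o_proj_dst_index(size_t i,
                               size_t one_partition_size,
                               int tensor_parallelism_degree,
                               size_t stride_size) {
  size_t block_num = i / one_partition_size;
  size_t part_idx = block_num % tensor_parallelism_degree; // which TP shard
  size_t offset = (block_num / tensor_parallelism_degree) * one_partition_size +
                  i % one_partition_size;                  // position within the shard
  return part_idx * stride_size + offset;
}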
REG_MODE_NONE, // no regularization 0.0f, // no dropout - std::string("layers_" + std::to_string(i) + "_attn_qkv_proj") + std::string("layers." + std::to_string(i) + ".attn_qkv_proj") .c_str() ); @@ -194,7 +194,7 @@ void LLAMA::create_llama_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_attn_o_proj") + std::string("layers." + std::to_string(i) + ".attn_o_proj") .c_str()); // step 2: SILU activaion diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index a66a6097ea..4ed31eb4dd 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -135,7 +135,7 @@ void load_attention_o_proj_bias_to_dense_v2(DT *ptr, size_t qkv_inner_dim, std::string layer_name, std::string weights_folder) { - std::string filename = layer_name + "_wo_bias"; + std::string filename = layer_name + ".o_proj.bias"; int file_index = 0; @@ -273,10 +273,10 @@ void load_attention_weights_to_dense_v2(DT *ptr, bool load_o_proj) { // layers_0_attention_wq_weight // layers_0_self_attn_q_proj_weight - std::string q_file = layer_name + "_wq_weight"; - std::string k_file = layer_name + "_wk_weight"; - std::string v_file = layer_name + "_wv_weight"; - std::string o_file = layer_name + "_wo_weight"; + std::string q_file = layer_name + ".q_proj.weight"; + std::string k_file = layer_name + ".k_proj.weight"; + std::string v_file = layer_name + ".v_proj.weight"; + std::string o_file = layer_name + ".o_proj.weight"; std::vector weight_filenames = {q_file, k_file, v_file}; int file_index = 0; @@ -909,15 +909,18 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, std::string weight_filename = removeGuidOperatorName(std::string(l->name)); bool is_attn_proj = false, is_o_proj = false; - if (weight_filename.find("_proj") != std::string::npos) { - size_t pos = weight_filename.find("_attn_o_proj"); + if (weight_filename.find("attn_") != std::string::npos) { + size_t pos = weight_filename.find(".attn_o_proj"); if (pos != std::string::npos) { - weight_filename.replace(pos, std::string("_attn_o_proj").length(), "_attention"); + weight_filename.replace(pos, std::string(".attn_o_proj").length(), ".self_attn"); is_o_proj = true; } else { - pos = weight_filename.find("_attn_qkv_proj"); + pos = weight_filename.find(".attn_qkv_proj"); + if(pos == std::string::npos) { + cout<