From 5bd71236c76ac497466602550b1bc9de884fd1b3 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 30 Mar 2024 14:09:26 -0400 Subject: [PATCH 01/44] run CI per commit only on inference branch --- .github/workflows/gpu-ci.yml | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 48dcda157e..7bdb6805a8 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -1,25 +1,8 @@ name: "gpu-ci" on: - pull_request: - paths: - - "cmake/**" - - "config/**" - - "deps/**" - - "python/**" - - "setup.py" - - "include/**" - - "inference/**" - - "src/**" - - "tests/inference/**" - - "conda/flexflow.yml" - - ".github/workflows/gpu-ci.yml" - - "tests/cpp_gpu_tests.sh" - - "tests/inference_tests.sh" - - "tests/training_tests.sh" - - "tests/python_interface_test.sh" push: branches: - - "master" + - "inference" paths: - "cmake/**" - "config/**" @@ -194,7 +177,7 @@ jobs: - name: Save inference output as an artifact if: always() - run: | + run: | cd inference tar -zcvf output.tar.gz ./output From e0a6e4fee228ca31a74e69dd84d73e01762214a1 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 30 Mar 2024 14:29:47 -0400 Subject: [PATCH 02/44] fix --- python/flexflow/serve/serve.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 14555bfc12..cbc4122897 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -375,7 +375,7 @@ def compile( self.rm.set_max_spec_tree_token_num( self.model_configs.max_spec_tree_token_num if "max_spec_tree_token_num" - in self.model_configs.max_spec_tree_token_num.__dict__ + in self.model_configs.__dict__ else 20 ) From 1210256080072935fecd71dbf7cbfb31d9f99efa Mon Sep 17 00:00:00 2001 From: Zhuofu Chen <59316330+aetiurf@users.noreply.github.com> Date: Sat, 6 Apr 2024 22:02:15 +0800 Subject: [PATCH 03/44] fix: 'model_configs' AttributeError (#1358) --- python/flexflow/serve/serve.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index cbc4122897..ac622b3337 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -373,9 +373,9 @@ def compile( model_configs = self.config_class(self.hf_config) self.rm.set_max_spec_tree_token_num( - self.model_configs.max_spec_tree_token_num + model_configs.max_spec_tree_token_num if "max_spec_tree_token_num" - in self.model_configs.__dict__ + in model_configs.__dict__ else 20 ) From b4a639c8990f2d031ee4938f3e7dc8140e4eb324 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 7 Apr 2024 23:26:53 -0400 Subject: [PATCH 04/44] Changes to support Perlmutter environment (#1360) * . 
* remove deadcode * add benchmarking mode, initializing weights randomly * better logging when running out of memory * update --------- Co-authored-by: Gabriele Oliaro --- cmake/cuda.cmake | 15 ++- config/config.inc | 12 +- config/config.linux | 14 ++- include/flexflow/config.h | 2 +- inference/incr_decoding/incr_decoding.cc | 4 +- inference/models/falcon.cc | 20 ---- inference/models/llama.cc | 10 -- inference/models/mpt.cc | 15 --- inference/models/opt.cc | 18 --- inference/models/starcoder.cc | 10 -- inference/python/incr_decoding.py | 3 +- inference/python/spec_infer.py | 3 +- inference/spec_infer/spec_infer.cc | 4 +- inference/utils/download_hf_model.py | 4 +- python/flexflow/core/__init__.py | 1 + python/flexflow/serve/__init__.py | 8 ++ src/mapper/mapper.cc | 46 ++++++-- src/runtime/file_loader.cc | 109 ++++++++++-------- src/runtime/model.cc | 8 +- .../python_test_configs/generate_configs.py | 3 +- 20 files changed, 159 insertions(+), 150 deletions(-) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 68e4ca07b1..45ecc1798b 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -13,8 +13,19 @@ if(CUDA_FOUND) # set cuda runtime and driver lib # override cublas and curand because the FindCUDA module may not find the correct libs set(CUDADRV_LIBRARIES ${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/libcuda${LIBEXT}) - set(CUDA_CUBLAS_LIBRARIES ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcublas${LIBEXT}) - set(CUDA_curand_LIBRARY ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcurand${LIBEXT}) + if(CUBLAS_PATH) + set(CUBLAS_ROOT ${CUBLAS_PATH}) + else() + set(CUBLAS_ROOT ${CUDA_TOOLKIT_ROOT_DIR}) + endif() + set(CUDA_CUBLAS_LIBRARIES ${CUBLAS_ROOT}/lib64/libcublas${LIBEXT}) + if(CURAND_PATH) + set(CURAND_ROOT ${CURAND_PATH}) + else() + set(CURAND_ROOT ${CUDA_TOOLKIT_ROOT_DIR}) + endif() + set(CUDA_curand_LIBRARY ${CURAND_ROOT}/lib64/libcurand${LIBEXT}) + list(APPEND FLEXFLOW_EXT_LIBRARIES ${CUDADRV_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} diff --git a/config/config.inc b/config/config.inc index 1121c114c4..7d7b2db9cf 100644 --- a/config/config.inc +++ b/config/config.inc @@ -62,6 +62,16 @@ if [ -n "$CUDA_DIR" ]; then SET_CUDA_LIB_PATH="CUDA_PATH=${CUDA_PATH}" fi +# set cublas dir +if [ -n "$CUBLAS_DIR" ]; then + SET_CUBLAS="-DCUBLAS_PATH=${CUBLAS_DIR}" +fi + +# set curand dir +if [ -n "$CURAND_DIR" ]; then + SET_CURAND="-DCURAND_PATH=${CURAND_DIR}" +fi + # set cudnn dir if [ -n "$CUDNN_DIR" ]; then SET_CUDNN="-DCUDNN_PATH=${CUDNN_DIR}" @@ -231,7 +241,7 @@ if [ -n "$FF_GPU_BACKEND" ]; then fi fi -CMAKE_FLAGS="-DCUDA_USE_STATIC_CUDA_RUNTIME=OFF -DLegion_HIJACK_CUDART=OFF ${SET_CC} ${SET_CXX} ${SET_INSTALL_DIR} ${SET_INFERENCE_TESTS} ${SET_LIBTORCH_PATH} ${SET_BUILD} ${SET_CUDA_ARCH} ${SET_CUDA} ${SET_CUDNN} ${SET_HIP_ARCH} ${SET_PYTHON} ${SET_BUILD_LEGION_ONLY} ${SET_NCCL} ${SET_NCCL_DIR} ${SET_LEGION_NETWORKS} ${SET_UCX} ${SET_EXAMPLES} ${SET_INFERENCE_EXAMPLES} ${SET_USE_PREBUILT_LEGION} ${SET_USE_PREBUILT_NCCL} ${SET_USE_ALL_PREBUILT_LIBRARIES} ${SET_BUILD_UNIT_TESTS} ${SET_AVX2} ${SET_MAX_DIM} ${SET_LEGION_MAX_RETURN_SIZE} ${SET_ROCM_PATH} ${SET_FF_GPU_BACKEND}" +CMAKE_FLAGS="-DCUDA_USE_STATIC_CUDA_RUNTIME=OFF -DLegion_HIJACK_CUDART=OFF ${SET_CC} ${SET_CXX} ${SET_INSTALL_DIR} ${SET_INFERENCE_TESTS} ${SET_LIBTORCH_PATH} ${SET_BUILD} ${SET_CUDA_ARCH} ${SET_CUDA} ${SET_CUBLAS} ${SET_CURAND} ${SET_CUDNN} ${SET_HIP_ARCH} ${SET_PYTHON} ${SET_BUILD_LEGION_ONLY} ${SET_NCCL} ${SET_NCCL_DIR} ${SET_LEGION_NETWORKS} ${SET_UCX} ${SET_EXAMPLES} ${SET_INFERENCE_EXAMPLES} ${SET_USE_PREBUILT_LEGION} ${SET_USE_PREBUILT_NCCL} 
${SET_USE_ALL_PREBUILT_LIBRARIES} ${SET_BUILD_UNIT_TESTS} ${SET_AVX2} ${SET_MAX_DIM} ${SET_LEGION_MAX_RETURN_SIZE} ${SET_ROCM_PATH} ${SET_FF_GPU_BACKEND}" function run_cmake() { SRC_LOCATION=${SRC_LOCATION:=`dirname $0`/../} diff --git a/config/config.linux b/config/config.linux index 30edfa7dfe..acffc210f5 100755 --- a/config/config.linux +++ b/config/config.linux @@ -36,12 +36,18 @@ FF_CUDA_ARCH=${FF_CUDA_ARCH:-"autodetect"} # or all available architectures. TODO: support autodetect FF_HIP_ARCH=${FF_HIP_ARCH:-"all"} -# set CUDNN dir in case cmake cannot autodetect a path -CUDNN_DIR=${CUDNN_DIR:-"/usr/local/cuda"} - # set CUDA dir in case cmake cannot autodetect a path CUDA_DIR=${CUDA_DIR:-"/usr/local/cuda"} +# set CUBLAS dir in case it is not stored in the CUDA DIR +CUBLAS_DIR=${CUBLAS_DIR:-"/usr/local/cuda"} + +# set CURAND dir in case it is not stored in the CUDA DIR +CURAND_DIR=${CURAND_DIR:-"/usr/local/cuda"} + +# set CUDNN dir in case cmake cannot autodetect a path +CUDNN_DIR=${CUDNN_DIR:-"/usr/local/cuda"} + # if not use PREBUILD_NCCL, you can set NCCL_DIR to use external nccl lib, # otherwise, we will build nccl from source NCCL_DIR=${NCCL_DIR:-"/usr/local/cuda"} @@ -102,7 +108,7 @@ fi function get_build_configs() { # Create a string with the values of the variables set in this script - BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDNN_DIR=${CUDNN_DIR} CUDA_DIR=${CUDA_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} UCX_DIR=${UCX_DIR} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND} INSTALL_DIR=${INSTALL_DIR}" + BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDA_DIR=${CUDA_DIR} CUDNN_DIR=${CUDNN_DIR} CUBLAS_DIR=${CUBLAS_DIR} CURAND_DIR=${CURAND_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} UCX_DIR=${UCX_DIR} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND} INSTALL_DIR=${INSTALL_DIR}" } if [[ -n "$1" && ( "$1" == "CMAKE_FLAGS" || "$1" == "CUDA_PATH" ) ]]; then diff --git a/include/flexflow/config.h b/include/flexflow/config.h index 17a3f59e29..2c11ae1131 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -145,7 +145,7 @@ class FFConfig { Legion::Runtime *lg_hlr; Legion::IndexSpaceT<1> all_gpu_task_is; // Legion::FieldSpace field_space; - bool syntheticInput, profiling, perform_fusion; + bool benchmarking, profiling, perform_fusion; bool inference_debugging; size_t simulator_work_space_size; size_t search_budget; diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 
f88af3bc43..aae7256ffe 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -107,7 +107,9 @@ void parse_input_args(char **argv, } } if (paths.cache_folder_path.empty()) { - paths.cache_folder_path = "~/.cache/flexflow"; + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + paths.cache_folder_path = ff_cache_path ? std::string(ff_cache_path) + : std::string("~/.cache/flexflow"); } // Expand ~ to the home directory if needed wordexp_t p; diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index e00f4e9cfd..a529411ddb 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -252,26 +252,6 @@ void FALCON::create_falcon_model(FFModel &ff, InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); - -#ifdef DEADCODE - // Compile the model - std::cout << "------start compile ----------" << std::endl; - InferenceManager *im = InferenceManager::get_inference_manager(); - im->compile_model_and_allocate_buffer(&ff); - FileDataLoader fileloader("", - weight_file_path, - falcon_config.n_head, - falcon_config.n_head_kv, - falcon_config.hidden_size, - falcon_config.hidden_size / falcon_config.n_head, - ff.config.tensor_parallelism_degree); - std::cout << "------load weights ----------" << std::endl; - fileloader.load_weights(&ff, use_full_precision); - std::cout << "------load weight finished----------" << std::endl; - - // init operators - im->init_operators_inference(&ff); -#endif } }; // namespace FlexFlow diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 14b8c31fa1..517f534438 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -277,16 +277,6 @@ void LLAMA::create_llama_model(FFModel &ff, InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); -#ifdef DEADCODE - // Compile the model - std::cout << "------start compile ----------" << std::endl; - im->compile_model_and_allocate_buffer(&ff); - fileloader.load_weights(&ff); - std::cout << "------load weight finished----------" << std::endl; - - // init operators - im->init_operators_inference(&ff); -#endif } }; // namespace FlexFlow diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index 7e8fc8358f..70e2b5e9c5 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -259,21 +259,6 @@ void MPT::create_mpt_model(FFModel &ff, InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); - -#ifdef DEADCODE - //------------------- compile the model -------------------------------- - InferenceManager *im = InferenceManager::get_inference_manager(); - im->compile_model_and_allocate_buffer(&ff); - FileDataLoader fileloader("", - weight_file_path, - mpt_config.n_heads, - mpt_config.n_heads, - mpt_config.hidden_size, - mpt_config.hidden_size / mpt_config.n_heads, - ff.config.tensor_parallelism_degree); - fileloader.load_weights(&ff, use_full_precision); - im->init_operators_inference(&ff); -#endif } }; // namespace FlexFlow diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 3ff4c96fdf..5677d5658e 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -266,24 +266,6 @@ void OPT::create_opt_model(FFModel &ff, use_full_precision); InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); - -#ifdef DEADCODE - //------------------- compile 
the model -------------------------------- - std::cout << "------start compile ----------" << std::endl; - InferenceManager *im = InferenceManager::get_inference_manager(); - im->compile_model_and_allocate_buffer(&ff); - FileDataLoader fileloader("", - weight_file_path, - opt_config.num_attention_heads, - opt_config.num_attention_heads, - opt_config.hidden_size, - opt_config.hidden_size / - opt_config.num_attention_heads, - ff.config.tensor_parallelism_degree); - fileloader.load_weights(&ff, use_full_precision); - std::cout << "------finished loading weights----------" << std::endl; - im->init_operators_inference(&ff); -#endif } }; // namespace FlexFlow diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index 2327c86119..8b0dc1098c 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -232,16 +232,6 @@ void STARCODER::create_starcoder_model( ff.config.tensor_parallelism_degree, use_full_precision); im->register_model_weights_loader(&ff, fileloader); -#ifdef DEADCODE - // Compile the model - std::cout << "------start compile ----------" << std::endl; - im->compile_model_and_allocate_buffer(&ff); - fileloader.load_weights(&ff, use_full_precision); - std::cout << "------load weight finished----------" << std::endl; - - // init operators - im->init_operators_inference(&ff); -#endif } }; // namespace FlexFlow diff --git a/inference/python/incr_decoding.py b/inference/python/incr_decoding.py index f7707816c8..05599ea6b9 100644 --- a/inference/python/incr_decoding.py +++ b/inference/python/incr_decoding.py @@ -55,6 +55,7 @@ def get_configs(): "use_4bit_quantization": False, "use_8bit_quantization": False, "profiling": False, + "benchmarking": False, "inference_debugging": False, "fusion": True, } @@ -62,7 +63,7 @@ def get_configs(): # required parameters "llm_model": "tiiuae/falcon-7b", # optional parameters - "cache_path": "", + "cache_path": os.environ.get("FF_CACHE_PATH", ""), "refresh_cache": False, "full_precision": False, "prompt": "", diff --git a/inference/python/spec_infer.py b/inference/python/spec_infer.py index fcb1b8f891..a6dfa8042e 100644 --- a/inference/python/spec_infer.py +++ b/inference/python/spec_infer.py @@ -55,6 +55,7 @@ def get_configs(): "use_4bit_quantization": False, "use_8bit_quantization": False, "profiling": False, + "benchmarking": False, "inference_debugging": False, "fusion": True, } @@ -62,7 +63,7 @@ def get_configs(): # required llm arguments "llm_model": "meta-llama/Llama-2-7b-hf", # optional llm parameters - "cache_path": "", + "cache_path": os.environ.get("FF_CACHE_PATH", ""), "refresh_cache": False, "full_precision": False, "ssms": [ diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index b6c1e408cd..f7edfd7696 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -124,7 +124,9 @@ void parse_input_args(char **argv, } } if (paths.cache_folder_path.empty()) { - paths.cache_folder_path = "~/.cache/flexflow"; + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + paths.cache_folder_path = ff_cache_path ? 
std::string(ff_cache_path) + : std::string("~/.cache/flexflow"); } // Expand ~ to the home directory if needed wordexp_t p; diff --git a/inference/utils/download_hf_model.py b/inference/utils/download_hf_model.py index 94a8c23e68..7b4f4d6fb0 100644 --- a/inference/utils/download_hf_model.py +++ b/inference/utils/download_hf_model.py @@ -1,6 +1,6 @@ #!/usr/bin/env python import flexflow.serve as ff -import argparse +import argparse, os def parse_args(): @@ -12,7 +12,7 @@ def parse_args(): "--cache-folder", type=str, help="Folder to use to store the model(s) assets in FlexFlow format", - default="", + default=os.environ.get("FF_CACHE_PATH", ""), ) parser.add_argument( "--refresh-cache", diff --git a/python/flexflow/core/__init__.py b/python/flexflow/core/__init__.py index d7b1a595d2..2820cf485a 100644 --- a/python/flexflow/core/__init__.py +++ b/python/flexflow/core/__init__.py @@ -41,6 +41,7 @@ "num_cpus": "-ll:cpu", "legion_utility_processors": "-ll:util", "profiling": "--profiling", + "benchmarking": "--benchmarking", "inference_debugging": "--inference-debugging", "fusion": "--fusion", "disable_control_replication": "--disable-control-replication", diff --git a/python/flexflow/serve/__init__.py b/python/flexflow/serve/__init__.py index cf467280bd..5af077273d 100644 --- a/python/flexflow/serve/__init__.py +++ b/python/flexflow/serve/__init__.py @@ -45,6 +45,7 @@ def init( use_4bit_quantization: Optional[bool] = None, use_8bit_quantization: Optional[bool] = None, profiling: Optional[bool] = None, + benchmarking: Optional[bool] = None, inference_debugging: Optional[bool] = None, fusion: Optional[bool] = None, ): @@ -72,6 +73,7 @@ def init( - use_4bit_quantization: whether to use 4-bit quantization, defaults to False - use_8bit_quantization: whether to use 8-bit quantization, defaults to False - profiling: whether to enable the FlexFlow profiling mode, defaults to False + - benchmarking: whether to run benchmaking only, without loading real weights, defaults to False - inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False - fusion: whether to enable the FlexFlow operator fusion optimization, defaults to True @@ -106,6 +108,8 @@ def init( :type use_8bit_quantization: Optional[bool], optional :param profiling: whether to enable the FlexFlow profiling mode, defaults to False :type profiling: Optional[bool], optional + :param benchmarking: whether to run benchmaking only, without loading real weights, defaults to False + :type benchmarking: Optional[bool], optional :param inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False :type inference_debugging: Optional[bool], optional :param fusion: whether to enable the FlexFlow operator fusion optimization, defaults to True @@ -132,6 +136,7 @@ def init( use_4bit_quantization is not None, use_8bit_quantization is not None, profiling is not None, + benchmarking is not None, inference_debugging is not None, fusion is not None, ] @@ -157,6 +162,7 @@ def init( "use_4bit_quantization": use_4bit_quantization, "use_8bit_quantization": use_8bit_quantization, "profiling": profiling, + "benchmarking": benchmarking, "inference_debugging": inference_debugging, "fusion": fusion, } @@ -201,6 +207,8 @@ def init( configs_dict["use_8bit_quantization"] = False if configs_dict.get("profiling", None) is None: configs_dict["profiling"] = False + if configs_dict.get("benchmarking", None) is None: + configs_dict["benchmarking"] = 
False if configs_dict.get("inference_debugging", None) is None: configs_dict["inference_debugging"] = False if configs_dict.get("fusion", None) is None: diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc index d7aac4e37c..c293aecb19 100644 --- a/src/mapper/mapper.cc +++ b/src/mapper/mapper.cc @@ -487,6 +487,25 @@ void FFMapper::premap_task(const MapperContext ctx, assert(false); } +std::string humanReadableSize(size_t size, bool mb = false) { + assert(size >= 0); + char const *units[] = {"B", "KiB", "MiB", "GiB", "TiB"}; + int i = 0; + double finalSize = size; + if (mb) { + finalSize /= 1024 * 1024; + i = 2; + } else { + while (finalSize >= 1024 && i < 4) { + finalSize /= 1024; + i++; + } + } + char buffer[256]; + snprintf(buffer, sizeof(buffer), "%.2lf %s", finalSize, units[i]); + return std::string(buffer); +} + void FFMapper::map_task(const MapperContext ctx, Task const &task, MapTaskInput const &input, @@ -637,16 +656,19 @@ void FFMapper::map_task(const MapperContext ctx, } // Report failed to creation log_ff_mapper.error( - "FlexFlow failed allocation of size %zd bytes for " - "region requirement %d of task %s (UID %lld) in memory " IDFMT - " with kind %d for processor " IDFMT ".", - footprint, + "Out of memory! FlexFlow failed to reserve block of size %s" + " for region requirement %d of task %s (UID %lld) in %s memory (id: " + "%llx)" + " for processor id: %llx." + " Total pre-allocated memory capacity of this kind: %s.", + humanReadableSize(footprint).c_str(), idx, task.get_task_name(), task.get_unique_id(), + Legion::Mapping::Utilities::to_string(target_mem.kind()), target_mem.id, - target_mem.kind(), - task.target_proc.id); + task.target_proc.id, + humanReadableSize(target_mem.capacity(), true).c_str()); assert(false); } else { output.chosen_instances[idx].push_back(result); @@ -929,15 +951,17 @@ void FFMapper::map_inline(const MapperContext ctx, created, &footprint)) { log_ff_mapper.error( - "FlexFlow Mapper failed allocation of size %zd bytes" + "Out of memory! FlexFlow failed to reserve block of size %s" " for region requirement of inline mapping in task %s (UID %lld)" - " in memory " IDFMT "for processor " IDFMT ".", - footprint, + " in %s memory (id: %llx) for processor id: %llx." 
+ " Total pre-allocated memory capacity of this kind: %s.", + humanReadableSize(footprint).c_str(), inline_op.parent_task->get_task_name(), inline_op.parent_task->get_unique_id(), + Legion::Mapping::Utilities::to_string(target_memory.kind()), target_memory.id, - inline_op.parent_task->current_proc.id); - printf("target_memory.kind() = %d\n", target_memory.kind()); + inline_op.parent_task->current_proc.id, + humanReadableSize(target_memory.capacity(), true).c_str()); assert(false); } else { output.chosen_instances.push_back(result); diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index 56558b3185..43ce9d7005 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -725,60 +725,69 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, std::string weight_filename = removeGuidOperatorName(std::string(l->name)); - if (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || - l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION || - l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION) { - if (weight_filename.find("self_attention") != std::string::npos) { - load_attention_weights_multi_query( - data, weight_filename, weights_folder, hidden_dim, num_heads); - } else if (weight_filename.find("attention") != std::string::npos && - weight_filename.rfind("attention") == - weight_filename.length() - strlen("attention")) { - if (weight_idx == 0) { - load_attention_weights_v2(data, - num_heads, - num_kv_heads, - hidden_dim, - qkv_inner_dim, - weight_filename, - weights_folder, - volume, - tensor_parallelism_degree); + if (ff->config.benchmarking) { + std::cout << "Initializing weight " << weight_filename + << " with random data (benchmarking mode)" << std::endl; + // If benchmarking, we don't need to load the weights + // We can just fill the weight tensor with random data + } else { + if (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || + l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION || + l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION) { + if (weight_filename.find("self_attention") != std::string::npos) { + load_attention_weights_multi_query( + data, weight_filename, weights_folder, hidden_dim, num_heads); + } else if (weight_filename.find("attention") != std::string::npos && + weight_filename.rfind("attention") == + weight_filename.length() - strlen("attention")) { + if (weight_idx == 0) { + load_attention_weights_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + weight_filename, + weights_folder, + volume, + tensor_parallelism_degree); + } else { + long long value; + l->get_int_property("final_bias", value); + bool final_bias = (bool)value; + load_attention_bias_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + final_bias, + weight_filename, + weights_folder); + } + } else { - long long value; - l->get_int_property("final_bias", value); - bool final_bias = (bool)value; - load_attention_bias_v2(data, - num_heads, - num_kv_heads, - hidden_dim, - qkv_inner_dim, - final_bias, - weight_filename, - weights_folder); + assert(false); } - + } else if (l->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM) { + assert(weight_idx >= 0 || weight_idx <= 2); + weight_filename += (weight_idx == 0) + ? "_attn_bias" + : ((weight_idx == 1) ? 
"_weight" : "_bias"); + std::cout << "Loading weight file " << weight_filename << std::endl; + std::string weight_filepath = + join_path({weights_folder, weight_filename}); + load_from_file(data, volume, weight_filepath); } else { - assert(false); - } - } else if (l->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM) { - assert(weight_idx >= 0 || weight_idx <= 2); - weight_filename += (weight_idx == 0) - ? "_attn_bias" - : ((weight_idx == 1) ? "_weight" : "_bias"); - std::cout << "Loading weight file " << weight_filename << std::endl; - std::string weight_filepath = join_path({weights_folder, weight_filename}); - load_from_file(data, volume, weight_filepath); - } else { - // default op - assert(weight_idx == 0 || weight_idx == 1); - // handle exception - if (weight_filename != "embed_tokens_weight_lm_head") { - weight_filename += weight_idx == 0 ? "_weight" : "_bias"; + // default op + assert(weight_idx == 0 || weight_idx == 1); + // handle exception + if (weight_filename != "embed_tokens_weight_lm_head") { + weight_filename += weight_idx == 0 ? "_weight" : "_bias"; + } + std::cout << "Loading weight file " << weight_filename << std::endl; + std::string weight_filepath = + join_path({weights_folder, weight_filename}); + load_from_file(data, volume, weight_filepath); } - std::cout << "Loading weight file " << weight_filename << std::endl; - std::string weight_filepath = join_path({weights_folder, weight_filename}); - load_from_file(data, volume, weight_filepath); } // Copy the weight data from the buffer to the weight's ParallelTensor diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 40f758282c..1fa281777a 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -4065,6 +4065,7 @@ struct DefaultConfig { // const static int iterations = 1; const static int batchSize = 64; const static bool profiling = false; + const static bool benchmarking = false; const static bool inference_debugging = false; constexpr static float learningRate = 0.01f; constexpr static float weightDecay = 0.0001f; @@ -4100,6 +4101,7 @@ FFConfig::FFConfig() { // iterations = DefaultConfig::iterations; batchSize = DefaultConfig::batchSize; profiling = DefaultConfig::profiling; + benchmarking = DefaultConfig::benchmarking; inference_debugging = DefaultConfig::inference_debugging; learningRate = DefaultConfig::learningRate; weightDecay = DefaultConfig::weightDecay; @@ -4137,7 +4139,7 @@ FFConfig::FFConfig() { export_strategy_computation_graph_file = ""; dataset_path = ""; substitution_json_path = tl::nullopt; - syntheticInput = false; + benchmarking = false; perform_fusion = false; base_optimize_threshold = DefaultConfig::base_optimize_threshold; perform_memory_search = false; @@ -4290,6 +4292,10 @@ void FFConfig::parse_args(char **argv, int argc) { profiling = true; continue; } + if (!strcmp(argv[i], "--benchmarking")) { + benchmarking = true; + continue; + } if (!strcmp(argv[i], "--inference-debugging")) { inference_debugging = true; continue; diff --git a/tests/inference/python_test_configs/generate_configs.py b/tests/inference/python_test_configs/generate_configs.py index ebaadade32..41703cf431 100644 --- a/tests/inference/python_test_configs/generate_configs.py +++ b/tests/inference/python_test_configs/generate_configs.py @@ -18,6 +18,7 @@ "use_4bit_quantization": False, "use_8bit_quantization": False, "profiling": False, + "benchmarking": False, "inference_debugging": False, "fusion": True, } @@ -25,7 +26,7 @@ # required parameters "llm_model": "tiiuae/falcon-7b", # optional parameters - "cache_path": "", + 
"cache_path": os.environ.get("FF_CACHE_PATH", ""), "refresh_cache": False, "full_precision": True, "prompt": "", From 7da197e71e31a1840d9404a63d5a9fdd20d4d41e Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 23 Apr 2024 20:26:33 -0400 Subject: [PATCH 05/44] update workflow to build rocm docker images --- .github/workflows/docker-build.yml | 58 +++++++++++++----------------- 1 file changed, 25 insertions(+), 33 deletions(-) diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 54805cc325..d16179434b 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -20,26 +20,22 @@ concurrency: cancel-in-progress: true jobs: - oracle-runner-start: - name: Start an Oracle instance to build the ROCM Docker images + rocm-builder-start: + name: Start an AWS instance to build the ROCM Docker images runs-on: ubuntu-latest if: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} env: - OCI_CLI_USER: ${{ secrets.OCI_CLI_USER }} - OCI_CLI_TENANCY: ${{ secrets.OCI_CLI_TENANCY }} - OCI_CLI_FINGERPRINT: ${{ secrets.OCI_CLI_FINGERPRINT }} - OCI_CLI_KEY_CONTENT: ${{ secrets.OCI_CLI_KEY_CONTENT }} - OCI_CLI_REGION: ${{ secrets.OCI_CLI_REGION }} - OCI_INSTANCE_ID: ${{ secrets.OCI_INSTANCE_ID }} + ROCM_BUILDER_INSTANCE_ID: ${{ secrets.ROCM_BUILDER_INSTANCE_ID }} steps: - - name: Checkout Git Repository - uses: actions/checkout@v3 + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: us-east-2 - - name: Install Oracle Cloud Infrastructure library - run: pip install oci - - - name: Start Oracle Machine - run: python3 .github/workflows/helpers/oracle_con.py --start --instance_id $OCI_INSTANCE_ID + - name: Start EC2 instance + run: aws ec2 start-instances --instance-ids $ROCM_BUILDER_INSTANCE_ID docker-build-rocm: name: Build and Install FlexFlow in a Docker Container (ROCm backend) @@ -66,8 +62,8 @@ jobs: docker-build-and-publish-rocm: name: Build and Deploy FlexFlow Docker Containers (ROCm backend) - needs: oracle-runner-start - runs-on: [self-hosted, cpu_only] + needs: rocm-builder-start + runs-on: [self-hosted, rocm_builder] if: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} strategy: matrix: @@ -148,27 +144,23 @@ jobs: ./docker/publish.sh flexflow-environment ./docker/publish.sh flexflow - oracle-runner-stop: + rocm-builder-stop: needs: docker-build-and-publish-rocm if: ${{ always() && ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} runs-on: ubuntu-latest - name: Stop the Oracle instance we used to build the ROCM Docker images + name: Stop the AWS instance we used to build the ROCM Docker images env: - OCI_CLI_USER: ${{ secrets.OCI_CLI_USER }} - OCI_CLI_TENANCY: ${{ secrets.OCI_CLI_TENANCY }} - OCI_CLI_FINGERPRINT: ${{ secrets.OCI_CLI_FINGERPRINT }} - OCI_CLI_KEY_CONTENT: ${{ secrets.OCI_CLI_KEY_CONTENT }} - OCI_CLI_REGION: ${{ secrets.OCI_CLI_REGION }} - OCI_INSTANCE_ID: ${{ secrets.OCI_INSTANCE_ID }} + ROCM_BUILDER_INSTANCE_ID: ${{ secrets.ROCM_BUILDER_INSTANCE_ID }} steps: - - name: Checkout Git Repository - uses: actions/checkout@v3 - - - name: Install Oracle Cloud 
Infrastructure library - run: pip install oci - - - name: Stop Oracle Machine - run: python3 .github/workflows/helpers/oracle_con.py --stop --instance_id $OCI_INSTANCE_ID + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: us-east-2 + + - name: Start EC2 instance + run: aws ec2 stop-instances --instance-ids $ROCM_BUILDER_INSTANCE_ID notify-slack: name: Notify Slack in case of failure From 002fdf017c7dd665b703da37494093161c3d55c7 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 23 Apr 2024 22:35:42 -0400 Subject: [PATCH 06/44] downgrade to python 3.11 for now --- docker/flexflow-environment/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index edbf9a7e52..6ca337f58d 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -17,7 +17,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends wget sudo binut # Install Python3 with Miniconda ARG python_version "latest" -RUN MINICONDA_SCRIPT_NAME=Miniconda3-latest-Linux-x86_64.sh; \ +#RUN MINICONDA_SCRIPT_NAME=Miniconda3-latest-Linux-x86_64.sh; \ +RUN MINICONDA_SCRIPT_NAME=Miniconda3-py311_23.5.2-0-Linux-x86_64.sh; \ if [ "$python_version" != "3.8" ] && [ "$python_version" != "3.9" ] && [ "$python_version" != "3.10" ] && [ "$python_version" != "3.11" ] && [ "$python_version" != "latest" ]; then \ echo "python_version '${python_version}' is not supported, please choose among {3.8, 3.9, 3.10, 3.11 or latest (default)}"; \ exit 1; \ From d54e4b6a747f3940a19989a56095a71540e4c0d8 Mon Sep 17 00:00:00 2001 From: Zhuofu Chen <59316330+chenzhuofu@users.noreply.github.com> Date: Wed, 1 May 2024 01:51:57 +0800 Subject: [PATCH 07/44] doc: fix c++ serving example (#1372) Co-authored-by: Gabriele Oliaro --- .github/README.md | 2 +- SERVE.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/README.md b/.github/README.md index 4a2a881c8d..c4f6baada6 100644 --- a/.github/README.md +++ b/.github/README.md @@ -178,7 +178,7 @@ A C++ example is available at [this folder](../inference/spec_infer/). After bui For example, you can use the following command line to serve a LLaMA-7B or LLaMA-13B model on 4 GPUs and use two collectively boost-tuned LLaMA-68M models for speculative inference. ```bash -./inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -tensor-parallelism-degree 4 --fusion +./inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -tensor-parallelism-degree 4 --fusion ``` diff --git a/SERVE.md b/SERVE.md index e9bab3d702..9472d50a62 100644 --- a/SERVE.md +++ b/SERVE.md @@ -126,7 +126,7 @@ A C++ example is available at [this folder](../inference/spec_infer/). After bui For example, you can use the following command line to serve a LLaMA-7B or LLaMA-13B model on 4 GPUs and use two collectively boost-tuned LLaMA-68M models for speculative inference. 
```bash -./inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -tensor-parallelism-degree 4 --fusion +./inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -tensor-parallelism-degree 4 --fusion ``` From b90771a376fddbddf09af3f23e4ecae57911438e Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Thu, 30 May 2024 14:24:42 -0700 Subject: [PATCH 08/44] Update README.md --- .github/README.md | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.github/README.md b/.github/README.md index c4f6baada6..5aba2295d5 100644 --- a/.github/README.md +++ b/.github/README.md @@ -4,12 +4,6 @@ --- -## News🔥: - -* [09/02/2023] Adding AMD GPU support, released Docker images for ROCM 5.3->5.6 -* [08/16/2023] Adding Starcoder model support -* [08/14/2023] Released Docker images for different CUDA versions - ## What is FlexFlow Serve The high computational and memory requirements of generative large language From 385c118447a8b1451de3641c8ecf437245b9248b Mon Sep 17 00:00:00 2001 From: FelixBrakel Date: Thu, 30 May 2024 23:39:10 +0200 Subject: [PATCH 09/44] Add examples for every layer in the python layer API (#1297) * Fix incorrect innode being checked * Add example for every layer on the FFModel python class --------- Co-authored-by: Gabriele Oliaro Co-authored-by: Zhihao Jia --- docs/source/python/layers.rst | 2 +- examples/python/native/ops/add.py | 45 ++++++++ .../ops/add_bias_residual_layer_norm.py | 78 +++++++++++++ examples/python/native/ops/arg_top_k.py | 61 ++++++++++ examples/python/native/ops/argmax.py | 55 +++++++++ examples/python/native/ops/batch_matmul.py | 0 examples/python/native/ops/batch_norm.py | 36 ++++++ examples/python/native/ops/beam_top_k.py | 58 ++++++++++ examples/python/native/ops/concat.py | 43 +++++++ examples/python/native/ops/conv2d.py | 45 ++++++++ examples/python/native/ops/cos.py | 44 +++++++ examples/python/native/ops/dense.py | 38 +++++++ examples/python/native/ops/divide.py | 48 ++++++++ examples/python/native/ops/dropout.py | 49 ++++++++ examples/python/native/ops/elu.py | 47 ++++++++ examples/python/native/ops/embedding.py | 39 +++++++ examples/python/native/ops/exp.py | 0 examples/python/native/ops/flat.py | 0 examples/python/native/ops/gather.py | 60 ++++++++++ examples/python/native/ops/gelu.py | 51 +++++++++ examples/python/native/ops/identity.py | 49 ++++++++ .../ops/inc_multihead_self_attention.py | 103 +++++++++++++++++ .../inc_multihead_self_attention_verify.py | 103 +++++++++++++++++ .../ops/inc_multiquery_self_attention.py | 107 ++++++++++++++++++ .../inc_multiquery_self_attention_verify.py | 107 ++++++++++++++++++ examples/python/native/ops/layer_norm.py | 48 ++++++++ examples/python/native/ops/max.py | 54 +++++++++ examples/python/native/ops/mean.py | 48 ++++++++ examples/python/native/ops/min.py | 54 +++++++++ .../python/native/ops/multihead_attention.py | 0 examples/python/native/ops/multiply.py | 45 ++++++++ examples/python/native/ops/pool2d.py | 36 ++++++ examples/python/native/ops/pow.py | 46 ++++++++ examples/python/native/ops/reduce_sum.py | 48 ++++++++ examples/python/native/ops/relu.py | 46 ++++++++ examples/python/native/ops/reshape.py | 41 +++++++ .../python/native/ops/residual_layer_norm.py | 93 +++++++++++++++ .../python/native/ops/residual_rms_norm.py | 80 +++++++++++++ examples/python/native/ops/reverse.py | 37 
++++++ examples/python/native/ops/rms_norm.py | 64 +++++++++++ examples/python/native/ops/rsqrt.py | 44 +++++++ examples/python/native/ops/sampling.py | 55 +++++++++ examples/python/native/ops/scalar_add.py | 53 +++++++++ examples/python/native/ops/scalar_multiply.py | 53 +++++++++ examples/python/native/ops/scalar_sub.py | 53 +++++++++ .../python/native/ops/scalar_true_divide.py | 53 +++++++++ examples/python/native/ops/sigmoid.py | 46 ++++++++ .../python/native/ops/sigmoid_silu_multi.py | 58 ++++++++++ examples/python/native/ops/sin.py | 44 +++++++ examples/python/native/ops/softmax.py | 46 ++++++++ .../ops/spec_inc_multihead_self_attention.py | 103 +++++++++++++++++ .../ops/spec_inc_multiquery_self_attention.py | 107 ++++++++++++++++++ examples/python/native/ops/split.py | 47 ++++++++ examples/python/native/ops/subtract.py | 45 ++++++++ examples/python/native/ops/tanh.py | 46 ++++++++ examples/python/native/ops/transpose.py | 38 +++++++ 56 files changed, 2898 insertions(+), 1 deletion(-) create mode 100644 examples/python/native/ops/add.py create mode 100644 examples/python/native/ops/add_bias_residual_layer_norm.py create mode 100644 examples/python/native/ops/arg_top_k.py create mode 100644 examples/python/native/ops/argmax.py create mode 100644 examples/python/native/ops/batch_matmul.py create mode 100644 examples/python/native/ops/batch_norm.py create mode 100644 examples/python/native/ops/beam_top_k.py create mode 100644 examples/python/native/ops/concat.py create mode 100644 examples/python/native/ops/conv2d.py create mode 100644 examples/python/native/ops/cos.py create mode 100644 examples/python/native/ops/dense.py create mode 100644 examples/python/native/ops/divide.py create mode 100644 examples/python/native/ops/dropout.py create mode 100644 examples/python/native/ops/elu.py create mode 100644 examples/python/native/ops/embedding.py create mode 100644 examples/python/native/ops/exp.py create mode 100644 examples/python/native/ops/flat.py create mode 100644 examples/python/native/ops/gather.py create mode 100644 examples/python/native/ops/gelu.py create mode 100644 examples/python/native/ops/identity.py create mode 100644 examples/python/native/ops/inc_multihead_self_attention.py create mode 100644 examples/python/native/ops/inc_multihead_self_attention_verify.py create mode 100644 examples/python/native/ops/inc_multiquery_self_attention.py create mode 100644 examples/python/native/ops/inc_multiquery_self_attention_verify.py create mode 100644 examples/python/native/ops/layer_norm.py create mode 100644 examples/python/native/ops/max.py create mode 100644 examples/python/native/ops/mean.py create mode 100644 examples/python/native/ops/min.py create mode 100644 examples/python/native/ops/multihead_attention.py create mode 100644 examples/python/native/ops/multiply.py create mode 100644 examples/python/native/ops/pool2d.py create mode 100644 examples/python/native/ops/pow.py create mode 100644 examples/python/native/ops/reduce_sum.py create mode 100644 examples/python/native/ops/relu.py create mode 100644 examples/python/native/ops/reshape.py create mode 100644 examples/python/native/ops/residual_layer_norm.py create mode 100644 examples/python/native/ops/residual_rms_norm.py create mode 100644 examples/python/native/ops/reverse.py create mode 100644 examples/python/native/ops/rms_norm.py create mode 100644 examples/python/native/ops/rsqrt.py create mode 100644 examples/python/native/ops/sampling.py create mode 100644 examples/python/native/ops/scalar_add.py create mode 100644 
examples/python/native/ops/scalar_multiply.py create mode 100644 examples/python/native/ops/scalar_sub.py create mode 100644 examples/python/native/ops/scalar_true_divide.py create mode 100644 examples/python/native/ops/sigmoid.py create mode 100644 examples/python/native/ops/sigmoid_silu_multi.py create mode 100644 examples/python/native/ops/sin.py create mode 100644 examples/python/native/ops/softmax.py create mode 100644 examples/python/native/ops/spec_inc_multihead_self_attention.py create mode 100644 examples/python/native/ops/spec_inc_multiquery_self_attention.py create mode 100644 examples/python/native/ops/split.py create mode 100644 examples/python/native/ops/subtract.py create mode 100644 examples/python/native/ops/tanh.py create mode 100644 examples/python/native/ops/transpose.py diff --git a/docs/source/python/layers.rst b/docs/source/python/layers.rst index 91f12094e6..1be91a8b17 100644 --- a/docs/source/python/layers.rst +++ b/docs/source/python/layers.rst @@ -3,7 +3,7 @@ Layers API ********** Layers are the basic building blocks of neural networks in FlexFlow. The inputs of a layer consists of a tensor or a list of tensors and some state variables, -and the outputs of a layer is a tensor or a list of tensors. +and the outputs of a layer is a tensor or a list of tensors. See https://github.com/flexflow/FlexFlow/examples/python/native/ops for an example for every layer .. automodule:: flexflow.core.flexflow_cffi :noindex: diff --git a/examples/python/native/ops/add.py b/examples/python/native/ops/add.py new file mode 100644 index 0000000000..50b9d16fd0 --- /dev/null +++ b/examples/python/native/ops/add.py @@ -0,0 +1,45 @@ +# The basis for this test of the 'add' operation is generated by ChatGPT using the manually created conv2d.py as a template. 
+ + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_add(ffconfig, input_arr1: np.ndarray, input_arr2: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor1 = ffmodel.create_tensor(input_arr1.shape, DataType.DT_FLOAT) + input_tensor2 = ffmodel.create_tensor(input_arr2.shape, DataType.DT_FLOAT) + + out = ffmodel.add(input_tensor1, input_tensor2) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input1 = ffmodel.create_data_loader(input_tensor1, input_arr1) + dataloader_input2 = ffmodel.create_data_loader(input_tensor2, input_arr2) + + ffmodel.init_layers() + + dataloader_input1.reset() + dataloader_input1.next_batch(ffmodel) + + dataloader_input2.reset() + dataloader_input2.next_batch(ffmodel) + + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input1 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + input2 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + _ = test_add(ffconfig, input1, input2) diff --git a/examples/python/native/ops/add_bias_residual_layer_norm.py b/examples/python/native/ops/add_bias_residual_layer_norm.py new file mode 100644 index 0000000000..6e8dffbc9e --- /dev/null +++ b/examples/python/native/ops/add_bias_residual_layer_norm.py @@ -0,0 +1,78 @@ +from typing import List + +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_add_bias_residual_layer_norm(ffconfig, input_arr: np.ndarray, residual_arr: np.ndarray, axes: List[int], elementwise_affine: bool = True, eps: float = 1e-5, use_bias: bool = True, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + residual_tensor = ffmodel.create_tensor(residual_arr.shape, DataType.DT_FLOAT) + + output_tensor, layer_norm_output = ffmodel.add_bias_residual_layer_norm( + input_tensor, + residual_tensor, + axes=axes, + elementwise_affine=elementwise_affine, + eps=eps, + use_bias=use_bias, + name="add_bias_residual_layer_norm_layer" + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + dataloader_residual = ffmodel.create_data_loader(residual_tensor, residual_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_residual.reset() + + dataloader_input.next_batch(ffmodel) + dataloader_residual.next_batch(ffmodel) + + ffmodel.forward() + + output_tensor.inline_map(ffmodel, ffconfig) + layer_norm_output.inline_map(ffmodel, ffconfig) + output_result = output_tensor.get_array(ffmodel, ffconfig) + layer_norm_result = layer_norm_output.get_array(ffmodel, ffconfig) + + return output_result, layer_norm_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + residual_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + axes_to_normalize = [1, 2] # Example 
axes to normalize + + output_result, layer_norm_result = test_add_bias_residual_layer_norm( + ffconfig, + input_data, + residual_data, + axes=axes_to_normalize, + elementwise_affine=True, + eps=1e-5, + use_bias=True + ) + + print("Input Array:") + print(input_data) + print("\nResidual Array:") + print(residual_data) + print(f"\nOutput Array after applying add_bias_residual_layer_norm along axes {axes_to_normalize}:") + print(output_result) + print("\nLayer Norm Result:") + print(layer_norm_result) diff --git a/examples/python/native/ops/arg_top_k.py b/examples/python/native/ops/arg_top_k.py new file mode 100644 index 0000000000..79edc5dfad --- /dev/null +++ b/examples/python/native/ops/arg_top_k.py @@ -0,0 +1,61 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_arg_top_k(ffconfig, input_arr: np.ndarray, k: int, sorted: bool, speculative_decoding: bool, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + arg_top_k_output = ffmodel.arg_top_k( + input_tensor, + k, + sorted, + speculative_decoding, + name="arg_top_k_layer", + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_MEAN_SQUARED_ERROR, + metrics=[MetricsType.METRICS_MEAN_SQUARED_ERROR], + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + arg_top_k_output.inline_map(ffmodel, ffconfig) + output_result = arg_top_k_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 10).astype(np.float32) + k_value = 5 + sorted_value = True + speculative_decoding_value = False # Example value for speculative_decoding + + output_result = test_arg_top_k( + ffconfig, + input_data, + k=k_value, + sorted=sorted_value, + speculative_decoding=speculative_decoding_value, + ) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying arg_top_k:") + print(output_result) diff --git a/examples/python/native/ops/argmax.py b/examples/python/native/ops/argmax.py new file mode 100644 index 0000000000..dda0e6b0bc --- /dev/null +++ b/examples/python/native/ops/argmax.py @@ -0,0 +1,55 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_argmax(ffconfig, input_arr: np.ndarray, beam_search: bool, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + argmax_output = ffmodel.argmax( + input_tensor, + beam_search, + name="argmax_layer", + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + argmax_output.inline_map(ffmodel, ffconfig) + output_result = argmax_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 10).astype(np.float32) + 
beam_search_value = True # Set to True or False based on your requirement + + output_result = test_argmax( + ffconfig, + input_data, + beam_search=beam_search_value, + ) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying argmax:") + print(output_result) diff --git a/examples/python/native/ops/batch_matmul.py b/examples/python/native/ops/batch_matmul.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/python/native/ops/batch_norm.py b/examples/python/native/ops/batch_norm.py new file mode 100644 index 0000000000..b243e79d37 --- /dev/null +++ b/examples/python/native/ops/batch_norm.py @@ -0,0 +1,36 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def ff(ffconfig, input_arr: np.ndarray): + ffmodel = FFModel(ffconfig) + # TODO: convert input to ff tensor + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + out = ffmodel.batch_norm( + input_tensor + ) + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + _ = ff(ffconfig, input) diff --git a/examples/python/native/ops/beam_top_k.py b/examples/python/native/ops/beam_top_k.py new file mode 100644 index 0000000000..cb2fdfb3d2 --- /dev/null +++ b/examples/python/native/ops/beam_top_k.py @@ -0,0 +1,58 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_beam_top_k(ffconfig, input_arr: np.ndarray, max_beam_size: int, sorted: bool, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + beam_top_k_output = ffmodel.beam_top_k( + input_tensor, + max_beam_size, + sorted, + name="beam_top_k_layer", + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + beam_top_k_output.inline_map(ffmodel, ffconfig) + output_result = beam_top_k_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 10).astype(np.float32) + max_beam_size_value = 3 + sorted_value = True + + output_result = test_beam_top_k( + ffconfig, + input_data, + max_beam_size=max_beam_size_value, + sorted=sorted_value, + ) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying beam_top_k:") + print(output_result) diff --git a/examples/python/native/ops/concat.py b/examples/python/native/ops/concat.py new file mode 100644 index 0000000000..0088d7b848 --- /dev/null +++ b/examples/python/native/ops/concat.py @@ -0,0 
+1,43 @@ +# The basis for this test of the 'concatenate' operation is generated by ChatGPT using the manually created conv2d.py as a template. + + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_concatenate(ffconfig, input_arr1: np.ndarray, input_arr2: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor1 = ffmodel.create_tensor(input_arr1.shape, DataType.DT_FLOAT) + input_tensor2 = ffmodel.create_tensor(input_arr2.shape, DataType.DT_FLOAT) + + out = ffmodel.concat([input_tensor1, input_tensor2], axis=1) + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input1 = ffmodel.create_data_loader(input_tensor1, input_arr1) + dataloader_input2 = ffmodel.create_data_loader(input_tensor2, input_arr2) + + ffmodel.init_layers() + + dataloader_input1.reset() + dataloader_input1.next_batch(ffmodel) + + dataloader_input2.reset() + dataloader_input2.next_batch(ffmodel) + + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input1 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + input2 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + _ = test_concatenate(ffconfig, input1, input2) diff --git a/examples/python/native/ops/conv2d.py b/examples/python/native/ops/conv2d.py new file mode 100644 index 0000000000..02b3646aaa --- /dev/null +++ b/examples/python/native/ops/conv2d.py @@ -0,0 +1,45 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def ff(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + out = ffmodel.conv2d( + input_tensor, + 32, + 3, + 3, + 1, + 1, + 1, + 1, + use_bias=False + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + _ = ff(ffconfig, input) diff --git a/examples/python/native/ops/cos.py b/examples/python/native/ops/cos.py new file mode 100644 index 0000000000..26f6307685 --- /dev/null +++ b/examples/python/native/ops/cos.py @@ -0,0 +1,44 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_cos(ffconfig, input_arr: np.ndarray) -> np.ndarray: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + cos_output = ffmodel.cos(input_tensor, name="cos_layer") + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, 
MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + cos_output.inline_map(ffmodel, ffconfig) + cos_result = cos_output.get_array(ffmodel, ffconfig) + + return cos_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + cos_result = test_cos(ffconfig, input_data) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying cos function:") + print(cos_result) diff --git a/examples/python/native/ops/dense.py b/examples/python/native/ops/dense.py new file mode 100644 index 0000000000..ec0a3dc65b --- /dev/null +++ b/examples/python/native/ops/dense.py @@ -0,0 +1,38 @@ +# The basis for this test of the 'dense' layer is generated by ChatGPT using the manually created conv2d.py as a template. + + +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_dense(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + out = ffmodel.dense(input_tensor, 64, activation=ActiMode.AC_MODE_RELU) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input = np.random.randn(ffconfig.batch_size, 10).astype(np.float32) + _ = test_dense(ffconfig, input) diff --git a/examples/python/native/ops/divide.py b/examples/python/native/ops/divide.py new file mode 100644 index 0000000000..419bf714ab --- /dev/null +++ b/examples/python/native/ops/divide.py @@ -0,0 +1,48 @@ +# The basis for this test of the 'divide' operation is generated by ChatGPT using the manually created conv2d.py as a template. 
+ + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_divide(ffconfig, input_arr1: np.ndarray, input_arr2: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor1 = ffmodel.create_tensor(input_arr1.shape, DataType.DT_FLOAT) + input_tensor2 = ffmodel.create_tensor(input_arr2.shape, DataType.DT_FLOAT) + + out = ffmodel.divide(input_tensor1, input_tensor2) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input1 = ffmodel.create_data_loader(input_tensor1, input_arr1) + dataloader_input2 = ffmodel.create_data_loader(input_tensor2, input_arr2) + + ffmodel.init_layers() + + dataloader_input1.reset() + dataloader_input1.next_batch(ffmodel) + + dataloader_input2.reset() + dataloader_input2.next_batch(ffmodel) + + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input1 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + input2 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + # Avoid division by zero in input2 + input2 = np.where(input2 == 0, 1e-6, input2) + + _ = test_divide(ffconfig, input1, input2) diff --git a/examples/python/native/ops/dropout.py b/examples/python/native/ops/dropout.py new file mode 100644 index 0000000000..3aa44a5a5b --- /dev/null +++ b/examples/python/native/ops/dropout.py @@ -0,0 +1,49 @@ +# The basis for this test of the 'Dropout' layer is generated by ChatGPT using the manually created conv2d.py as a template. + + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_dropout(ffconfig, input_arr: np.ndarray, dropout_rate: float = 0.5) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + # Apply Dropout layer + out = ffmodel.dropout(input_tensor, dropout_rate, 0) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + # You can adjust the dropout rate as needed + dropout_rate_param = 0.5 + + result = test_dropout(ffconfig, input_data, dropout_rate_param) + + print("Input Data:") + print(input_data) + + print("\nResult after Dropout layer:") + print(result) diff --git a/examples/python/native/ops/elu.py b/examples/python/native/ops/elu.py new file mode 100644 index 0000000000..7a6ef1f621 --- /dev/null +++ b/examples/python/native/ops/elu.py @@ -0,0 +1,47 @@ +# The basis for this test of the 'ELU' activation function is generated by ChatGPT using the manually created conv2d.py as a template. 
+ + +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_elu(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + # Apply ELU activation + out = ffmodel.elu(input_tensor) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + result = test_elu(ffconfig, input_data) + + print("Input Data:") + print(input_data) + + print("\nResult after ELU activation:") + print(result) diff --git a/examples/python/native/ops/embedding.py b/examples/python/native/ops/embedding.py new file mode 100644 index 0000000000..34bced3798 --- /dev/null +++ b/examples/python/native/ops/embedding.py @@ -0,0 +1,39 @@ +# The basis for this test of the 'embedding' layer is generated by ChatGPT using the manually created conv2d.py as a template. + + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_embedding(ffconfig, input_arr: np.ndarray, vocab_size: int, embedding_dim: int) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_INT32) + + out = ffmodel.embedding(input_tensor, vocab_size, embedding_dim, AggrMode.AGGR_MODE_SUM) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + vocab_size = 1000 + embedding_dim = 50 + input = np.random.randint(low=0, high=vocab_size, size=(ffconfig.batch_size, 10), dtype=np.int32) + _ = test_embedding(ffconfig, input, vocab_size, embedding_dim) diff --git a/examples/python/native/ops/exp.py b/examples/python/native/ops/exp.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/python/native/ops/flat.py b/examples/python/native/ops/flat.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/python/native/ops/gather.py b/examples/python/native/ops/gather.py new file mode 100644 index 0000000000..e13b6e4c75 --- /dev/null +++ b/examples/python/native/ops/gather.py @@ -0,0 +1,60 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_gather(ffconfig, input_arr: np.ndarray, index_arr: np.ndarray, dim: int, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + index_tensor = ffmodel.create_tensor(index_arr.shape, DataType.DT_INT32) + + 
gather_output = ffmodel.gather( + input_tensor, + index_tensor, + dim, + name="gather_layer" + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + dataloader_index = ffmodel.create_data_loader(index_tensor, index_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_index.reset() + + dataloader_input.next_batch(ffmodel) + dataloader_index.next_batch(ffmodel) + + ffmodel.forward() + + gather_output.inline_map(ffmodel, ffconfig) + output_result = gather_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + index_data = np.random.randint(0, 5, size=(ffconfig.batch_size,)).astype(np.int32) + dim_to_gather = 2 # Example dimension to gather along + + output_result = test_gather(ffconfig, input_data, index_data, dim=dim_to_gather) + + print("Input Array:") + print(input_data) + print("\nIndex Array:") + print(index_data) + print(f"\nOutput Array after applying gather along dimension {dim_to_gather}:") + print(output_result) diff --git a/examples/python/native/ops/gelu.py b/examples/python/native/ops/gelu.py new file mode 100644 index 0000000000..84fabd36e1 --- /dev/null +++ b/examples/python/native/ops/gelu.py @@ -0,0 +1,51 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_gelu(ffconfig, input_arr: np.ndarray, inplace: bool = True, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + gelu_output = ffmodel.gelu( + input_tensor, + inplace=inplace, + name="gelu_layer" + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + gelu_output.inline_map(ffmodel, ffconfig) + output_result = gelu_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + inplace_flag = True # Example inplace flag + + output_result = test_gelu(ffconfig, input_data, inplace=inplace_flag) + + print("Input Array:") + print(input_data) + print(f"\nOutput Array after applying gelu activation function (inplace={inplace_flag}):") + print(output_result) diff --git a/examples/python/native/ops/identity.py b/examples/python/native/ops/identity.py new file mode 100644 index 0000000000..fbf63e717c --- /dev/null +++ b/examples/python/native/ops/identity.py @@ -0,0 +1,49 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_identity(ffconfig, input_arr: np.ndarray, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + identity_output = ffmodel.identity( + input_tensor, + name="identity_layer" + ) + + 
ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + identity_output.inline_map(ffmodel, ffconfig) + output_result = identity_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + output_result = test_identity(ffconfig, input_data) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying identity function:") + print(output_result) diff --git a/examples/python/native/ops/inc_multihead_self_attention.py b/examples/python/native/ops/inc_multihead_self_attention.py new file mode 100644 index 0000000000..dce7bd565d --- /dev/null +++ b/examples/python/native/ops/inc_multihead_self_attention.py @@ -0,0 +1,103 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_inc_multihead_self_attention( + ffconfig, + input_arr: np.ndarray, + embed_dim: int, + num_heads: int, + kdim: int = 0, + vdim: int = 0, + dropout: float = 0.0, + bias: bool = True, + add_bias_kv: bool = False, + add_zero_attn: bool = False, + data_type: DataType = DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding: bool = False, + scaling_query: bool = False, + scaling_factor: float = 1.0, + qk_prod_scaling: bool = True, + position_bias: bool = False, + name=None, +): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, data_type) + + inc_multihead_self_attention_output = ffmodel.inc_multihead_self_attention( + input_tensor, + embed_dim, + num_heads, + kdim=kdim, + vdim=vdim, + dropout=dropout, + bias=bias, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + data_type=data_type, + kernel_initializer=kernel_initializer, + apply_rotary_embedding=apply_rotary_embedding, + scaling_query=scaling_query, + scaling_factor=scaling_factor, + qk_prod_scaling=qk_prod_scaling, + position_bias=position_bias, + name="inc_multihead_self_attention_layer", + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + inc_multihead_self_attention_output.inline_map(ffmodel, ffconfig) + output_result = inc_multihead_self_attention_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32) + embed_dim_value = 64 + num_heads_value = 8 + + output_result = test_inc_multihead_self_attention( + ffconfig, + input_data, + embed_dim=embed_dim_value, + num_heads=num_heads_value, + kdim=0, # Example value for kdim + vdim=0, # Example value for vdim + dropout=0.1, # Example value for dropout + bias=True, + add_bias_kv=False, + 
add_zero_attn=False, + data_type=DataType.DT_FLOAT, + kernel_initializer=None, # Example value for kernel_initializer + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + ) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying inc_multihead_self_attention:") + print(output_result) diff --git a/examples/python/native/ops/inc_multihead_self_attention_verify.py b/examples/python/native/ops/inc_multihead_self_attention_verify.py new file mode 100644 index 0000000000..f6dc8e3933 --- /dev/null +++ b/examples/python/native/ops/inc_multihead_self_attention_verify.py @@ -0,0 +1,103 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_inc_multihead_self_attention_verify( + ffconfig, + input_arr: np.ndarray, + embed_dim: int, + num_heads: int, + kdim: int = 0, + vdim: int = 0, + dropout: float = 0.0, + bias: bool = True, + add_bias_kv: bool = False, + add_zero_attn: bool = False, + data_type: DataType = DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding: bool = False, + scaling_query: bool = False, + scaling_factor: float = 1.0, + qk_prod_scaling: bool = True, + position_bias: bool = False, + name=None, +): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, data_type) + + inc_multihead_self_attention_verify_output = ffmodel.inc_multihead_self_attention_verify( + input_tensor, + embed_dim, + num_heads, + kdim=kdim, + vdim=vdim, + dropout=dropout, + bias=bias, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + data_type=data_type, + kernel_initializer=kernel_initializer, + apply_rotary_embedding=apply_rotary_embedding, + scaling_query=scaling_query, + scaling_factor=scaling_factor, + qk_prod_scaling=qk_prod_scaling, + position_bias=position_bias, + name="inc_multihead_self_attention_verify_layer", + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + inc_multihead_self_attention_verify_output.inline_map(ffmodel, ffconfig) + output_result = inc_multihead_self_attention_verify_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32) + embed_dim_value = 64 + num_heads_value = 8 + + output_result = test_inc_multihead_self_attention_verify( + ffconfig, + input_data, + embed_dim=embed_dim_value, + num_heads=num_heads_value, + kdim=0, # Example value for kdim + vdim=0, # Example value for vdim + dropout=0.1, # Example value for dropout + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_FLOAT, + kernel_initializer=None, # Example value for kernel_initializer + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + ) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying inc_multihead_self_attention_verify:") + print(output_result) diff --git a/examples/python/native/ops/inc_multiquery_self_attention.py 
b/examples/python/native/ops/inc_multiquery_self_attention.py new file mode 100644 index 0000000000..33390ab1f6 --- /dev/null +++ b/examples/python/native/ops/inc_multiquery_self_attention.py @@ -0,0 +1,107 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_inc_multiquery_self_attention( + ffconfig, + input_arr: np.ndarray, + embed_dim: int, + num_q_heads: int, + num_kv_heads: int, + kdim: int = 0, + vdim: int = 0, + dropout: float = 0.0, + bias: bool = True, + add_bias_kv: bool = False, + add_zero_attn: bool = False, + data_type: DataType = DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding: bool = False, + scaling_query: bool = False, + scaling_factor: float = 1.0, + qk_prod_scaling: bool = True, + position_bias: bool = False, + name=None, +): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, data_type) + + inc_multiquery_self_attention_output = ffmodel.inc_multiquery_self_attention( + input_tensor, + embed_dim, + num_q_heads, + num_kv_heads, + kdim=kdim, + vdim=vdim, + dropout=dropout, + bias=bias, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + data_type=data_type, + kernel_initializer=kernel_initializer, + apply_rotary_embedding=apply_rotary_embedding, + scaling_query=scaling_query, + scaling_factor=scaling_factor, + qk_prod_scaling=qk_prod_scaling, + position_bias=position_bias, + name="inc_multiquery_self_attention_layer", + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + inc_multiquery_self_attention_output.inline_map(ffmodel, ffconfig) + output_result = inc_multiquery_self_attention_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32) + embed_dim_value = 64 + num_q_heads_value = 4 + num_kv_heads_value = 4 + + output_result = test_inc_multiquery_self_attention( + ffconfig, + input_data, + embed_dim=embed_dim_value, + num_q_heads=num_q_heads_value, + num_kv_heads=num_kv_heads_value, + kdim=0, # Example value for kdim + vdim=0, # Example value for vdim + dropout=0.1, # Example value for dropout + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_FLOAT, + kernel_initializer=None, # Example value for kernel_initializer + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + ) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying inc_multiquery_self_attention:") + print(output_result) diff --git a/examples/python/native/ops/inc_multiquery_self_attention_verify.py b/examples/python/native/ops/inc_multiquery_self_attention_verify.py new file mode 100644 index 0000000000..69a76f68bf --- /dev/null +++ b/examples/python/native/ops/inc_multiquery_self_attention_verify.py @@ -0,0 +1,107 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_inc_multiquery_self_attention_verify( + ffconfig, + input_arr: np.ndarray, + embed_dim: int, + num_q_heads: int, + 
num_kv_heads: int, + kdim: int = 0, + vdim: int = 0, + dropout: float = 0.0, + bias: bool = True, + add_bias_kv: bool = False, + add_zero_attn: bool = False, + data_type: DataType = DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding: bool = False, + scaling_query: bool = False, + scaling_factor: float = 1.0, + qk_prod_scaling: bool = True, + position_bias: bool = False, + name=None, +): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, data_type) + + inc_multiquery_self_attention_verify_output = ffmodel.inc_multiquery_self_attention_verify( + input_tensor, + embed_dim, + num_q_heads, + num_kv_heads, + kdim=kdim, + vdim=vdim, + dropout=dropout, + bias=bias, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + data_type=data_type, + kernel_initializer=kernel_initializer, + apply_rotary_embedding=apply_rotary_embedding, + scaling_query=scaling_query, + scaling_factor=scaling_factor, + qk_prod_scaling=qk_prod_scaling, + position_bias=position_bias, + name="inc_multiquery_self_attention_verify_layer", + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + inc_multiquery_self_attention_verify_output.inline_map(ffmodel, ffconfig) + output_result = inc_multiquery_self_attention_verify_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32) + embed_dim_value = 64 + num_q_heads_value = 4 + num_kv_heads_value = 4 + + output_result = test_inc_multiquery_self_attention_verify( + ffconfig, + input_data, + embed_dim=embed_dim_value, + num_q_heads=num_q_heads_value, + num_kv_heads=num_kv_heads_value, + kdim=0, # Example value for kdim + vdim=0, # Example value for vdim + dropout=0.1, # Example value for dropout + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_FLOAT, + kernel_initializer=None, # Example value for kernel_initializer + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + ) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying inc_multiquery_self_attention_verify:") + print(output_result) diff --git a/examples/python/native/ops/layer_norm.py b/examples/python/native/ops/layer_norm.py new file mode 100644 index 0000000000..b3cca93d6e --- /dev/null +++ b/examples/python/native/ops/layer_norm.py @@ -0,0 +1,48 @@ +from typing import List + +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_layer_norm(ffconfig, input_arr: np.ndarray, axes: List[int], elementwise_affine: bool = True, eps: float = 1e-5, use_bias: bool = True, name=None) -> np.ndarray: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + layer_norm_output = ffmodel.layer_norm(input_tensor, axes=axes, elementwise_affine=elementwise_affine, eps=eps, use_bias=use_bias, name="layer_norm_layer") + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + 
loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + layer_norm_output.inline_map(ffmodel, ffconfig) + layer_norm_result = layer_norm_output.get_array(ffmodel, ffconfig) + + return layer_norm_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + axes_to_normalize = [1, 2] # Example axes to normalize + + layer_norm_result = test_layer_norm(ffconfig, input_data, axes=axes_to_normalize, elementwise_affine=True, eps=1e-5, use_bias=True) + + print("Input Array:") + print(input_data) + print(f"\nOutput Array after applying layer_norm function along axes {axes_to_normalize}:") + print(layer_norm_result) diff --git a/examples/python/native/ops/max.py b/examples/python/native/ops/max.py new file mode 100644 index 0000000000..bf9c629406 --- /dev/null +++ b/examples/python/native/ops/max.py @@ -0,0 +1,54 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_max(ffconfig, input_arr1: np.ndarray, input_arr2: np.ndarray) -> np.ndarray: + ffmodel = FFModel(ffconfig) + + input_tensor1 = ffmodel.create_tensor(input_arr1.shape, DataType.DT_FLOAT) + input_tensor2 = ffmodel.create_tensor(input_arr2.shape, DataType.DT_FLOAT) + + max_output = ffmodel.max(input_tensor1, input_tensor2, name="max_layer") + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input1 = ffmodel.create_data_loader(input_tensor1, input_arr1) + dataloader_input2 = ffmodel.create_data_loader(input_tensor2, input_arr2) + + ffmodel.init_layers() + + dataloader_input1.reset() + dataloader_input2.reset() + + dataloader_input1.next_batch(ffmodel) + dataloader_input2.next_batch(ffmodel) + + ffmodel.forward() + + max_output.inline_map(ffmodel, ffconfig) + max_result = max_output.get_array(ffmodel, ffconfig) + + return max_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data1 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + input_data2 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + max_result = test_max(ffconfig, input_data1, input_data2) + + print("Input Array 1:") + print(input_data1) + print("\nInput Array 2:") + print(input_data2) + print("\nOutput Array after applying max function:") + print(max_result) diff --git a/examples/python/native/ops/mean.py b/examples/python/native/ops/mean.py new file mode 100644 index 0000000000..df8c3f642e --- /dev/null +++ b/examples/python/native/ops/mean.py @@ -0,0 +1,48 @@ +from typing import List + +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_mean(ffconfig, input_arr: np.ndarray, dims: List[int], keepdims: bool = False) -> np.ndarray: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + mean_output = ffmodel.mean(input_tensor, dims=dims, keepdims=keepdims, name="mean_layer") + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + 
ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + mean_output.inline_map(ffmodel, ffconfig) + mean_result = mean_output.get_array(ffmodel, ffconfig) + + return mean_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + dims_to_mean = [1, 2] # Example dimensions to take the mean over + + mean_result = test_mean(ffconfig, input_data, dims=dims_to_mean, keepdims=False) + + print("Input Array:") + print(input_data) + print(f"\nOutput Array after applying mean function along dimensions {dims_to_mean}:") + print(mean_result) diff --git a/examples/python/native/ops/min.py b/examples/python/native/ops/min.py new file mode 100644 index 0000000000..df81f4f2d2 --- /dev/null +++ b/examples/python/native/ops/min.py @@ -0,0 +1,54 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_min(ffconfig, input_arr1: np.ndarray, input_arr2: np.ndarray) -> np.ndarray: + ffmodel = FFModel(ffconfig) + + input_tensor1 = ffmodel.create_tensor(input_arr1.shape, DataType.DT_FLOAT) + input_tensor2 = ffmodel.create_tensor(input_arr2.shape, DataType.DT_FLOAT) + + min_output = ffmodel.min(input_tensor1, input_tensor2, name="min_layer") + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input1 = ffmodel.create_data_loader(input_tensor1, input_arr1) + dataloader_input2 = ffmodel.create_data_loader(input_tensor2, input_arr2) + + ffmodel.init_layers() + + dataloader_input1.reset() + dataloader_input2.reset() + + dataloader_input1.next_batch(ffmodel) + dataloader_input2.next_batch(ffmodel) + + ffmodel.forward() + + min_output.inline_map(ffmodel, ffconfig) + min_result = min_output.get_array(ffmodel, ffconfig) + + return min_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data1 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + input_data2 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + min_result = test_min(ffconfig, input_data1, input_data2) + + print("Input Array 1:") + print(input_data1) + print("\nInput Array 2:") + print(input_data2) + print("\nOutput Array after applying min function:") + print(min_result) diff --git a/examples/python/native/ops/multihead_attention.py b/examples/python/native/ops/multihead_attention.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/python/native/ops/multiply.py b/examples/python/native/ops/multiply.py new file mode 100644 index 0000000000..fb4f489150 --- /dev/null +++ b/examples/python/native/ops/multiply.py @@ -0,0 +1,45 @@ +# The basis for this test of the 'multiply' operation is generated by ChatGPT using the manually created conv2d.py as a template. 
+ + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_multiply(ffconfig, input_arr1: np.ndarray, input_arr2: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor1 = ffmodel.create_tensor(input_arr1.shape, DataType.DT_FLOAT) + input_tensor2 = ffmodel.create_tensor(input_arr2.shape, DataType.DT_FLOAT) + + out = ffmodel.multiply(input_tensor1, input_tensor2) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input1 = ffmodel.create_data_loader(input_tensor1, input_arr1) + dataloader_input2 = ffmodel.create_data_loader(input_tensor2, input_arr2) + + ffmodel.init_layers() + + dataloader_input1.reset() + dataloader_input1.next_batch(ffmodel) + + dataloader_input2.reset() + dataloader_input2.next_batch(ffmodel) + + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input1 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + input2 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + _ = test_multiply(ffconfig, input1, input2) diff --git a/examples/python/native/ops/pool2d.py b/examples/python/native/ops/pool2d.py new file mode 100644 index 0000000000..b4dc8b219e --- /dev/null +++ b/examples/python/native/ops/pool2d.py @@ -0,0 +1,36 @@ +# AI generated from conv2d example +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_pool2d(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + out = ffmodel.pool2d(input_tensor, 3, 3, 1, 1, 0, 0, PoolType.POOL_MAX) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + _ = test_pool2d(ffconfig, input) \ No newline at end of file diff --git a/examples/python/native/ops/pow.py b/examples/python/native/ops/pow.py new file mode 100644 index 0000000000..cf5bbebd80 --- /dev/null +++ b/examples/python/native/ops/pow.py @@ -0,0 +1,46 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_pow(ffconfig, input_arr: np.ndarray, exponent: float) -> np.ndarray: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + pow_output = ffmodel.pow(input_tensor, exponent, name="pow_layer") + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = 
ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + pow_output.inline_map(ffmodel, ffconfig) + pow_result = pow_output.get_array(ffmodel, ffconfig) + + return pow_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + exponent_value = 2.0 # Example exponent value + + pow_result = test_pow(ffconfig, input_data, exponent=exponent_value) + + print("Input Array:") + print(input_data) + print(f"\nOutput Array after applying pow function with exponent {exponent_value}:") + print(pow_result) diff --git a/examples/python/native/ops/reduce_sum.py b/examples/python/native/ops/reduce_sum.py new file mode 100644 index 0000000000..7e7b41b799 --- /dev/null +++ b/examples/python/native/ops/reduce_sum.py @@ -0,0 +1,48 @@ +from typing import List + +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_reduce_sum(ffconfig, input_arr: np.ndarray, axes: List[int], keepdims: bool = False) -> np.ndarray: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + reduce_sum_output = ffmodel.reduce_sum(input_tensor, axes=axes, keepdims=keepdims, name="reduce_sum_layer") + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + reduce_sum_output.inline_map(ffmodel, ffconfig) + reduce_sum_result = reduce_sum_output.get_array(ffmodel, ffconfig) + + return reduce_sum_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + axes_to_reduce = [1, 2] # Example axes to reduce + + reduce_sum_result = test_reduce_sum(ffconfig, input_data, axes=axes_to_reduce, keepdims=False) + + print("Input Array:") + print(input_data) + print(f"\nOutput Array after applying reduce_sum along axes {axes_to_reduce}:") + print(reduce_sum_result) diff --git a/examples/python/native/ops/relu.py b/examples/python/native/ops/relu.py new file mode 100644 index 0000000000..d855b27164 --- /dev/null +++ b/examples/python/native/ops/relu.py @@ -0,0 +1,46 @@ +# The basis for this test of the 'ReLU' activation function is generated by ChatGPT using the manually created conv2d.py as a template. 
+ + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_relu(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + # Apply ReLU activation + out = ffmodel.relu(input_tensor) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + result = test_relu(ffconfig, input_data) + + print("Input Data:") + print(input_data) + + print("\nResult after ReLU activation:") + print(result) diff --git a/examples/python/native/ops/reshape.py b/examples/python/native/ops/reshape.py new file mode 100644 index 0000000000..348d6bd935 --- /dev/null +++ b/examples/python/native/ops/reshape.py @@ -0,0 +1,41 @@ +# The basis for this test of the 'reshape' operation is generated by ChatGPT using the manually created conv2d.py as a template. + +from typing import List + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_reshape(ffconfig, input_arr: np.ndarray, target_shape: List[int]) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + out = ffmodel.reshape(input_tensor, target_shape) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + target_shape = [ffconfig.batch_size, 500] + + _ = test_reshape(ffconfig, input, target_shape) diff --git a/examples/python/native/ops/residual_layer_norm.py b/examples/python/native/ops/residual_layer_norm.py new file mode 100644 index 0000000000..e12f2e53d9 --- /dev/null +++ b/examples/python/native/ops/residual_layer_norm.py @@ -0,0 +1,93 @@ +from typing import List + +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_residual_layer_norm(ffconfig, input_arr: np.ndarray, residual1_arr: np.ndarray, residual2_arr: np.ndarray, use_two_residuals: bool, axes: List[int], elementwise_affine: bool = True, eps: float = 1e-5, use_bias: bool = True, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + residual1_tensor = ffmodel.create_tensor(residual1_arr.shape, DataType.DT_FLOAT) + residual2_tensor = ffmodel.create_tensor(residual2_arr.shape, DataType.DT_FLOAT) + + output_tensor, 
layer_norm_output = ffmodel.residual_layer_norm( + input_tensor, + residual1_tensor, + residual2_tensor if use_two_residuals else None, + use_two_residuals, + axes=axes, + elementwise_affine=elementwise_affine, + eps=eps, + use_bias=use_bias, + name="residual_layer_norm_layer" + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + dataloader_residual1 = ffmodel.create_data_loader(residual1_tensor, residual1_arr) + dataloader_residual2 = ffmodel.create_data_loader(residual2_tensor, residual2_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_residual1.reset() + if use_two_residuals: + dataloader_residual2.reset() + + dataloader_input.next_batch(ffmodel) + dataloader_residual1.next_batch(ffmodel) + if use_two_residuals: + dataloader_residual2.next_batch(ffmodel) + + ffmodel.forward() + + output_tensor.inline_map(ffmodel, ffconfig) + layer_norm_output.inline_map(ffmodel, ffconfig) + output_result = output_tensor.get_array(ffmodel, ffconfig) + layer_norm_result = layer_norm_output.get_array(ffmodel, ffconfig) + + return output_result, layer_norm_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + residual1_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + residual2_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + use_two_residuals_flag = True # Example flag + + axes_to_normalize = [1, 2] # Example axes to normalize + + output_result, layer_norm_result = test_residual_layer_norm( + ffconfig, + input_data, + residual1_data, + residual2_data, + use_two_residuals_flag, + axes=axes_to_normalize, + elementwise_affine=True, + eps=1e-5, + use_bias=True + ) + + print("Input Array:") + print(input_data) + print("\nResidual1 Array:") + print(residual1_data) + if use_two_residuals_flag: + print("\nResidual2 Array:") + print(residual2_data) + print(f"\nOutput Array after applying residual_layer_norm along axes {axes_to_normalize} with use_two_residuals={use_two_residuals_flag}:") + print(output_result) + print("\nLayer Norm Result:") + print(layer_norm_result) diff --git a/examples/python/native/ops/residual_rms_norm.py b/examples/python/native/ops/residual_rms_norm.py new file mode 100644 index 0000000000..9027dffada --- /dev/null +++ b/examples/python/native/ops/residual_rms_norm.py @@ -0,0 +1,80 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_residual_rms_norm( + ffconfig, + input1_arr: np.ndarray, + input2_arr: np.ndarray, + eps: float, + dim: int, + name=None, +): + ffmodel = FFModel(ffconfig) + + input1_tensor = ffmodel.create_tensor(input1_arr.shape, DataType.DT_FLOAT) + input2_tensor = ffmodel.create_tensor(input2_arr.shape, DataType.DT_FLOAT) + + residual_rms_norm_output1, residual_rms_norm_output2 = ffmodel.residual_rms_norm( + input1_tensor, + input2_tensor, + eps, + dim, + name="residual_rms_norm_layer", + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input1 = 
ffmodel.create_data_loader(input1_tensor, input1_arr) + dataloader_input2 = ffmodel.create_data_loader(input2_tensor, input2_arr) + + ffmodel.init_layers() + + dataloader_input1.reset() + dataloader_input1.next_batch(ffmodel) + + dataloader_input2.reset() + dataloader_input2.next_batch(ffmodel) + + ffmodel.forward() + + residual_rms_norm_output1.inline_map(ffmodel, ffconfig) + output_result1 = residual_rms_norm_output1.get_array(ffmodel, ffconfig) + + residual_rms_norm_output2.inline_map(ffmodel, ffconfig) + output_result2 = residual_rms_norm_output2.get_array(ffmodel, ffconfig) + + return output_result1, output_result2 + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input1_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32) + input2_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32) + eps_value = 1e-6 + dim_value = 1 # Example value for dim + + output_result1, output_result2 = test_residual_rms_norm( + ffconfig, + input1_data, + input2_data, + eps=eps_value, + dim=dim_value, + ) + + print("Input Array 1:") + print(input1_data) + print("\nInput Array 2:") + print(input2_data) + print("\nOutput Array 1 after applying residual_rms_norm:") + print(output_result1) + print("\nOutput Array 2 after applying residual_rms_norm:") + print(output_result2) diff --git a/examples/python/native/ops/reverse.py b/examples/python/native/ops/reverse.py new file mode 100644 index 0000000000..25394d4b9a --- /dev/null +++ b/examples/python/native/ops/reverse.py @@ -0,0 +1,37 @@ +# The basis for this test of the 'reverse' operation is generated by ChatGPT using the manually created conv2d.py as a template. + + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_reverse(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + out = ffmodel.reverse(input_tensor, axis=2) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + _ = test_reverse(ffconfig, input) diff --git a/examples/python/native/ops/rms_norm.py b/examples/python/native/ops/rms_norm.py new file mode 100644 index 0000000000..3983d7f891 --- /dev/null +++ b/examples/python/native/ops/rms_norm.py @@ -0,0 +1,64 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_rms_norm( + ffconfig, + input_arr: np.ndarray, + eps: float, + dim: int, + name=None, +): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + rms_norm_output = ffmodel.rms_norm( + input_tensor, + eps, + dim, + name="rms_norm_layer", + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_MEAN_SQUARED_ERROR, 
MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY], + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + rms_norm_output.inline_map(ffmodel, ffconfig) + output_result = rms_norm_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32) + eps_value = 1e-6 + dim_value = 1 # Example value for dim + + output_result = test_rms_norm( + ffconfig, + input_data, + eps=eps_value, + dim=dim_value, + ) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying rms_norm:") + print(output_result) diff --git a/examples/python/native/ops/rsqrt.py b/examples/python/native/ops/rsqrt.py new file mode 100644 index 0000000000..3d9ab65449 --- /dev/null +++ b/examples/python/native/ops/rsqrt.py @@ -0,0 +1,44 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_rsqrt(ffconfig, input_arr: np.ndarray) -> np.ndarray: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + rsqrt_output = ffmodel.rsqrt(input_tensor, name="rsqrt_layer") + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + rsqrt_output.inline_map(ffmodel, ffconfig) + rsqrt_result = rsqrt_output.get_array(ffmodel, ffconfig) + + return rsqrt_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + rsqrt_result = test_rsqrt(ffconfig, input_data) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying rsqrt function:") + print(rsqrt_result) diff --git a/examples/python/native/ops/sampling.py b/examples/python/native/ops/sampling.py new file mode 100644 index 0000000000..2219f09eff --- /dev/null +++ b/examples/python/native/ops/sampling.py @@ -0,0 +1,55 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_sampling(ffconfig, input_arr: np.ndarray, top_p: float, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + sampling_output = ffmodel.sampling( + input_tensor, + top_p, + name="sampling_layer", + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_MEAN_SQUARED_ERROR, + metrics=[MetricsType.METRICS_MEAN_SQUARED_ERROR], + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + sampling_output.inline_map(ffmodel, ffconfig) + output_result = sampling_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 10).astype(np.float32) + top_p_value = 0.8 + + 
output_result = test_sampling( + ffconfig, + input_data, + top_p=top_p_value, + ) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying sampling:") + print(output_result) diff --git a/examples/python/native/ops/scalar_add.py b/examples/python/native/ops/scalar_add.py new file mode 100644 index 0000000000..48a316ea8a --- /dev/null +++ b/examples/python/native/ops/scalar_add.py @@ -0,0 +1,53 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_scalar_add(ffconfig, input_arr: np.ndarray, scalar: float, inplace: bool = True, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + scalar_add_output = ffmodel.scalar_add( + input_tensor, + scalar, + inplace=inplace, + name="scalar_add_layer" + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + scalar_add_output.inline_map(ffmodel, ffconfig) + output_result = scalar_add_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + scalar_value = 2.0 # Example scalar value + inplace_flag = True # Example inplace flag + + output_result = test_scalar_add(ffconfig, input_data, scalar=scalar_value, inplace=inplace_flag) + + print("Input Array:") + print(input_data) + print(f"\nOutput Array after applying scalar addition with scalar value {scalar_value} (inplace={inplace_flag}):") + print(output_result) diff --git a/examples/python/native/ops/scalar_multiply.py b/examples/python/native/ops/scalar_multiply.py new file mode 100644 index 0000000000..ebae5cce01 --- /dev/null +++ b/examples/python/native/ops/scalar_multiply.py @@ -0,0 +1,53 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_scalar_multiply(ffconfig, input_arr: np.ndarray, scalar: float, inplace: bool = True, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + scalar_multiply_output = ffmodel.scalar_multiply( + input_tensor, + scalar, + inplace=inplace, + name="scalar_multiply_layer" + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + scalar_multiply_output.inline_map(ffmodel, ffconfig) + output_result = scalar_multiply_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + scalar_value = 2.0 # Example scalar value + inplace_flag = True # Example inplace flag + + output_result = test_scalar_multiply(ffconfig, input_data, 
scalar=scalar_value, inplace=inplace_flag) + + print("Input Array:") + print(input_data) + print(f"\nOutput Array after applying scalar multiplication with scalar value {scalar_value} (inplace={inplace_flag}):") + print(output_result) diff --git a/examples/python/native/ops/scalar_sub.py b/examples/python/native/ops/scalar_sub.py new file mode 100644 index 0000000000..2dc467b573 --- /dev/null +++ b/examples/python/native/ops/scalar_sub.py @@ -0,0 +1,53 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_scalar_sub(ffconfig, input_arr: np.ndarray, scalar: float, inplace: bool = True, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + scalar_sub_output = ffmodel.scalar_sub( + input_tensor, + scalar, + inplace=inplace, + name="scalar_sub_layer" + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + scalar_sub_output.inline_map(ffmodel, ffconfig) + output_result = scalar_sub_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + scalar_value = 2.0 # Example scalar value + inplace_flag = True # Example inplace flag + + output_result = test_scalar_sub(ffconfig, input_data, scalar=scalar_value, inplace=inplace_flag) + + print("Input Array:") + print(input_data) + print(f"\nOutput Array after applying scalar subtraction with scalar value {scalar_value} (inplace={inplace_flag}):") + print(output_result) diff --git a/examples/python/native/ops/scalar_true_divide.py b/examples/python/native/ops/scalar_true_divide.py new file mode 100644 index 0000000000..f1b64df506 --- /dev/null +++ b/examples/python/native/ops/scalar_true_divide.py @@ -0,0 +1,53 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_scalar_true_divide(ffconfig, input_arr: np.ndarray, scalar: float, inplace: bool = True, name=None): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + scalar_true_divide_output = ffmodel.scalar_true_divide( + input_tensor, + scalar, + inplace=inplace, + name="scalar_true_divide_layer" + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + scalar_true_divide_output.inline_map(ffmodel, ffconfig) + output_result = scalar_true_divide_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + scalar_value = 2.0 # Example scalar value + inplace_flag = True # Example inplace flag + + 
output_result = test_scalar_true_divide(ffconfig, input_data, scalar=scalar_value, inplace=inplace_flag) + + print("Input Array:") + print(input_data) + print(f"\nOutput Array after applying scalar true division with scalar value {scalar_value} (inplace={inplace_flag}):") + print(output_result) diff --git a/examples/python/native/ops/sigmoid.py b/examples/python/native/ops/sigmoid.py new file mode 100644 index 0000000000..0fbe21df45 --- /dev/null +++ b/examples/python/native/ops/sigmoid.py @@ -0,0 +1,46 @@ +# The basis for this test of the 'Sigmoid' activation function is generated by ChatGPT using the manually created conv2d.py as a template. + + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_sigmoid(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + # Apply Sigmoid activation + out = ffmodel.sigmoid(input_tensor) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + result = test_sigmoid(ffconfig, input_data) + + print("Input Data:") + print(input_data) + + print("\nResult after Sigmoid activation:") + print(result) diff --git a/examples/python/native/ops/sigmoid_silu_multi.py b/examples/python/native/ops/sigmoid_silu_multi.py new file mode 100644 index 0000000000..cecc3e102e --- /dev/null +++ b/examples/python/native/ops/sigmoid_silu_multi.py @@ -0,0 +1,58 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_sigmoid_silu_multi(ffconfig, input1_arr: np.ndarray, input2_arr: np.ndarray, name=None): + ffmodel = FFModel(ffconfig) + + input1_tensor = ffmodel.create_tensor(input1_arr.shape, DataType.DT_FLOAT) + input2_tensor = ffmodel.create_tensor(input2_arr.shape, DataType.DT_FLOAT) + + sigmoid_silu_multi_output = ffmodel.sigmoid_silu_multi( + input1_tensor, + input2_tensor, + name="sigmoid_silu_multi_layer" + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input1 = ffmodel.create_data_loader(input1_tensor, input1_arr) + dataloader_input2 = ffmodel.create_data_loader(input2_tensor, input2_arr) + + ffmodel.init_layers() + + dataloader_input1.reset() + dataloader_input2.reset() + + dataloader_input1.next_batch(ffmodel) + dataloader_input2.next_batch(ffmodel) + + ffmodel.forward() + + sigmoid_silu_multi_output.inline_map(ffmodel, ffconfig) + output_result = sigmoid_silu_multi_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input1_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + input2_data = np.random.randn(ffconfig.batch_size, 5, 10, 
10).astype(np.float32) + + output_result = test_sigmoid_silu_multi(ffconfig, input1_data, input2_data) + + print("Input1 Array:") + print(input1_data) + print("\nInput2 Array:") + print(input2_data) + print("\nOutput Array after applying sigmoid_silu_multi:") + print(output_result) diff --git a/examples/python/native/ops/sin.py b/examples/python/native/ops/sin.py new file mode 100644 index 0000000000..4b60a4e1d4 --- /dev/null +++ b/examples/python/native/ops/sin.py @@ -0,0 +1,44 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_sin(ffconfig, input_arr: np.ndarray) -> np.ndarray: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + sin_output = ffmodel.sin(input_tensor, name="sin_layer") + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + sin_output.inline_map(ffmodel, ffconfig) + sin_result = sin_output.get_array(ffmodel, ffconfig) + + return sin_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + sin_result = test_sin(ffconfig, input_data) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying sin function:") + print(sin_result) diff --git a/examples/python/native/ops/softmax.py b/examples/python/native/ops/softmax.py new file mode 100644 index 0000000000..b5481bcc80 --- /dev/null +++ b/examples/python/native/ops/softmax.py @@ -0,0 +1,46 @@ +# The basis for this test of the 'Softmax' activation function is generated by ChatGPT using the manually created conv2d.py as a template. 
+ + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_softmax(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + # Apply Softmax activation + out = ffmodel.softmax(input_tensor) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10).astype(np.float32) + + result = test_softmax(ffconfig, input_data) + + print("Input Data:") + print(input_data) + + print("\nResult after Softmax activation:") + print(result) diff --git a/examples/python/native/ops/spec_inc_multihead_self_attention.py b/examples/python/native/ops/spec_inc_multihead_self_attention.py new file mode 100644 index 0000000000..bd1aaa189b --- /dev/null +++ b/examples/python/native/ops/spec_inc_multihead_self_attention.py @@ -0,0 +1,103 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_spec_inc_multihead_self_attention( + ffconfig, + input_arr: np.ndarray, + embed_dim: int, + num_heads: int, + kdim: int = 0, + vdim: int = 0, + dropout: float = 0.0, + bias: bool = True, + add_bias_kv: bool = False, + add_zero_attn: bool = False, + data_type: DataType = DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding: bool = False, + scaling_query: bool = False, + scaling_factor: float = 1.0, + qk_prod_scaling: bool = True, + position_bias: bool = False, + name=None, +): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, data_type) + + spec_inc_multihead_self_attention_output = ffmodel.spec_inc_multihead_self_attention( + input_tensor, + embed_dim, + num_heads, + kdim=kdim, + vdim=vdim, + dropout=dropout, + bias=bias, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + data_type=data_type, + kernel_initializer=kernel_initializer, + apply_rotary_embedding=apply_rotary_embedding, + scaling_query=scaling_query, + scaling_factor=scaling_factor, + qk_prod_scaling=qk_prod_scaling, + position_bias=position_bias, + name="spec_inc_multihead_self_attention_layer", + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + spec_inc_multihead_self_attention_output.inline_map(ffmodel, ffconfig) + output_result = spec_inc_multihead_self_attention_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32) + embed_dim_value = 64 + num_heads_value = 8 + + output_result 
= test_spec_inc_multihead_self_attention( + ffconfig, + input_data, + embed_dim=embed_dim_value, + num_heads=num_heads_value, + kdim=0, # Example value for kdim + vdim=0, # Example value for vdim + dropout=0.1, # Example value for dropout + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_FLOAT, + kernel_initializer=None, # Example value for kernel_initializer + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + ) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying spec_inc_multihead_self_attention:") + print(output_result) diff --git a/examples/python/native/ops/spec_inc_multiquery_self_attention.py b/examples/python/native/ops/spec_inc_multiquery_self_attention.py new file mode 100644 index 0000000000..0b731c99e0 --- /dev/null +++ b/examples/python/native/ops/spec_inc_multiquery_self_attention.py @@ -0,0 +1,107 @@ +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_spec_inc_multiquery_self_attention( + ffconfig, + input_arr: np.ndarray, + embed_dim: int, + num_q_heads: int, + num_kv_heads: int, + kdim: int = 0, + vdim: int = 0, + dropout: float = 0.0, + bias: bool = True, + add_bias_kv: bool = False, + add_zero_attn: bool = False, + data_type: DataType = DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding: bool = False, + scaling_query: bool = False, + scaling_factor: float = 1.0, + qk_prod_scaling: bool = True, + position_bias: bool = False, + name=None, +): + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, data_type) + + spec_inc_multiquery_self_attention_output = ffmodel.spec_inc_multiquery_self_attention( + input_tensor, + embed_dim, + num_q_heads, + num_kv_heads, + kdim=kdim, + vdim=vdim, + dropout=dropout, + bias=bias, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + data_type=data_type, + kernel_initializer=kernel_initializer, + apply_rotary_embedding=apply_rotary_embedding, + scaling_query=scaling_query, + scaling_factor=scaling_factor, + qk_prod_scaling=qk_prod_scaling, + position_bias=position_bias, + name="spec_inc_multiquery_self_attention_layer", + ) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY] + ) + + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + spec_inc_multiquery_self_attention_output.inline_map(ffmodel, ffconfig) + output_result = spec_inc_multiquery_self_attention_output.get_array(ffmodel, ffconfig) + + return output_result + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32) + embed_dim_value = 64 + num_q_heads_value = 4 + num_kv_heads_value = 4 + + output_result = test_spec_inc_multiquery_self_attention( + ffconfig, + input_data, + embed_dim=embed_dim_value, + num_q_heads=num_q_heads_value, + num_kv_heads=num_kv_heads_value, + kdim=0, # Example value for kdim + vdim=0, # Example value for vdim + dropout=0.1, # Example value for dropout + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_FLOAT, + kernel_initializer=None, # Example value for 
kernel_initializer + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + ) + + print("Input Array:") + print(input_data) + print("\nOutput Array after applying spec_inc_multiquery_self_attention:") + print(output_result) diff --git a/examples/python/native/ops/split.py b/examples/python/native/ops/split.py new file mode 100644 index 0000000000..d03a52a769 --- /dev/null +++ b/examples/python/native/ops/split.py @@ -0,0 +1,47 @@ +# The basis for this test of the 'split' operation is generated by ChatGPT using the manually created conv2d.py as a template. + +from typing import List + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_split(ffconfig, input_arr: np.ndarray) -> List[flexflow.core.Tensor]: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + out1, out2 = ffmodel.split(input_tensor, 2, axis=1) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + out1.inline_map(ffmodel, ffconfig) + out2.inline_map(ffmodel, ffconfig) + + return [out1.get_array(ffmodel, ffconfig), out2.get_array(ffmodel, ffconfig)] + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input = np.random.randn(ffconfig.batch_size, 10, 10, 10).astype(np.float32) + output_list = test_split(ffconfig, input) + + print("Output Tensor 1:") + print(output_list[0]) + + print("\nOutput Tensor 2:") + print(output_list[1]) diff --git a/examples/python/native/ops/subtract.py b/examples/python/native/ops/subtract.py new file mode 100644 index 0000000000..5f829cbae1 --- /dev/null +++ b/examples/python/native/ops/subtract.py @@ -0,0 +1,45 @@ +# The basis for this test of the 'subtract' operation is generated by ChatGPT using the manually created conv2d.py as a template. 
+ + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_subtract(ffconfig, input_arr1: np.ndarray, input_arr2: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor1 = ffmodel.create_tensor(input_arr1.shape, DataType.DT_FLOAT) + input_tensor2 = ffmodel.create_tensor(input_arr2.shape, DataType.DT_FLOAT) + + out = ffmodel.subtract(input_tensor1, input_tensor2) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input1 = ffmodel.create_data_loader(input_tensor1, input_arr1) + dataloader_input2 = ffmodel.create_data_loader(input_tensor2, input_arr2) + + ffmodel.init_layers() + + dataloader_input1.reset() + dataloader_input1.next_batch(ffmodel) + + dataloader_input2.reset() + dataloader_input2.next_batch(ffmodel) + + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input1 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + input2 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + _ = test_subtract(ffconfig, input1, input2) diff --git a/examples/python/native/ops/tanh.py b/examples/python/native/ops/tanh.py new file mode 100644 index 0000000000..ba4ba7d6ff --- /dev/null +++ b/examples/python/native/ops/tanh.py @@ -0,0 +1,46 @@ +# The basis for this test of the 'tanh' activation function is generated by ChatGPT using the manually created conv2d.py as a template. + + +import flexflow.core +import numpy as np +from flexflow.core import * + +def test_tanh(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + # Apply tanh activation + out = ffmodel.tanh(input_tensor) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + + result = test_tanh(ffconfig, input_data) + + print("Input Data:") + print(input_data) + + print("\nResult after tanh activation:") + print(result) diff --git a/examples/python/native/ops/transpose.py b/examples/python/native/ops/transpose.py new file mode 100644 index 0000000000..6f514d660c --- /dev/null +++ b/examples/python/native/ops/transpose.py @@ -0,0 +1,38 @@ +# The basis for this test of the 'transpose' operation is generated by ChatGPT using the manually created conv2d.py as a template. 
+ + +import flexflow.core +import numpy as np +from flexflow.core import * + + +def test_transpose(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor: + ffmodel = FFModel(ffconfig) + + input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT) + + out = ffmodel.transpose(input_tensor, [ffconfig.batch_size, 10, 5, 10]) + + ffoptimizer = SGDOptimizer(ffmodel, 0.001) + ffmodel.optimizer = ffoptimizer + ffmodel.compile( + loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]) + dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr) + + ffmodel.init_layers() + + dataloader_input.reset() + dataloader_input.next_batch(ffmodel) + ffmodel.forward() + + out.inline_map(ffmodel, ffconfig) + return out.get_array(ffmodel, ffconfig) + + +if __name__ == '__main__': + init_flexflow_runtime() + ffconfig = FFConfig() + + input = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32) + _ = test_transpose(ffconfig, input) From a83effedd6e0185a7e8225f445c0aaba840c1aca Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 20 Jun 2024 04:08:29 +0000 Subject: [PATCH 10/44] add code to keep runners registered --- .github/workflows/docker-build.yml | 41 ++++++++++++++++++++---------- .github/workflows/gpu-ci.yml | 24 +++++++++++++++++ 2 files changed, 52 insertions(+), 13 deletions(-) diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index d16179434b..eeaab0e0af 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -9,9 +9,9 @@ on: branches: - "inference" - "master" - # schedule: - # # Run every week on Sunday at midnight PT (3am ET / 8am UTC) to keep the docker images updated - # - cron: "0 8 * * 0" + schedule: + # At 00:00 on day-of-month 1, 14, and 28. 
+ - cron: "0 0 1,14,28 * *" workflow_dispatch: # Cancel outdated workflows if they are still running @@ -58,13 +58,28 @@ jobs: - name: Check availability of flexflow modules in Python run: docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${hip_version}:latest -c "python -c 'import flexflow.core; import flexflow.serve as ff; exit()'" - + + keep-runner-registered: + name: Keep runner alive + if: ${{ github.event_name == 'schedule' }} + runs-on: [self-hosted, rocm_builder] + defaults: + run: + shell: bash -l {0} # required to use an activated conda environment + env: + CONDA: "3" + needs: rocm-builder-start + steps: + - name: Keep alive + run: | + echo "Keep self-hosted runner registered with Github" + sleep 10m docker-build-and-publish-rocm: name: Build and Deploy FlexFlow Docker Containers (ROCm backend) needs: rocm-builder-start runs-on: [self-hosted, rocm_builder] - if: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} + if: ${{ ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} strategy: matrix: hip_version: ["5.3", "5.4", "5.5", "5.6"] @@ -106,19 +121,19 @@ jobs: cuda_version: ${{ matrix.cuda_version }} steps: - name: Checkout Git Repository - if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }} + if: ${{ ( ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }} uses: actions/checkout@v3 with: submodules: recursive - name: Free additional space on runner - if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }} + if: ${{ ( ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }} run: .github/workflows/helpers/free_space_on_runner.sh - name: Build Docker container - if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }} + if: ${{ ( ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }} env: - deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} + deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} build_needed: ${{ matrix.cuda_version == '12.0' }} run: | # On push to inference, build for all compatible architectures, so that we can publish @@ -133,11 +148,11 @@ jobs: fi - name: Check availability of flexflow modules in Python - if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }} + if: ${{ ( ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }} run: docker run --entrypoint /bin/bash 
flexflow-${FF_GPU_BACKEND}-${cuda_version}:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; import flexflow.serve as ff; exit()'" - name: Publish Docker environment image (on push to inference) - if: ${{ github.repository_owner == 'flexflow' && ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} + if: ${{ github.repository_owner == 'flexflow' && ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} env: FLEXFLOW_CONTAINER_TOKEN: ${{ secrets.FLEXFLOW_CONTAINER_TOKEN }} run: | @@ -145,7 +160,7 @@ jobs: ./docker/publish.sh flexflow rocm-builder-stop: - needs: docker-build-and-publish-rocm + needs: [docker-build-and-publish-rocm, keep-runner-registered] if: ${{ always() && ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} runs-on: ubuntu-latest name: Stop the AWS instance we used to build the ROCM Docker images @@ -166,7 +181,7 @@ jobs: name: Notify Slack in case of failure runs-on: ubuntu-20.04 needs: [docker-build-cuda, docker-build-and-publish-rocm] - if: ${{ failure() && github.event_name == 'schedule' && github.repository_owner == 'flexflow' }} + if: ${{ failure() && github.event_name == 'workflow_dispatch' && github.repository_owner == 'flexflow' }} steps: - name: Send Slack message env: diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 7bdb6805a8..c7d0cd72cb 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -1,5 +1,7 @@ name: "gpu-ci" on: + schedule: + - cron: "0 0 1,14,28 * *" # At 00:00 on day-of-month 1, 14, and 28. 
push: branches: - "inference" @@ -43,8 +45,28 @@ jobs: pip3 install pygithub python3 .github/workflows/helpers/gpu_ci_helper.py + keep-runner-registered: + name: Keep runner alive + if: ${{ github.event_name == 'schedule' }} + runs-on: [self-hosted, gpu] + defaults: + run: + shell: bash -l {0} # required to use an activated conda environment + env: + CONDA: "3" + needs: gpu-ci-concierge + container: + image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest + options: --gpus all --shm-size=8192m + steps: + - name: Keep alive + run: | + echo "Keep self-hosted runner registered with Github" + sleep 10m + python-interface-check: name: Check Python Interface + if: ${{ github.event_name != 'schedule' }} runs-on: [self-hosted, gpu] defaults: run: @@ -119,6 +141,7 @@ jobs: inference-tests: name: Inference Tests + if: ${{ github.event_name != 'schedule' }} runs-on: [self-hosted, gpu] defaults: run: @@ -195,6 +218,7 @@ jobs: training-tests: name: Training Tests + if: ${{ github.event_name != 'schedule' }} runs-on: [self-hosted, gpu] # skip this time-consuming test for PRs to the inference branch # if: ${{ github.event_name != 'pull_request' || github.base_ref != 'inference' }} From 4f82aaed6317cef0a2587848a3b6d57f1d709381 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 10 Jul 2024 23:15:28 -0400 Subject: [PATCH 11/44] fix docker --- docker/flexflow-environment/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index 6ca337f58d..cef619ad68 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -37,6 +37,7 @@ RUN MINICONDA_SCRIPT_NAME=Miniconda3-py311_23.5.2-0-Linux-x86_64.sh; \ chmod +x ~/${MINICONDA_SCRIPT_NAME} && \ bash ~/${MINICONDA_SCRIPT_NAME} -b -p /opt/conda && \ rm ~/${MINICONDA_SCRIPT_NAME} && \ + /opt/conda/bin/conda config --set solver classic && \ /opt/conda/bin/conda upgrade --all && \ /opt/conda/bin/conda install conda-build conda-verify && \ /opt/conda/bin/conda clean -ya From 25fb40772f587892510bfe0ca296ae54768ff35c Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Thu, 11 Jul 2024 15:16:40 -0400 Subject: [PATCH 12/44] [Tokenizer] update tokenizers-cpp repo --- deps/tokenizers-cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/tokenizers-cpp b/deps/tokenizers-cpp index 4f42c9fa74..c0fab1e14a 160000 --- a/deps/tokenizers-cpp +++ b/deps/tokenizers-cpp @@ -1 +1 @@ -Subproject commit 4f42c9fa74946d70af86671a3804b6f2433e5dac +Subproject commit c0fab1e14a9421c1501acee5b7703e5dafa60479 From 6a1a1886909fc864aadfb10823077f94fe03b72e Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sat, 3 Aug 2024 08:31:37 -0700 Subject: [PATCH 13/44] minor bug fix (#1456) --- .../ops/kernels/inc_multihead_self_attention_kernels.h | 3 ++- src/ops/attention.cu | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h index 9bf2f581e2..26dcf12425 100644 --- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h @@ -56,7 +56,8 @@ __global__ void apply_proj_bias_qkv(DT *input_ptr, int num_heads, int num_kv_heads, bool scaling_query, - float scaling_factor); + float scaling_factor, + int hidden_size); #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) template diff --git a/src/ops/attention.cu 
b/src/ops/attention.cu index 9b8b90da70..18fc810aed 100644 --- a/src/ops/attention.cu +++ b/src/ops/attention.cu @@ -206,7 +206,7 @@ MultiHeadAttentionMeta::MultiHeadAttentionMeta(FFHandler handler, checkCUDNN(cudnnCreateSeqDataDescriptor(&oDesc)); // Currently do not support adding bias to key/value projection assert(!attn->add_bias_kv); - cudnnAttnQueryMap_t attnMode = CUDNN_ATTN_QUERYMAP_ALL_TO_ONE; + unsigned attnMode = CUDNN_ATTN_QUERYMAP_ALL_TO_ONE; // Assume no beam search for now int maxBeamSize = 1; // printf("batchSize(%d) qSize(%d) kSize(%d) vSize(%d) qProjSize(%d) From 9784b5c6516bafe272fc6555daaa9b867a5eacfa Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Mon, 12 Aug 2024 11:02:49 -0700 Subject: [PATCH 14/44] update legion version (#1307) * update legion version * legion version update * update legion version --- CMakeLists.txt | 2 +- deps/legion | 2 +- examples/cpp/AlexNet/alexnet.cc | 2 +- examples/cpp/DLRM/dlrm.cc | 2 +- examples/cpp/InceptionV3/inception.cc | 2 +- examples/cpp/ResNet/resnet.cc | 2 +- examples/cpp/Transformer/transformer.cc | 2 +- examples/cpp/XDL/xdl.cc | 2 +- examples/cpp/candle_uno/candle_uno.cc | 2 +- examples/cpp/mixture_of_experts/moe.cc | 2 +- examples/cpp/resnext50/resnext.cc | 2 +- examples/cpp/split_test/split_test.cc | 2 +- examples/cpp/split_test_2/split_test_2.cc | 2 +- include/flexflow/graph.h | 2 +- include/flexflow/operator.h | 4 +++- include/flexflow/utils/recursive_logger.h | 4 ++-- inference/incr_decoding/incr_decoding.cc | 2 +- inference/spec_infer/spec_infer.cc | 2 +- src/mapper/mapper.cc | 7 ++++++- src/ops/beam_topk.cpp | 2 +- src/ops/beam_topk.cu | 2 +- src/ops/inc_multihead_self_attention.cc | 2 +- src/ops/tree_inc_multihead_self_attention.cc | 2 +- src/runtime/batch_config.cc | 2 +- src/runtime/beam_search_batch_config.cc | 2 +- src/runtime/graph.cc | 4 ++-- src/runtime/inference_manager.cc | 4 ++-- src/runtime/model.cc | 6 ++++-- src/runtime/optimizer_kernel.cpp | 4 ++-- src/runtime/optimizer_kernel.cu | 2 +- src/runtime/request_manager.cc | 2 +- src/runtime/simulator.cc | 8 ++++---- src/runtime/substitution.cc | 4 ++-- src/runtime/tree_verify_batch_config.cc | 2 +- tests/ops/batch_matmul_test.cc | 2 +- tests/ops/concat_test.cc | 2 +- tests/ops/flat_test.cc | 2 +- tests/ops/linear_test.cc | 2 +- tests/ops/reshape_test.cc | 2 +- tests/ops/tanh_test.cc | 2 +- tests/ops/transpose_test.cc | 2 +- 41 files changed, 59 insertions(+), 50 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 43ce4f7044..7079fdadb8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -425,7 +425,7 @@ if(NOT BUILD_LEGION_ONLY) # generate the Legion Python bindings library. When building from pip, we need to do this post-install to prevent Legion from overwriting the path to the Legion shared library add_custom_command(TARGET flexflow POST_BUILD - COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${Legion_BINARY_DIR} --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS} + COMMAND CMAKE_BUILD_DIR=${Legion_BINARY_DIR}/runtime CMAKE_INSTALL_PREFIX=${Legion_BINARY_DIR} ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python ) # create flexflow_python interpreter. 
When building from pip, we install the FF_HOME/python/flexflow_python script instead. diff --git a/deps/legion b/deps/legion index 24e8c45234..02eb1010ca 160000 --- a/deps/legion +++ b/deps/legion @@ -1 +1 @@ -Subproject commit 24e8c452341dea41427e0ce61e154d61715e6835 +Subproject commit 02eb1010ca9eb449d345a0db97eab17efb0e5af0 diff --git a/examples/cpp/AlexNet/alexnet.cc b/examples/cpp/AlexNet/alexnet.cc index 128496eab1..3507882329 100644 --- a/examples/cpp/AlexNet/alexnet.cc +++ b/examples/cpp/AlexNet/alexnet.cc @@ -26,7 +26,7 @@ using FlexFlow::ParallelTensor; using FlexFlow::SGDOptimizer; using FlexFlow::Tensor; -LegionRuntime::Logger::Category log_app("AlexNet"); +Legion::Logger log_app("AlexNet"); void parse_input_args(char **argv, int argc, AlexNetConfig &config) { for (int i = 1; i < argc; i++) { diff --git a/examples/cpp/DLRM/dlrm.cc b/examples/cpp/DLRM/dlrm.cc index 7dc49215b3..d7dc167557 100644 --- a/examples/cpp/DLRM/dlrm.cc +++ b/examples/cpp/DLRM/dlrm.cc @@ -19,7 +19,7 @@ using namespace Legion; -LegionRuntime::Logger::Category log_app("DLRM"); +Legion::Logger log_app("DLRM"); void parse_input_args(char **argv, int argc, DLRMConfig &apConfig); diff --git a/examples/cpp/InceptionV3/inception.cc b/examples/cpp/InceptionV3/inception.cc index b2070cc52d..6d0fa7ee53 100644 --- a/examples/cpp/InceptionV3/inception.cc +++ b/examples/cpp/InceptionV3/inception.cc @@ -21,7 +21,7 @@ using namespace Legion; using namespace FlexFlow; -LegionRuntime::Logger::Category log_app("Inceptionv3"); +Legion::Logger log_app("Inceptionv3"); Tensor InceptionA(FFModel &ff, Tensor input, int pool_features) { Tensor t1 = input; diff --git a/examples/cpp/ResNet/resnet.cc b/examples/cpp/ResNet/resnet.cc index 455eb743ae..49ce934a6a 100644 --- a/examples/cpp/ResNet/resnet.cc +++ b/examples/cpp/ResNet/resnet.cc @@ -24,7 +24,7 @@ using FlexFlow::Optimizer; using FlexFlow::SGDOptimizer; using FlexFlow::Tensor; -LegionRuntime::Logger::Category log_app("ResNet"); +Legion::Logger log_app("ResNet"); void parse_input_args(char **argv, int argc, ResNetConfig &config) { for (int i = 1; i < argc; i++) { diff --git a/examples/cpp/Transformer/transformer.cc b/examples/cpp/Transformer/transformer.cc index d61a63cd03..b04093b0a9 100644 --- a/examples/cpp/Transformer/transformer.cc +++ b/examples/cpp/Transformer/transformer.cc @@ -17,7 +17,7 @@ using namespace Legion; -LegionRuntime::Logger::Category log_app("Transformer"); +Legion::Logger log_app("Transformer"); Tensor create_emb(FFModel *model, Tensor const &input, diff --git a/examples/cpp/XDL/xdl.cc b/examples/cpp/XDL/xdl.cc index 2e6c3cec98..a2272f36e5 100644 --- a/examples/cpp/XDL/xdl.cc +++ b/examples/cpp/XDL/xdl.cc @@ -18,7 +18,7 @@ using namespace Legion; -LegionRuntime::Logger::Category log_app("XDL"); +Legion::Logger log_app("XDL"); void parse_input_args(char **argv, int argc, XDLConfig &apConfig); diff --git a/examples/cpp/candle_uno/candle_uno.cc b/examples/cpp/candle_uno/candle_uno.cc index 779b8e9c14..e9f4bf876a 100644 --- a/examples/cpp/candle_uno/candle_uno.cc +++ b/examples/cpp/candle_uno/candle_uno.cc @@ -21,7 +21,7 @@ using namespace Legion; using namespace std; -LegionRuntime::Logger::Category log_app("Candle_Uno"); +Legion::Logger log_app("Candle_Uno"); void parse_input_args(char **argv, int argc, CandleConfig &apConfig); diff --git a/examples/cpp/mixture_of_experts/moe.cc b/examples/cpp/mixture_of_experts/moe.cc index a707310885..a25f94abd9 100644 --- a/examples/cpp/mixture_of_experts/moe.cc +++ b/examples/cpp/mixture_of_experts/moe.cc @@ -20,7 +20,7 
@@ using namespace Legion; -LegionRuntime::Logger::Category log_app("MoE"); +Legion::Logger log_app("MoE"); void parse_input_args(char **argv, int argc, MoeConfig &config) { for (int i = 1; i < argc; i++) { diff --git a/examples/cpp/resnext50/resnext.cc b/examples/cpp/resnext50/resnext.cc index 3c28ca27b8..9b71b37cce 100644 --- a/examples/cpp/resnext50/resnext.cc +++ b/examples/cpp/resnext50/resnext.cc @@ -7,7 +7,7 @@ using FlexFlow::Optimizer; using FlexFlow::SGDOptimizer; using FlexFlow::Tensor; -LegionRuntime::Logger::Category log_app("resnext"); +Legion::Logger log_app("resnext"); Tensor resnext_block(FFModel &ff, Tensor input, diff --git a/examples/cpp/split_test/split_test.cc b/examples/cpp/split_test/split_test.cc index 97b98c3214..ac9d516a59 100644 --- a/examples/cpp/split_test/split_test.cc +++ b/examples/cpp/split_test/split_test.cc @@ -3,7 +3,7 @@ using namespace Legion; using namespace FlexFlow; -LegionRuntime::Logger::Category log_app("split_test"); +Legion::Logger log_app("split_test"); void FlexFlow::top_level_task(Task const *task, std::vector const ®ions, diff --git a/examples/cpp/split_test_2/split_test_2.cc b/examples/cpp/split_test_2/split_test_2.cc index 69385d14cb..fef078adbc 100644 --- a/examples/cpp/split_test_2/split_test_2.cc +++ b/examples/cpp/split_test_2/split_test_2.cc @@ -9,7 +9,7 @@ using FlexFlow::PCG::Graph; using FlexFlow::PCG::GraphSearchHelper; using FlexFlow::PCG::Node; -LegionRuntime::Logger::Category log_app("split_test_2"); +Legion::Logger log_app("split_test_2"); void top_level_task(Task const *task, std::vector const ®ions, diff --git a/include/flexflow/graph.h b/include/flexflow/graph.h index 2e0cf1ca4b..9dc6572593 100644 --- a/include/flexflow/graph.h +++ b/include/flexflow/graph.h @@ -24,7 +24,7 @@ #include "legion/legion_utilities.h" #include -extern LegionRuntime::Logger::Category log_dp; +extern Legion::Logger log_dp; namespace FlexFlow::PCG { diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 1b19bdb82f..311699d926 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -19,7 +19,7 @@ namespace FlexFlow { -extern LegionRuntime::Logger::Category log_measure; +extern Legion::Logger log_measure; class OpMeta; class Simulator; @@ -233,6 +233,8 @@ class Op { std::vector const &, MachineView const *mv = nullptr) { assert(false); + Legion::FutureMap empty_map; + return empty_map; }; virtual void print_layer(FFModel const &model) = 0; template diff --git a/include/flexflow/utils/recursive_logger.h b/include/flexflow/utils/recursive_logger.h index 2c43b42309..d073f58f3e 100644 --- a/include/flexflow/utils/recursive_logger.h +++ b/include/flexflow/utils/recursive_logger.h @@ -26,7 +26,7 @@ class DepthTag { class RecursiveLogger { public: - /* RecursiveLogger(LegionRuntime::Logger::Category const &); */ + /* RecursiveLogger(Legion::Logger const &); */ RecursiveLogger(std::string const &category_name); Realm::LoggerMessage info(); @@ -42,7 +42,7 @@ class RecursiveLogger { void print_prefix(Realm::LoggerMessage &) const; - LegionRuntime::Logger::Category logger; + Legion::Logger logger; }; }; // namespace FlexFlow diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index aae7256ffe..ec3dda3158 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -28,7 +28,7 @@ using namespace FlexFlow; using namespace Legion; using json = nlohmann::json; -LegionRuntime::Logger::Category log_app("llama"); +Legion::Logger 
log_app("llama"); struct FilePaths { std::string cache_folder_path; diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index f7edfd7696..60233ac8d1 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -26,7 +26,7 @@ using namespace FlexFlow; using namespace Legion; using json = nlohmann::json; -LegionRuntime::Logger::Category log_app("llama"); +Legion::Logger log_app("llama"); struct FilePaths { std::string cache_folder_path; diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc index c293aecb19..4413d516ac 100644 --- a/src/mapper/mapper.cc +++ b/src/mapper/mapper.cc @@ -20,7 +20,7 @@ namespace FlexFlow { using namespace Legion; using namespace Mapping; -LegionRuntime::Logger::Category log_ff_mapper("Mapper"); +Legion::Logger log_ff_mapper("Mapper"); FFShardingFunctor::FFShardingFunctor(int _gpus_per_node, int _cpus_per_node, @@ -296,6 +296,7 @@ void FFMapper::select_task_options(const MapperContext ctx, // control replicate top level task if (enable_control_replication) { output.replicate = true; + output.map_locally = false; } return; } @@ -560,6 +561,10 @@ void FFMapper::map_task(const MapperContext ctx, assert(output.target_procs[i].address_space() == node_id); } } + if (input.shard_processor.exists()) { + output.target_procs = std::vector{input.shard_processor}; + } + // Find instances that still need to be mapped std::vector> missing_fields(task.regions.size()); runtime->filter_instances(ctx, diff --git a/src/ops/beam_topk.cpp b/src/ops/beam_topk.cpp index 18534455a0..8545bea7cb 100644 --- a/src/ops/beam_topk.cpp +++ b/src/ops/beam_topk.cpp @@ -25,7 +25,7 @@ using Legion::coord_t; enum class HeapType { kMinHeap, kMaxHeap }; enum class PreferIndices { kLower, kHigher }; -LegionRuntime::Logger::Category log_beam_topk("BeamTopK"); +Legion::Logger log_beam_topk("BeamTopK"); template struct Entry { diff --git a/src/ops/beam_topk.cu b/src/ops/beam_topk.cu index a958786be3..c24bdf7c74 100644 --- a/src/ops/beam_topk.cu +++ b/src/ops/beam_topk.cu @@ -25,7 +25,7 @@ using Legion::coord_t; enum class HeapType { kMinHeap, kMaxHeap }; enum class PreferIndices { kLower, kHigher }; -LegionRuntime::Logger::Category log_beam_topk("BeamTopK"); +Legion::Logger log_beam_topk("BeamTopK"); template struct Entry { diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 7aa3503770..8688585788 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -46,7 +46,7 @@ using Legion::TaskArgument; using Legion::TaskLauncher; using PCG::Node; -LegionRuntime::Logger::Category log_inc_mha("IncrementalMHA"); +Legion::Logger log_inc_mha("IncrementalMHA"); bool IncMultiHeadSelfAttentionParams::is_valid( ParallelTensorShape const &input) const { diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index d0efb01d54..9b8c88420d 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -46,7 +46,7 @@ using Legion::TaskArgument; using Legion::TaskLauncher; using PCG::Node; -LegionRuntime::Logger::Category log_tree_verify("TreeVerifyIncMHA"); +Legion::Logger log_tree_verify("TreeVerifyIncMHA"); bool TreeIncMultiHeadSelfAttentionParams::is_valid( ParallelTensorShape const &input) const { diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index bd96dbb141..7989b0799e 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc 
@@ -21,7 +21,7 @@ namespace FlexFlow { -LegionRuntime::Logger::Category log_bc("BatchConfig"); +Legion::Logger log_bc("BatchConfig"); using Legion::Future; using Legion::Memory; diff --git a/src/runtime/beam_search_batch_config.cc b/src/runtime/beam_search_batch_config.cc index ff7bf1a819..0509c23afe 100644 --- a/src/runtime/beam_search_batch_config.cc +++ b/src/runtime/beam_search_batch_config.cc @@ -24,7 +24,7 @@ namespace FlexFlow { -LegionRuntime::Logger::Category log_beam_bc("BeamSearchBatchConfig"); +Legion::Logger log_beam_bc("BeamSearchBatchConfig"); BeamSearchBatchConfig::BeamSearchBatchConfig() : BatchConfig() { this->beam_width = DEFAULT_BEAM_WIDTH; diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index f8e8240ccf..cf75235ae7 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -66,8 +66,8 @@ namespace FlexFlow::PCG { using namespace Legion; using FlexFlow::MachineView; -LegionRuntime::Logger::Category log_graph("graph"); -LegionRuntime::Logger::Category log_simplify("graph_simplify"); +Legion::Logger log_graph("graph"); +Legion::Logger log_simplify("graph_simplify"); const Node Node::INVALID_NODE = Node(); diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 2a94df8b4d..3d299aeedd 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -25,8 +25,8 @@ namespace FlexFlow { using namespace Legion; -LegionRuntime::Logger::Category log_inf_mgr("InferenceManager"); -LegionRuntime::Logger::Category log_offload("Offloading"); +Legion::Logger log_inf_mgr("InferenceManager"); +Legion::Logger log_offload("Offloading"); InferenceManager::InferenceManager() {} diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 1fa281777a..5cad628743 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -82,8 +82,8 @@ namespace FlexFlow { using namespace Legion; -LegionRuntime::Logger::Category log_model("Model"); -LegionRuntime::Logger::Category log_measure("measure"); +Legion::Logger log_model("Model"); +Legion::Logger log_measure("measure"); Op::Op(FFModel &model, OperatorType otype, @@ -6748,6 +6748,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(SGD_UPD_NCCL_TASK_ID, "SGD NCCL Update"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "SGD NCCL Update Task"); @@ -6898,6 +6899,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, "NCCL Init Communicators"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "NCCL Init Communicators Task"); diff --git a/src/runtime/optimizer_kernel.cpp b/src/runtime/optimizer_kernel.cpp index e71adc87a8..59efaf5256 100644 --- a/src/runtime/optimizer_kernel.cpp +++ b/src/runtime/optimizer_kernel.cpp @@ -21,7 +21,7 @@ namespace FlexFlow { -LegionRuntime::Logger::Category log_optimizer("optimizer"); +Legion::Logger log_optimizer("optimizer"); __global__ void sgd_update(size_t count, float lr, @@ -247,4 +247,4 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, } #endif -}; // namespace FlexFlow \ No newline at end of file +}; // namespace FlexFlow diff --git a/src/runtime/optimizer_kernel.cu b/src/runtime/optimizer_kernel.cu index 5f654fbb5b..df37e3b135 100644 --- a/src/runtime/optimizer_kernel.cu +++ 
b/src/runtime/optimizer_kernel.cu @@ -20,7 +20,7 @@ namespace FlexFlow { -LegionRuntime::Logger::Category log_optimizer("optimizer"); +Legion::Logger log_optimizer("optimizer"); __global__ void sgd_update(size_t count, float lr, diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 16513e918a..d21285eef2 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -29,7 +29,7 @@ namespace FlexFlow { using namespace Legion; using tokenizers::Tokenizer; -LegionRuntime::Logger::Category log_req_mgr("RequestManager"); +Legion::Logger log_req_mgr("RequestManager"); std::string LoadBytesFromFile(std::string const &path) { std::ifstream fs(path, std::ios::in | std::ios::binary); diff --git a/src/runtime/simulator.cc b/src/runtime/simulator.cc index d943376416..b71af0d47e 100644 --- a/src/runtime/simulator.cc +++ b/src/runtime/simulator.cc @@ -31,10 +31,10 @@ namespace FlexFlow { using namespace Legion; -LegionRuntime::Logger::Category log_sim("sim"); -LegionRuntime::Logger::Category log_ps_sim("ps_sim"); -LegionRuntime::Logger::Category log_xfer_sim("xfer_sim"); -LegionRuntime::Logger::Category log_xfer_est("xfer_est"); +Legion::Logger log_sim("sim"); +Legion::Logger log_ps_sim("ps_sim"); +Legion::Logger log_xfer_sim("xfer_sim"); +Legion::Logger log_xfer_est("xfer_est"); // template class std::map; // for debugging in gdb // template class std::map; // for debugging in gdb diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index c0804d6e19..b86964049d 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -54,8 +54,8 @@ namespace FlexFlow::PCG { using namespace Legion; -LegionRuntime::Logger::Category log_xfers("xfers"); -LegionRuntime::Logger::Category log_xfer_matches("xfer_matches"); +Legion::Logger log_xfers("xfers"); +Legion::Logger log_xfer_matches("xfer_matches"); const TensorX TensorX::NO_TX = TensorX(); diff --git a/src/runtime/tree_verify_batch_config.cc b/src/runtime/tree_verify_batch_config.cc index 841c735f59..49d42bb6dd 100644 --- a/src/runtime/tree_verify_batch_config.cc +++ b/src/runtime/tree_verify_batch_config.cc @@ -21,7 +21,7 @@ namespace FlexFlow { -LegionRuntime::Logger::Category log_tree_bc("TreeVerifyBatchConfig"); +Legion::Logger log_tree_bc("TreeVerifyBatchConfig"); TreeVerifyBatchConfig::TreeVerifyBatchConfig() : BatchConfig() {} diff --git a/tests/ops/batch_matmul_test.cc b/tests/ops/batch_matmul_test.cc index 7931f44129..f61048febf 100644 --- a/tests/ops/batch_matmul_test.cc +++ b/tests/ops/batch_matmul_test.cc @@ -5,7 +5,7 @@ #include #include using namespace Legion; -LegionRuntime::Logger::Category log_app("bmm_test"); +Legion::Logger log_app("bmm_test"); struct BMMTestMeta { int m, k, n, d; diff --git a/tests/ops/concat_test.cc b/tests/ops/concat_test.cc index c67b718e0e..b0489d1adb 100644 --- a/tests/ops/concat_test.cc +++ b/tests/ops/concat_test.cc @@ -5,7 +5,7 @@ #include #include using namespace Legion; -LegionRuntime::Logger::Category log_app("concat_test"); +Legion::Logger log_app("concat_test"); struct ConcatTestMeta { int batch_size, i_dim, num_channels, projected_num_channels, diff --git a/tests/ops/flat_test.cc b/tests/ops/flat_test.cc index 428893a0dc..61de83b6b0 100644 --- a/tests/ops/flat_test.cc +++ b/tests/ops/flat_test.cc @@ -7,7 +7,7 @@ #include using namespace Legion; -LegionRuntime::Logger::Category log_app("Flat_test"); +Legion::Logger log_app("Flat_test"); struct FlatTestMeta { int i_dim, o_dim; diff --git a/tests/ops/linear_test.cc 
b/tests/ops/linear_test.cc index 5b65de3a56..7c84ad1078 100644 --- a/tests/ops/linear_test.cc +++ b/tests/ops/linear_test.cc @@ -5,7 +5,7 @@ #include #include using namespace Legion; -LegionRuntime::Logger::Category log_app("linear_test"); +Legion::Logger log_app("linear_test"); struct LinearTestMeta { int batch_size, i_dim, num_channels, dense_projection_o_dim, diff --git a/tests/ops/reshape_test.cc b/tests/ops/reshape_test.cc index e8f4586b23..a8aa046a64 100644 --- a/tests/ops/reshape_test.cc +++ b/tests/ops/reshape_test.cc @@ -6,7 +6,7 @@ #include #define PRECISION 16 using namespace Legion; -LegionRuntime::Logger::Category log_app("Reshape_test"); +Legion::Logger log_app("Reshape_test"); struct ReshapeTestMeta { int i_dim, o_dim; diff --git a/tests/ops/tanh_test.cc b/tests/ops/tanh_test.cc index 1c24d96aaf..1e86934f86 100644 --- a/tests/ops/tanh_test.cc +++ b/tests/ops/tanh_test.cc @@ -6,7 +6,7 @@ #include #define PRECISION 16 using namespace Legion; -LegionRuntime::Logger::Category log_app("Tanh_test"); +Legion::Logger log_app("Tanh_test"); struct TanhTestMeta { int i_dim, o_dim; diff --git a/tests/ops/transpose_test.cc b/tests/ops/transpose_test.cc index 10481aa14f..045f28479c 100644 --- a/tests/ops/transpose_test.cc +++ b/tests/ops/transpose_test.cc @@ -5,7 +5,7 @@ #include #include using namespace Legion; -LegionRuntime::Logger::Category log_app("transpose_test"); +Legion::Logger log_app("transpose_test"); struct TransposeTestMeta { int m, k, d; From f747438f0927ec528d481cfd6b9c7f15465677c9 Mon Sep 17 00:00:00 2001 From: Zhuofu Chen <59316330+chenzhuofu@users.noreply.github.com> Date: Tue, 13 Aug 2024 10:49:54 -0400 Subject: [PATCH 15/44] Managed mem support (#1466) * feat: fix missed compile definition * feat: add func `get_proc_mem` to process memory allocation * chore: minor * chore: try to use get_proc_mem * fix: proc_mem allocation * feat: switch to use get_proc_mem * feat: update Realm::Logger definition * fix: now all memory are allocated by get_proc_mem * chore: minor * fix: no memory allocation bugs * chore: merge file * chore: don't use ManagedMemory for now --- CMakeLists.txt | 1 + include/flexflow/model.h | 1 + include/flexflow/ops/batch_norm.h | 1 + include/flexflow/utils/memory_allocator.h | 2 ++ src/mapper/mapper.cc | 7 ++----- src/ops/add_bias_residual_layer_norm.cc | 5 +---- src/ops/argmax.cc | 5 +---- src/ops/attention.cc | 5 +---- src/ops/batch_norm.cpp | 5 +---- src/ops/batch_norm.cu | 5 +---- src/ops/beam_topk.cc | 5 +---- src/ops/dropout.cc | 5 +---- src/ops/inc_multihead_self_attention.cc | 5 +---- src/ops/layer_norm.cc | 5 +---- src/ops/linear.cc | 5 +---- src/ops/residual_layer_norm.cc | 5 +---- src/ops/residual_rms_norm.cc | 5 +---- src/ops/rms_norm.cc | 5 +---- src/ops/sampling.cc | 5 +---- src/ops/sigmoid_silu_multi.cc | 5 +---- src/ops/spec_inc_multihead_self_attention.cc | 5 +---- src/ops/tree_inc_multihead_self_attention.cc | 5 +---- src/runtime/graph.cc | 5 +---- src/runtime/memory_allocator.cc | 12 ++++++++++++ src/runtime/model.cc | 4 ++-- src/runtime/model.cpp | 15 +++------------ src/runtime/model.cu | 15 +++------------ 27 files changed, 45 insertions(+), 103 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7079fdadb8..d7a6391e06 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -496,6 +496,7 @@ if(NOT BUILD_LEGION_ONLY) if(NOT CARGO_RESULT EQUAL 0) message(FATAL_ERROR "Rust is installed, but cargo is not. 
Please install it by running: 'curl https://sh.rustup.rs -sSf | sh -s -- -y' and following the instructions on the screen.") endif() + set(MLC_ENABLE_SENTENCEPIECE_TOKENIZER ON) add_subdirectory(deps/tokenizers-cpp tokenizers EXCLUDE_FROM_ALL) target_include_directories(flexflow PUBLIC deps/tokenizers-cpp/include) target_link_libraries(flexflow tokenizers_cpp) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 95be9ab581..ea64f65a95 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -22,6 +22,7 @@ #include "flexflow/node.h" #include "flexflow/operator_params.h" #include "flexflow/utils/hash_utils.h" +#include "flexflow/utils/memory_allocator.h" #include "flexflow/utils/tuple.h" #include "initializer.h" #include "layer.h" diff --git a/include/flexflow/ops/batch_norm.h b/include/flexflow/ops/batch_norm.h index c923dc1097..01cc0e16ec 100644 --- a/include/flexflow/ops/batch_norm.h +++ b/include/flexflow/ops/batch_norm.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_BATCH_NORM_H #include "flexflow/model.h" +#include "flexflow/utils/memory_allocator.h" namespace FlexFlow { diff --git a/include/flexflow/utils/memory_allocator.h b/include/flexflow/utils/memory_allocator.h index 8e50a4c3b3..7091b159b2 100644 --- a/include/flexflow/utils/memory_allocator.h +++ b/include/flexflow/utils/memory_allocator.h @@ -62,6 +62,8 @@ class MemoryAllocator { size_t instance_total_size, instance_allocated_size; }; +Legion::Memory get_proc_mem(Legion::Machine machine, Legion::Processor proc); + }; // namespace FlexFlow #endif // _FLEXFLOW_RUNTIME_H_ diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc index 4413d516ac..d7b9a5e99d 100644 --- a/src/mapper/mapper.cc +++ b/src/mapper/mapper.cc @@ -14,6 +14,7 @@ */ #include "flexflow/mapper.h" +#include "flexflow/utils/memory_allocator.h" namespace FlexFlow { @@ -81,11 +82,7 @@ FFMapper::FFMapper(MapperRuntime *rt, if (it->address_space() == node_id) { local_gpus.push_back(*it); } - Machine::MemoryQuery fb_query(machine); - fb_query.only_kind(Memory::GPU_FB_MEM); - fb_query.best_affinity_to(*it); - assert(fb_query.count() == 1); - proc_fbmems[*it] = *(fb_query.begin()); + proc_fbmems[*it] = get_proc_mem(machine, *it); Machine::MemoryQuery zc_query(machine); zc_query.only_kind(Memory::Z_COPY_MEM); zc_query.has_affinity_to(*it); diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index e670380901..a17e156f18 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -493,10 +493,7 @@ OpMeta *AddBiasResidualLayerNorm::init_task( Runtime *runtime) { AddBiasResidualLayerNorm *ln = (AddBiasResidualLayerNorm *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); AddBiasResidualLayerNormMeta *meta = new AddBiasResidualLayerNormMeta(handle, ln, gpu_mem_allocator); diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index a52ce1886b..1892ac2353 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -233,10 +233,7 @@ OpMeta *ArgMax::init_task(Task const *task, ctx, task->regions[1].region.get_index_space()); int length = acc_input.domain.hi()[0] - acc_input.domain.lo()[0] + 1; int batch_size = acc_input.domain.get_volume() / length; - Memory gpu_mem = 
Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); ArgMaxMeta *m = new ArgMaxMeta(handle, diff --git a/src/ops/attention.cc b/src/ops/attention.cc index 97afc94341..203662d3ec 100644 --- a/src/ops/attention.cc +++ b/src/ops/attention.cc @@ -514,10 +514,7 @@ OpMeta * acc_output.rect.hi[1] - acc_output.rect.lo[1] + 1); assert(attn->oProjSize == acc_output.rect.hi[0] - acc_output.rect.lo[0] + 1); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MultiHeadAttentionMeta *m = new MultiHeadAttentionMeta(handle, attn, gpu_mem, num_samples, num_heads); m->profiling = attn->profiling; diff --git a/src/ops/batch_norm.cpp b/src/ops/batch_norm.cpp index 106e5ebad2..7dee6fdaaf 100644 --- a/src/ops/batch_norm.cpp +++ b/src/ops/batch_norm.cpp @@ -61,10 +61,7 @@ __host__ OpMeta * int output_c = acc_output.rect.hi[2] - acc_output.rect.lo[2] + 1; int output_n = acc_output.rect.hi[3] - acc_output.rect.lo[3] + 1; - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); BatchNormMeta *m = new BatchNormMeta( handle, bm, gpu_mem, output_n, output_c, output_h, output_w); return m; diff --git a/src/ops/batch_norm.cu b/src/ops/batch_norm.cu index b77e9d489f..929ebf81f8 100644 --- a/src/ops/batch_norm.cu +++ b/src/ops/batch_norm.cu @@ -58,10 +58,7 @@ __host__ OpMeta * int output_c = acc_output.rect.hi[2] - acc_output.rect.lo[2] + 1; int output_n = acc_output.rect.hi[3] - acc_output.rect.lo[3] + 1; - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); BatchNormMeta *m = new BatchNormMeta( handle, bm, gpu_mem, output_n, output_c, output_h, output_w); return m; diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc index d2054cacb0..5f4547ace5 100644 --- a/src/ops/beam_topk.cc +++ b/src/ops/beam_topk.cc @@ -271,10 +271,7 @@ OpMeta *BeamTopK::init_task(Task const *task, Runtime *runtime) { BeamTopK *topk = (BeamTopK *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); BeamTopKMeta *m = new BeamTopKMeta(handle, topk, gpu_mem_allocator); m->profiling = topk->profiling; diff --git a/src/ops/dropout.cc b/src/ops/dropout.cc index 58cb82d53d..190d6fd496 100644 --- a/src/ops/dropout.cc +++ b/src/ops/dropout.cc @@ -164,10 +164,7 @@ OpMeta *Dropout::init_task(Task const *task, ctx, task->regions[0].region.get_index_space()); Domain output_domain = runtime->get_index_space_domain( ctx, task->regions[1].region.get_index_space()); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); 
assert(input_domain == output_domain); DropoutMeta *m = new DropoutMeta(handle, dropout, gpu_mem, output_domain); std::strcpy(m->op_name, dropout->name); diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 8688585788..aa60d0f19c 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -698,10 +698,7 @@ OpMeta *IncMultiHeadSelfAttention::init_task( assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); if (attn->offload) { // cpu-offload enabled diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index 2218ffe392..b19f400eb2 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -380,10 +380,7 @@ OpMeta *LayerNorm::init_task(Task const *task, Runtime *runtime) { LayerNorm *ln = (LayerNorm *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); LayerNormMeta *meta = new LayerNormMeta(handle, ln, gpu_mem_allocator); std::strcpy(meta->op_name, ln->name); diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 0c7a0f78fe..44b56d623e 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -480,10 +480,7 @@ OpMeta *Linear::init_task_with_dim(Task const *task, // in_dim, // out_dim, // batch_size); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); if (linear->offload) { // cpu-offload enabled diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index ed9252c309..8dd670eea3 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -489,10 +489,7 @@ OpMeta *ResidualLayerNorm::init_task(Task const *task, Runtime *runtime) { ResidualLayerNorm *ln = (ResidualLayerNorm *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); ResidualLayerNormMeta *meta = new ResidualLayerNormMeta(handle, ln, gpu_mem_allocator); diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index f4f5bb72d0..b3ee7179d0 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -347,10 +347,7 @@ OpMeta *ResidualRMSNorm::init_task(Task const *task, Runtime *runtime) { ResidualRMSNorm *rn = (ResidualRMSNorm *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); ResidualRMSNormMeta *meta = new 
ResidualRMSNormMeta(handle, rn, gpu_mem_allocator); diff --git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index bf07ee6bb0..79dce65c57 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -294,10 +294,7 @@ OpMeta *RMSNorm::init_task(Task const *task, Runtime *runtime) { RMSNorm *rn = (RMSNorm *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); RMSNormMeta *meta = new RMSNormMeta(handle, rn, gpu_mem_allocator); std::strcpy(meta->op_name, rn->name); diff --git a/src/ops/sampling.cc b/src/ops/sampling.cc index 9fc2316f9a..b38c68843b 100644 --- a/src/ops/sampling.cc +++ b/src/ops/sampling.cc @@ -226,10 +226,7 @@ OpMeta *Sampling::init_task(Task const *task, int length = acc_input.domain.hi()[0] - acc_input.domain.lo()[0] + 1; int batch_size = acc_input.domain.get_volume() / length; - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); SamplingMeta *m = new SamplingMeta( handle, s, batch_size, length * batch_size, acc_input, gpu_mem_allocator); diff --git a/src/ops/sigmoid_silu_multi.cc b/src/ops/sigmoid_silu_multi.cc index 3ddd6b8d6e..3d1c8d9094 100644 --- a/src/ops/sigmoid_silu_multi.cc +++ b/src/ops/sigmoid_silu_multi.cc @@ -237,10 +237,7 @@ OpMeta *SigmoidSiluMulti::init_task(Task const *task, Runtime *runtime) { SigmoidSiluMulti *ssm = (SigmoidSiluMulti *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); SigmoidSiluMultiMeta *meta = new SigmoidSiluMultiMeta(handle, ssm, gpu_mem_allocator); diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc index 9c6ed0e0b6..68d3a4c205 100644 --- a/src/ops/spec_inc_multihead_self_attention.cc +++ b/src/ops/spec_inc_multihead_self_attention.cc @@ -640,10 +640,7 @@ OpMeta *SpecIncMultiHeadSelfAttention::init_task( int num_kv_heads = attn->num_kv_heads; assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); // We don't do offloading for SSMs (small speculative models) SpecIncMultiHeadSelfAttentionMeta *m = diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index 9b8c88420d..df722a3d51 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -697,10 +697,7 @@ OpMeta *TreeIncMultiHeadSelfAttention::init_task( assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); 
+ Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); if (attn->offload) { // cpu-offload enabled diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index cf75235ae7..b023aced6e 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -1914,10 +1914,7 @@ std::pair, std::unordered_map> model->config.workersPerNode, model->config.cpusPerNode, model->all_valid_views); - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MachineModel *machine; if (model->config.machine_model_version == 0) { machine = diff --git a/src/runtime/memory_allocator.cc b/src/runtime/memory_allocator.cc index 06a7c468a4..cb4e867165 100644 --- a/src/runtime/memory_allocator.cc +++ b/src/runtime/memory_allocator.cc @@ -19,7 +19,9 @@ namespace FlexFlow { // declare Legion names using Legion::coord_t; +using Legion::Machine; using Legion::Memory; +using Legion::Processor; using Realm::RegionInstance; MemoryAllocator::MemoryAllocator(Memory _memory) @@ -51,4 +53,14 @@ void MemoryAllocator::register_reserved_work_space(void *base, size_t size) { reserved_allocated_size = 0; } +// Now it's for allocating FB memory, in the future we can +// add more types of memory allocation if needed +Memory get_proc_mem(Machine machine, Processor proc) { + Machine::MemoryQuery proc_mem = Machine::MemoryQuery(machine) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(proc); + assert(proc_mem.count() > 0); + return proc_mem.first(); +} + }; // namespace FlexFlow diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 5cad628743..f1e222e6e3 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -4273,8 +4273,8 @@ void FFConfig::parse_args(char **argv, int argc) { workersPerNode = atoi(argv[++i]); continue; } - if (!strcmp(argv[i], "-ll:fsize")) { - device_mem = atoi(argv[++i]); + if ((!strcmp(argv[i], "-ll:fsize")) || (!strcmp(argv[i], "-ll:msize"))) { + device_mem += atoi(argv[++i]); continue; } if (!strcmp(argv[i], "--nodes")) { diff --git a/src/runtime/model.cpp b/src/runtime/model.cpp index ad2b781567..62f6b89b7f 100644 --- a/src/runtime/model.cpp +++ b/src/runtime/model.cpp @@ -112,10 +112,7 @@ FFHandler // handle.workSpace = memFBImpl->get_direct_ptr(offset, 0); { // allocate memory for workspace - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); Realm::Rect<1, coord_t> bounds( Realm::Point<1, coord_t>(0), Realm::Point<1, coord_t>(handle.workSpaceSize - 1)); @@ -133,10 +130,7 @@ FFHandler } if (handle.offload_reserve_space_size > 0) { // allocate memory for offload reserve space - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); Realm::Rect<1, coord_t> bounds( Realm::Point<1, coord_t>(0), Realm::Point<1, coord_t>(handle.offload_reserve_space_size - 1)); @@ -157,10 +151,7 @@ FFHandler } if (handle.batch_config_metadata_size > 0) { // allocate memory for offload reserve space - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory 
gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); Realm::Rect<1, coord_t> bounds( Realm::Point<1, coord_t>(0), Realm::Point<1, coord_t>(handle.batch_config_metadata_size - 1)); diff --git a/src/runtime/model.cu b/src/runtime/model.cu index 23b7f0efbe..fd39ed0db0 100644 --- a/src/runtime/model.cu +++ b/src/runtime/model.cu @@ -108,10 +108,7 @@ FFHandler // handle.workSpace = memFBImpl->get_direct_ptr(offset, 0); { // allocate memory for workspace - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); Realm::Rect<1, coord_t> bounds( Realm::Point<1, coord_t>(0), Realm::Point<1, coord_t>(handle.workSpaceSize - 1)); @@ -129,10 +126,7 @@ FFHandler } if (handle.offload_reserve_space_size > 0) { // allocate memory for offload reserve space - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); Realm::Rect<1, coord_t> bounds( Realm::Point<1, coord_t>(0), Realm::Point<1, coord_t>(handle.offload_reserve_space_size - 1)); @@ -153,10 +147,7 @@ FFHandler } if (handle.batch_config_metadata_size > 0) { // allocate memory for offload reserve space - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); Realm::Rect<1, coord_t> bounds( Realm::Point<1, coord_t>(0), Realm::Point<1, coord_t>(handle.batch_config_metadata_size - 1)); From 6d710acd79f968f65397874f62b8ebef20590620 Mon Sep 17 00:00:00 2001 From: George Stelle Date: Tue, 20 Aug 2024 14:06:52 -0600 Subject: [PATCH 16/44] pip flexflow_python typo (#1461) Co-authored-by: Zhihao Jia --- python/flexflow/flexflow_python | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/flexflow/flexflow_python b/python/flexflow/flexflow_python index cf247b9ede..8a9b65a404 100644 --- a/python/flexflow/flexflow_python +++ b/python/flexflow/flexflow_python @@ -6,7 +6,7 @@ python_packages=$(python -c "from distutils import sysconfig; print(sysconfig.ge pylib_path="$(python "$python_packages"/flexflow/findpylib.py)" pylib_dir="$(dirname "$pylib_path")" export PATH="${python_packages}/flexflow/bin:${PATH}" -export LD_LIBRARY_PATH="${python_packages}/flexflow/lib:${pylib_dir}:${PATH}" +export LD_LIBRARY_PATH="${python_packages}/flexflow/lib:${pylib_dir}:${LD_LIBRARY_PATH}" legion_python_args=("$@" "-ll:py" "1") legion_python "${legion_python_args[@]}" From 3b59f0577cc6fc3a109921f72ceadef3458cf635 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 29 Aug 2024 00:04:28 +0200 Subject: [PATCH 17/44] update legion version --- deps/legion | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/legion b/deps/legion index 02eb1010ca..0d32b35542 160000 --- a/deps/legion +++ b/deps/legion @@ -1 +1 @@ -Subproject commit 02eb1010ca9eb449d345a0db97eab17efb0e5af0 +Subproject commit 0d32b35542bc0e9aba5950e485b8fc3413ae664b From 28aff70cc98d065390eb58b7fd15dcd24f3fb786 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 31 Aug 2024 06:00:57 -0700 Subject: [PATCH 18/44] Fix nccl-induced segfault (#1481) --- include/flexflow/model.h | 1 + src/runtime/model.cc | 68 ++++++++++++++++++---------------- src/runtime/request_manager.cc | 3 
++ 3 files changed, 41 insertions(+), 31 deletions(-) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index ea64f65a95..6dda67bbfe 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -1079,6 +1079,7 @@ class FFModel { bool use_propagation) const; #ifdef FF_USE_NCCL ncclComm_t *find_nccl_comms(MachineView const &view) const; + void finish_nccl_comms(); #endif #ifdef FF_USE_PROPAGATE void propagate(std::map const ¤t, diff --git a/src/runtime/model.cc b/src/runtime/model.cc index f1e222e6e3..4c67de1aa9 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1589,41 +1589,47 @@ FFModel::FFModel(FFConfig &_config, bool cpu_offload) model_id = model_counter++; } +#ifdef FF_USE_NCCL +void FFModel::finish_nccl_comms() { + Context ctx = config.lg_ctx; + Runtime *runtime = config.lg_hlr; + for (auto const &comm : view_hash_to_nccl_comms) { + // Find the machine view that has the hash + MachineView view; + for (size_t l = 0; l < operators.size(); l++) { + view = operators[l]->outputs[0]->machine_view; + if (view.hash() == comm.first) { + break; + } + } + assert(view.hash() == comm.first && "Cannot find the machine view"); + IndexSpace task_is = get_or_create_task_is(view); + Domain domain = runtime->get_index_space_domain(ctx, task_is); + ArgumentMap argmap; + int idx = 0; + for (Domain::DomainPointIterator it(domain); it; it++, idx++) { + argmap.set_point(*it, + TaskArgument(&comm.second[idx], sizeof(ncclComm_t))); + } + IndexLauncher index_launcher(NCCL_FINISH_COMMS_TASK_ID, + task_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + comm.first); + FutureMap fm = runtime->execute_index_space(ctx, index_launcher); + fm.wait_all_results(); + } +} +#endif + FFModel::~FFModel() { // Destroy nccl communication groups #ifdef FF_USE_NCCL if (config.computationMode == COMP_MODE_TRAINING) { - Context ctx = config.lg_ctx; - Runtime *runtime = config.lg_hlr; - for (auto const &comm : view_hash_to_nccl_comms) { - // Find the machine view that has the hash - MachineView view; - for (size_t l = 0; l < operators.size(); l++) { - view = operators[l]->outputs[0]->machine_view; - if (view.hash() == comm.first) { - break; - } - } - assert(view.hash() == comm.first && "Cannot find the machine view"); - IndexSpace task_is = get_or_create_task_is(view); - Domain domain = runtime->get_index_space_domain(ctx, task_is); - ArgumentMap argmap; - int idx = 0; - for (Domain::DomainPointIterator it(domain); it; it++, idx++) { - argmap.set_point(*it, - TaskArgument(&comm.second[idx], sizeof(ncclComm_t))); - } - IndexLauncher index_launcher(NCCL_FINISH_COMMS_TASK_ID, - task_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - comm.first); - FutureMap fm = runtime->execute_index_space(ctx, index_launcher); - fm.wait_all_results(); - } + finish_nccl_comms(); } #endif } diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index d21285eef2..bada87ab19 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -2365,6 +2365,9 @@ void RequestManager::background_serving_task( // Registered SSMs: perform speculative inference rm->serve_spec_infer(llm); } +#ifdef FF_USE_NCCL + llm->finish_nccl_comms(); +#endif } /*static*/ From 49523d62691039a9a8c29891acc5d48641048cc4 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 2 Sep 2024 03:05:25 -0700 Subject: [PATCH 19/44] Fix python install issue caused by new Legion version (#1482) * fix * . 
* . * fix * cleanup * fix * cleanup --- CMakeLists.txt | 20 ++++++++++++++------ cmake/pip_install/CMakeLists.txt | 20 ++++++++++++++++++-- pyproject.toml | 3 ++- requirements.txt | 1 + 4 files changed, 35 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d7a6391e06..c82a53644e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,13 +37,24 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) STRING "Choose the type of build." FORCE) endif() +# option for using Python +option(FF_USE_PYTHON "Enable Python" ON) +if (FF_USE_PYTHON) + find_package(Python3 COMPONENTS Interpreter Development) +endif() + if(INSTALL_DIR) message(STATUS "INSTALL_DIR: ${INSTALL_DIR}") set(CMAKE_INSTALL_PREFIX ${INSTALL_DIR} CACHE PATH "Installation directory" FORCE) else() - # Install DIR not set. Use default, unless a conda environment is active - if (DEFINED ENV{CONDA_PREFIX} AND NOT FF_BUILD_FROM_PYPI) - set(CONDA_PREFIX $ENV{CONDA_PREFIX}) + # Install DIR not set. Use default, unless a conda environment is in use + if ((DEFINED ENV{CONDA_PREFIX} OR (Python3_EXECUTABLE AND Python3_EXECUTABLE MATCHES "conda")) AND NOT FF_BUILD_FROM_PYPI) + if (DEFINED ENV{CONDA_PREFIX}) + set(CONDA_PREFIX $ENV{CONDA_PREFIX}) + else() + get_filename_component(CONDA_PREFIX "${Python3_EXECUTABLE}" DIRECTORY) + get_filename_component(CONDA_PREFIX "${CONDA_PREFIX}" DIRECTORY) + endif() # Set CMAKE_INSTALL_PREFIX to the Conda environment's installation path set(CMAKE_INSTALL_PREFIX ${CONDA_PREFIX} CACHE PATH "Installation directory" FORCE) message(STATUS "Active conda environment detected. Setting CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}") @@ -64,9 +75,6 @@ option(FF_BUILD_FROM_PYPI "Build from pypi" OFF) # build shared or static flexflow lib option(BUILD_SHARED_LIBS "Build shared libraries instead of static ones" ON) -# option for using Python -option(FF_USE_PYTHON "Enable Python" ON) - # option for building legion only option(BUILD_LEGION_ONLY "Build Legion only" OFF) diff --git a/cmake/pip_install/CMakeLists.txt b/cmake/pip_install/CMakeLists.txt index 105133a310..217d7e14f0 100644 --- a/cmake/pip_install/CMakeLists.txt +++ b/cmake/pip_install/CMakeLists.txt @@ -2,9 +2,25 @@ if (FF_USE_PYTHON) execute_process(COMMAND ${Python_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) if(FF_BUILD_FROM_PYPI) - install(CODE "execute_process(COMMAND ${CMAKE_COMMAND} -E echo \"Editing path to Legion library using path: ${PY_DEST}/flexflow/lib \")") + cmake_path(SET CMAKE_SOURCE_DIR_ NORMALIZE ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion) + cmake_path(SET CMAKE_BUILD_DIR_ NORMALIZE ${Legion_BINARY_DIR}/runtime) + cmake_path(SET CMAKE_INSTALL_PREFIX_ NORMALIZE ${PY_DEST}/../../..) 
+ cmake_path(SET WORKING_DIRECTORY_ NORMALIZE ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python/) # CMAKE_CURRENT_SOURCE_DIR=/usr/FlexFlow/cmake/pip_install # Legion_BINARY_DIR=/usr/FlexFlow/build//deps/legion - install(CODE "execute_process(COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python/setup.py install --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${PY_DEST}/flexflow ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python)") + # CMAKE_SOURCE_DIR_=/usr/FlexFlow/deps/legion + # CMAKE_BUILD_DIR_: /usr/FlexFlow/build//deps/legion/runtime + # CMAKE_INSTALL_PREFIX_: /opt/conda/ or /usr/local + # WORKING_DIRECTORY_: /usr/FlexFlow/deps/legion/bindings/python/ + # PY_DEST: /python3.11/site-packages + message(STATUS "CMAKE_CURRENT_SOURCE_DIR: ${CMAKE_CURRENT_SOURCE_DIR}") + message(STATUS "Legion_BINARY_DIR: ${Legion_BINARY_DIR}") + message(STATUS "CMAKE_SOURCE_DIR_: ${CMAKE_SOURCE_DIR_}") + message(STATUS "CMAKE_BUILD_DIR_: ${CMAKE_BUILD_DIR_}") + message(STATUS "CMAKE_INSTALL_PREFIX_: ${CMAKE_INSTALL_PREFIX_}") + message(STATUS "WORKING_DIRECTORY_: ${WORKING_DIRECTORY_}") + message(STATUS "PY_DEST: ${PY_DEST}") + install(CODE "execute_process(COMMAND ${CMAKE_COMMAND} -E echo \"Editing path to Legion library using path: ${CMAKE_INSTALL_PREFIX_} \")") + install(CODE "execute_process(COMMAND ${CMAKE_COMMAND} -E env CMAKE_SOURCE_DIR=${CMAKE_SOURCE_DIR_} CMAKE_BUILD_DIR=${CMAKE_BUILD_DIR_} CMAKE_INSTALL_PREFIX=${PY_DEST}/flexflow ${Python3_EXECUTABLE} setup.py install --prefix ${CMAKE_INSTALL_PREFIX_} ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${WORKING_DIRECTORY_} COMMAND_ECHO STDOUT COMMAND_ERROR_IS_FATAL ANY)") endif() endif() diff --git a/pyproject.toml b/pyproject.toml index 4b8214f3fe..373c53beb8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,6 +5,7 @@ requires = [ "setuptools_scm[toml]>=6.0", "cmake-build-extension", "ninja", - "requests" + "requests", + "pip", ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index 1037661337..ad65622367 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,3 +15,4 @@ onnx transformers>=4.31.0 sentencepiece einops +pip From a0f1ed783e3ef48ac374563cf3f4fc2388f34b4c Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 4 Sep 2024 14:15:06 -0400 Subject: [PATCH 20/44] PEFT support (inference/finetuning) (#1153) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * . * . * Update the default cublas behavior when CUDA_VERSION is not specified * fix bugs in IncMHA peft_bwd kernel * uncomment softmaxbackward * add layernorm to align test * add peft test scripts * fix import * fix * add code to convert peft models * add script to download peft for c++, fix bug * fix * add script to fine-tune models * implement loading lora configs/weights from file * remove peft_bwd assertion failure in embedding * fix download script * add peft dependencies in dockerfile * fix softmax backward * fix bc print indentation * Temporarily Revert "Update the default cublas behavior when CUDA_VERSION is not specified" This reverts commit 4ee710a76ee4f47b4574c57519e2b0fb96efaa6a. 
* Fix cublas default (#1220) * Fix Legion prebuild workflow (2) (#1208) * fix * fix * fix * fix * Fix Legion prebuild workflow (3) (#1210) * fix hip error * use CUBLAS_COMPUTE_FAST_16F for full-precision gemm --------- Co-authored-by: Zhihao Jia * fix bugs, work on align opt-lora * update scripts * add code to output peft tensors in hf * update, fixes * linting * fix printing of tensors for numpy * update save_inference_tensors_to_file * linting * update * fix issue with save_inference_tensors_to_file * fix layer names for save_inference_tensors_to_file * fix peft * fix bwd bugs * linting * fixes * fix * fix * fix * add bc fields for peft training * linting * fix * remove ptr check * fix * implement save_operators for bwd * fix bug * implement save tensors for bwd * . * bug fix * fix * align linear * fix * bwd kernel updates * undo use of CUBLAS_COMPUTE_32F_FAST_16F for now * only send dataset entry once * update peft test scripts * loss * . * update generate/request api to take both inference and fine-tuning prompts * linting * alignment fixes in lora & linear layer * alignment fix * diagonal * fix * alignment fix ssm * sigmoid-silu-multi now fully aligned * rms norm kernel updates * fix * in-place residual rms * bug fix and linting * align backward of o_proj, attn_heads, qk_prods_softmax, and v_proj with huggingface * cleanup * finished all alignment fixes in attention backward kernel * fix * Update inc_multihead_self_attention.cu * Update inc_multihead_self_attention.cu * use grad to store peft in/output (#1241) * use grad to store peft in/output * format * . * format * enable peft request * several hacks for performance measurement; some of the changes should be reverted * Update sigmoid_silu_multi.cu * RoPE backward * PEFT bug fixes and alignment (#1269) * Revert "several hacks for performance measurement; some of the changes should be reverted" This reverts commit b9c392631b596db788ead74fe76d08d80a487b7c. * backup * backup * updates * update * backup * backup * backup * fix * cleanup * linting * Fuse bias + relu in OPT (#1271) * fuse bias and relu in opt * fix * fix * fix * fix * Peft alignment & debugging tools (#1288) * Revert "several hacks for performance measurement; some of the changes should be reverted" This reverts commit b9c392631b596db788ead74fe76d08d80a487b7c. * backup * backup * updates * update * backup * backup * backup * fix * cleanup * fix * fix * fix * update * simplify tensor names * fix * fixes and updates * fixes * fix * cleanup * . * restore softmax * cleanup * update alignment scripts * newline * fix legion aliasing error * fix warnings * fix * fix pipeline parallelism * fix tp issue in combine op * fix lora weight loading with tensor parallelism * fixes, implement Combine::peft_bwd_task * fix * replicate peft bwd * fixes * fix * fix combine and fwd-bwd pass dependencies * fix replicate bwd * fix * let user control amount of peft memory * only run peft_bwd if peft is enabled * fix rms norm inference region reqs * fix in-place fusion (part 1) * fix inplace fusion (part 2) * fix * disable automatic inplace rms norm for now * fix inf fusion inplace * fix rest input grads for peft without inplace residuals * fix * fix * fix residual rms * fix * fix * enable inf debugging in fusion bwd * hack to silence warning in fused bwd * fix * fix * fix build * fix * fix * add draft peft test * Peft python interface (#1306) * update script * less model renaming * fix * fix * fix * backup * . * update * . 
* fixes * fix * fix build * fix * fix * fix issues for downloading peft model * solved issues for download peft model * added printouts for debugging * fix * fix seg fault * add test, separate peft script in cpp * fix * fixes * fix * update peft python interface * update * update * update * updates * fix * fixes * fix * fixes --------- Co-authored-by: april-yyt * fix * update * fix * fix to support prompts larger than max tokens per batch * fixes to support benchmarking of finetuning throughput * many upgrades and updates related to finetuning * add ttft statistics * add warmup phase * add benchmarking code * Add scripts for evaluation with Microsoft Azure trace (#1363) * Add scripts for evaluation * Add absolute request rate value * Fix script for target arrival rate * Fix cpp req rate benchmark * update to use new dataset * Fix infinite loop * update * add data --------- Co-authored-by: Remi Delacourt Co-authored-by: Gabriele Oliaro * fix * fix * add peft tests to ci * shellcheck * fix * fix python requirements * fix * fix * update ci test * update alignment doc * fix cross entropy loss bug * update alignment test * update test * add llama peft alignment test to ci * Fix values for unused params in incr_decoding * Add PEFTModelID NO_ID singleton instead of None * Fix PEFTModelID::NO_ID reference * reduce logging * fix * fix * Add peft demo * Add readme for demo * fix alignment issue * Peft optimizer (#1290) * add optimizer config, only allocate weights for training * sgd 1 * sgd 2 * update * fix * linting * . * . * fix * fix allreduce bug * update * update * add optimizer hook in hf * update * update script * . * fix * fwd * bwd * start grads * fix gradient misalignment! * update * Add support for llama3 * various fixes --------- Co-authored-by: Remi Delacourt * Optimizers python interface (#1441) * python interface for optimizer * update lora linear config to support python interface * update python interface * finished lora python interface * fix * fix * update * update * more fixes * fix * initialize lora weights where needed * Add notebook * Update demo to use dataset * Fix' * Save weights after end of finetuning (#1446) * support accumulation of gradients without update * add code to save peft weights * fix * save configs * cleanup * Fully use notebook for demo * Parameterize generation and finetuning configs * Comment out inference for now * fix bug in lora inference only mode * fix * Add finetuning or inference only flags * fix * fix * fix * PEFT model upload (#1450) * upload test * fix * Make demo_class.py executable * fix * add base_model_name_or_path * fix * fix * support llama-3 tokenizer * print output tokens when not benchmarking * Use Llama3 in demo_class * Use Llama3 in demo * fix data loading for llama-3 * Add download models to demo * return/print loss at each finetuning step * fix * Adjust demo parameters * Fix for finetuning * pass finetuning losses to python interface * Update demo * Fix upload * Refactor demo * rename demo_class to demo * fix * remove epoch from loss print * Finish demo * fix test * rocm fixes * more rocm fixes * fix rocm build * docker fix * fix inference test * fix workflow * fix makefile * fix peft test * fix all-reduce issue with lora for TP scenario * fix bwd lm head * fixes * more fixes * update * fix alignment up to input ln * finished aligning all backward (tp>1) * align all peft * fix * fix broken link * formatting * fix * update * Revert "update" This reverts commit 90b2c876ca3ea9c29e59aa7ae9904f254298660d. 
* update * fix hip build * fix gpu ci * fix gpu ci * update default gpu ci version to 12.0 * update ci to 12.0 * fix * fix * update * fix * fix * update * fix * add cleanup * downgrade to cuda=11.8 --------- Co-authored-by: Gabriele Oliaro Co-authored-by: xinhaoc Co-authored-by: Xinhao Cheng <99570243+xinhaoc@users.noreply.github.com> Co-authored-by: april-yyt Co-authored-by: Remi <54138269+Flechman@users.noreply.github.com> Co-authored-by: Remi Delacourt Co-authored-by: Rémi Delacourt --- .github/workflows/build.yml | 12 +- .github/workflows/gpu-ci.yml | 10 + .github/workflows/helpers/install_cudnn.sh | 23 +- .github/workflows/helpers/install_nccl.sh | 8 +- .github/workflows/multinode-test.yml | 6 +- .github/workflows/pip-install.yml | 4 +- .github/workflows/prebuild-legion.yml | 4 +- .gitignore | 5 + CMakeLists.txt | 1 + conda/flexflow.yml | 7 + config/config.inc | 2 +- docker/build.sh | 9 +- docker/flexflow-environment/Dockerfile | 2 + docker/run.sh | 2 +- include/flexflow/batch_config.h | 42 +- include/flexflow/config.h | 41 +- include/flexflow/ffconst.h | 77 +- include/flexflow/fftype.h | 25 + include/flexflow/flexflow_c.h | 136 +- include/flexflow/inference.h | 1 + include/flexflow/layer.h | 2 +- include/flexflow/model.h | 61 +- include/flexflow/op_meta.h | 6 +- include/flexflow/operator.h | 95 +- include/flexflow/operator_params.h | 4 + .../ops/add_bias_residual_layer_norm.h | 63 +- .../ops/add_bias_residual_layer_norm_params.h | 1 + include/flexflow/ops/aggregate.h | 4 +- include/flexflow/ops/aggregate_spec.h | 4 +- include/flexflow/ops/argmax.h | 11 +- include/flexflow/ops/cache.h | 4 +- include/flexflow/ops/element_unary.h | 4 +- include/flexflow/ops/embedding.h | 5 + include/flexflow/ops/experts.h | 17 +- include/flexflow/ops/fused.h | 9 + include/flexflow/ops/groupby.h | 4 +- .../ops/inc_multihead_self_attention.h | 23 +- .../ops/kernels/batch_matmul_kernels.h | 4 +- include/flexflow/ops/kernels/cast_kernels.h | 4 +- include/flexflow/ops/kernels/concat_kernels.h | 4 +- .../flexflow/ops/kernels/conv_2d_kernels.h | 4 +- include/flexflow/ops/kernels/flat_kernels.h | 4 +- .../inc_multihead_self_attention_utils.cuh | 27 +- include/flexflow/ops/kernels/linear_kernels.h | 31 + .../ops/kernels/lora_linear_kernels.h | 77 + .../flexflow/ops/kernels/pool_2d_kernels.h | 4 +- .../flexflow/ops/kernels/reshape_kernels.h | 6 +- .../ops/kernels/residual_rms_norm_kernels.h | 30 +- .../flexflow/ops/kernels/rms_norm_kernels.h | 23 +- .../flexflow/ops/kernels/softmax_kernels.h | 46 +- .../flexflow/ops/kernels/transpose_kernels.h | 4 +- include/flexflow/ops/layer_norm.h | 50 +- include/flexflow/ops/linear.h | 9 + include/flexflow/ops/lora_linear.h | 99 + include/flexflow/ops/lora_linear_params.h | 150 + include/flexflow/ops/residual_layer_norm.h | 41 +- .../flexflow/ops/residual_layer_norm_params.h | 1 + include/flexflow/ops/residual_rms_norm.h | 16 + .../flexflow/ops/residual_rms_norm_params.h | 1 + include/flexflow/ops/rms_norm.h | 13 + include/flexflow/ops/sigmoid_silu_multi.h | 33 +- include/flexflow/ops/softmax.h | 9 + include/flexflow/ops/topk.h | 4 +- include/flexflow/ops/transpose.h | 2 + .../ops/tree_inc_multihead_self_attention.h | 2 +- include/flexflow/parallel_ops/allreduce.h | 19 +- include/flexflow/parallel_ops/combine.h | 13 + .../parallel_ops/kernels/allreduce_kernels.h | 14 +- .../parallel_ops/kernels/combine_kernels.h | 4 +- .../kernels/parallel_identity_kernels.h | 41 + .../parallel_ops/kernels/partition_kernels.h | 4 +- .../flexflow/parallel_ops/parallel_identity.h | 83 + 
.../parallel_ops/parallel_identity_params.h | 22 + include/flexflow/parallel_ops/parallel_op.h | 2 +- include/flexflow/parallel_ops/replicate.h | 9 + include/flexflow/request_manager.h | 44 +- include/flexflow/simulator.h | 56 +- include/flexflow/utils/cuda_helper.h | 13 +- include/flexflow/utils/hip_helper.h | 33 +- include/flexflow/utils/memory_allocator.h | 5 + .../flexflow/utils/peft_weight_allocator.h | 92 + inference/MODEL_WEIGHTS.md | 28 - inference/README.md | 42 + inference/incr_decoding/incr_decoding.cc | 11 +- inference/models/falcon.cc | 16 +- inference/models/llama.cc | 112 +- inference/models/llama.h | 11 +- inference/models/mpt.cc | 23 +- inference/models/opt.cc | 27 +- inference/models/starcoder.cc | 19 +- inference/peft/CMakeLists.txt | 139 + inference/peft/Makefile | 37 + inference/peft/peft.cc | 387 ++ inference/peft/peft_bwd_benchmark.cc | 391 ++ inference/peft/peft_fwd_benchmark.cc | 363 ++ inference/peft/req_rate_benchmark.cc | 518 ++ inference/python/ff_peft.py | 189 + inference/python/incr_decoding.py | 5 +- inference/python/peft_demo/INSTRUCTIONS.md | 25 + inference/python/peft_demo/demo.ipynb | 1907 +++++++ inference/python/peft_demo/demo.py | 240 + inference/python/spec_infer.py | 7 +- inference/spec_infer/spec_infer.cc | 11 +- inference/utils/download_peft_model.py | 68 + inference/utils/upload_peft_model.py | 142 + python/flexflow/core/__init__.py | 5 +- python/flexflow/core/flexflow_cffi.py | 5024 +++++++++-------- python/flexflow/serve/__init__.py | 43 +- python/flexflow/serve/models/base.py | 3 + python/flexflow/serve/models/falcon.py | 41 +- python/flexflow/serve/models/llama.py | 48 +- python/flexflow/serve/models/mpt.py | 46 +- python/flexflow/serve/models/opt.py | 51 +- python/flexflow/serve/models/starcoder.py | 47 +- python/flexflow/serve/serve.py | 446 +- python/flexflow/type.py | 11 + rdelacou/generate_trace.py | 121 + requirements.txt | 8 + src/c/flexflow_c.cc | 382 +- src/loss_functions/loss_functions.cpp | 8 +- src/loss_functions/loss_functions.cu | 8 +- src/ops/add_bias_residual_layer_norm.cc | 607 +- src/ops/add_bias_residual_layer_norm.cpp | 748 ++- src/ops/add_bias_residual_layer_norm.cu | 609 +- src/ops/aggregate.cc | 6 +- src/ops/aggregate.cpp | 9 +- src/ops/aggregate.cu | 7 +- src/ops/aggregate_spec.cc | 6 +- src/ops/aggregate_spec.cpp | 7 +- src/ops/aggregate_spec.cu | 7 +- src/ops/arg_topk.cc | 11 +- src/ops/argmax.cc | 42 +- src/ops/argmax.cpp | 81 +- src/ops/argmax.cu | 86 +- src/ops/attention.cc | 2 +- src/ops/attention.cpp | 2 +- src/ops/attention.cu | 2 +- src/ops/batch_matmul.cc | 4 +- src/ops/batch_norm.cpp | 2 +- src/ops/batch_norm.cu | 2 +- src/ops/beam_topk.cc | 10 +- src/ops/beam_topk.cpp | 2 +- src/ops/beam_topk.cu | 2 +- src/ops/cache.cc | 2 +- src/ops/cache.cpp | 2 +- src/ops/cache.cu | 2 +- src/ops/cast.cc | 2 +- src/ops/concat.cc | 4 +- src/ops/conv_2d.cc | 17 +- src/ops/element_binary.cc | 10 +- src/ops/element_unary.cc | 4 +- src/ops/element_unary.cpp | 3 +- src/ops/element_unary.cu | 3 +- src/ops/embedding.cc | 18 +- src/ops/experts.cc | 17 +- src/ops/experts.cpp | 30 +- src/ops/experts.cu | 65 +- src/ops/flat.cc | 3 +- src/ops/fused.cc | 234 +- src/ops/fused.cpp | 1257 +++-- src/ops/fused.cu | 1410 +++-- src/ops/group_by.cc | 6 +- src/ops/group_by.cpp | 6 +- src/ops/group_by.cu | 6 +- src/ops/inc_multihead_self_attention.cc | 139 +- src/ops/inc_multihead_self_attention.cpp | 1782 ++++-- src/ops/inc_multihead_self_attention.cu | 756 ++- src/ops/kernels/batch_matmul.cpp | 4 +- src/ops/kernels/batch_matmul.cu | 4 +- 
src/ops/kernels/cast_kernels.cpp | 3 +- src/ops/kernels/cast_kernels.cu | 3 +- src/ops/kernels/concat_kernels.cpp | 4 + src/ops/kernels/concat_kernels.cu | 4 + src/ops/kernels/conv_2d_kernels.cpp | 10 +- src/ops/kernels/conv_2d_kernels.cu | 10 +- src/ops/kernels/dropout_kernels.cpp | 2 +- src/ops/kernels/dropout_kernels.cu | 2 +- src/ops/kernels/flat_kernels.cpp | 4 + src/ops/kernels/flat_kernels.cu | 4 + src/ops/kernels/linear_kernels.cpp | 423 +- src/ops/kernels/linear_kernels.cu | 268 +- src/ops/kernels/lora_linear_kernels.cpp | 576 ++ src/ops/kernels/lora_linear_kernels.cu | 579 ++ src/ops/kernels/pool_2d_kernels.cpp | 4 +- src/ops/kernels/pool_2d_kernels.cu | 4 +- src/ops/kernels/reshape_kernels.cpp | 4 +- src/ops/kernels/reshape_kernels.cu | 4 +- src/ops/kernels/residual_rms_norm_kernels.cpp | 438 +- src/ops/kernels/residual_rms_norm_kernels.cu | 454 +- src/ops/kernels/rms_norm_kernels.cpp | 396 +- src/ops/kernels/rms_norm_kernels.cu | 444 +- src/ops/kernels/softmax.cpp | 284 +- src/ops/kernels/softmax.cu | 275 +- src/ops/kernels/transpose_kernels.cpp | 4 + src/ops/kernels/transpose_kernels.cu | 4 + src/ops/layer_norm.cc | 181 +- src/ops/layer_norm.cpp | 479 +- src/ops/layer_norm.cu | 352 +- src/ops/linear.cc | 154 +- src/ops/lora_linear.cc | 1316 +++++ src/ops/lora_linear_params.cc | 221 + src/ops/mean.cc | 3 +- src/ops/noop.cc | 7 +- src/ops/pool_2d.cc | 4 +- src/ops/reduce.cc | 2 +- src/ops/reduce.cpp | 2 +- src/ops/reduce.cu | 2 +- src/ops/reshape.cc | 4 +- src/ops/residual_layer_norm.cc | 521 +- src/ops/residual_layer_norm.cpp | 695 ++- src/ops/residual_layer_norm.cu | 690 ++- src/ops/residual_rms_norm.cc | 512 +- src/ops/rms_norm.cc | 168 +- src/ops/sampling.cc | 6 +- src/ops/sigmoid_silu_multi.cc | 187 +- src/ops/sigmoid_silu_multi.cpp | 297 +- src/ops/sigmoid_silu_multi.cu | 264 +- src/ops/softmax.cc | 261 +- src/ops/spec_inc_multihead_self_attention.cc | 2 +- src/ops/spec_inc_multihead_self_attention.cpp | 17 +- src/ops/spec_inc_multihead_self_attention.cu | 66 +- src/ops/split.cc | 2 +- src/ops/topk.cc | 6 +- src/ops/topk.cpp | 3 +- src/ops/topk.cu | 3 +- src/ops/transpose.cc | 6 +- src/ops/tree_inc_multihead_self_attention.cc | 2 +- src/ops/tree_inc_multihead_self_attention.cpp | 654 ++- src/ops/tree_inc_multihead_self_attention.cu | 58 +- src/parallel_ops/allreduce.cc | 287 +- src/parallel_ops/combine.cc | 151 +- src/parallel_ops/fused_parallel_op.cc | 2 +- .../kernels/allreduce_kernels.cpp | 52 +- src/parallel_ops/kernels/allreduce_kernels.cu | 48 +- src/parallel_ops/kernels/combine_kernels.cpp | 4 +- src/parallel_ops/kernels/combine_kernels.cu | 4 +- .../kernels/parallel_identity_kernels.cpp | 97 + .../kernels/parallel_identity_kernels.cu | 96 + .../kernels/partition_kernels.cpp | 4 +- src/parallel_ops/kernels/partition_kernels.cu | 4 +- .../kernels/reduction_kernels.cpp | 2 +- src/parallel_ops/kernels/reduction_kernels.cu | 2 +- .../kernels/replicate_kernels.cpp | 2 +- src/parallel_ops/kernels/replicate_kernels.cu | 2 +- src/parallel_ops/parallel_identity.cc | 474 ++ src/parallel_ops/partition.cc | 10 +- src/parallel_ops/reduction.cc | 17 +- src/parallel_ops/replicate.cc | 91 +- src/runtime/batch_config.cc | 65 +- src/runtime/beam_search_batch_config.cc | 4 + src/runtime/cuda_helper.cu | 200 +- src/runtime/ffconst_utils.cc | 5 + src/runtime/fftype.cc | 25 + src/runtime/file_loader.cc | 92 +- src/runtime/graph.cc | 88 +- src/runtime/hip_helper.cpp | 274 +- src/runtime/inference_manager.cc | 204 +- src/runtime/model.cc | 840 ++- src/runtime/model.cpp | 4 +- 
src/runtime/model.cu | 45 +- src/runtime/operator.cc | 36 +- src/runtime/operator_params.cc | 3 + src/runtime/request_manager.cc | 768 ++- src/runtime/request_manager.cpp | 45 +- src/runtime/request_manager.cu | 123 +- src/runtime/simulator.cpp | 22 +- src/runtime/simulator.cu | 26 +- src/runtime/substitution.cc | 36 +- src/runtime/tree_verify_batch_config.cc | 4 + tests/.gitignore | 1 - tests/align/test_all_operators.sh | 2 +- tests/cpp_gpu_tests.sh | 4 +- tests/inference/cpp_inference_tests.sh | 64 +- tests/inference/huggingface_inference.py | 14 +- tests/inference/python_inference_tests.sh | 35 +- .../python_test_configs/generate_configs.py | 5 +- tests/peft/alignment/align_test_utils.py | 510 ++ .../alignment/llama_alignment_tests.ipynb | 2651 +++++++++ .../peft/alignment/opt_alignment_tests.ipynb | 450 ++ tests/peft/hf_finetune.py | 129 + tests/peft/hf_serve.py | 140 + tests/peft/hf_train.py | 161 + tests/peft/hf_utils.py | 352 ++ tests/peft/peft_alignment_test.py | 730 +++ tests/peft_test.sh | 66 + 285 files changed, 35212 insertions(+), 6650 deletions(-) create mode 100644 include/flexflow/ops/kernels/lora_linear_kernels.h create mode 100644 include/flexflow/ops/lora_linear.h create mode 100644 include/flexflow/ops/lora_linear_params.h create mode 100644 include/flexflow/parallel_ops/kernels/parallel_identity_kernels.h create mode 100644 include/flexflow/parallel_ops/parallel_identity.h create mode 100644 include/flexflow/parallel_ops/parallel_identity_params.h create mode 100644 include/flexflow/utils/peft_weight_allocator.h delete mode 100644 inference/MODEL_WEIGHTS.md create mode 100644 inference/README.md create mode 100644 inference/peft/CMakeLists.txt create mode 100644 inference/peft/Makefile create mode 100644 inference/peft/peft.cc create mode 100644 inference/peft/peft_bwd_benchmark.cc create mode 100644 inference/peft/peft_fwd_benchmark.cc create mode 100644 inference/peft/req_rate_benchmark.cc create mode 100644 inference/python/ff_peft.py create mode 100644 inference/python/peft_demo/INSTRUCTIONS.md create mode 100644 inference/python/peft_demo/demo.ipynb create mode 100644 inference/python/peft_demo/demo.py create mode 100644 inference/utils/download_peft_model.py create mode 100644 inference/utils/upload_peft_model.py create mode 100644 rdelacou/generate_trace.py create mode 100644 src/ops/kernels/lora_linear_kernels.cpp create mode 100644 src/ops/kernels/lora_linear_kernels.cu create mode 100644 src/ops/lora_linear.cc create mode 100644 src/ops/lora_linear_params.cc create mode 100644 src/parallel_ops/kernels/parallel_identity_kernels.cpp create mode 100644 src/parallel_ops/kernels/parallel_identity_kernels.cu create mode 100644 src/parallel_ops/parallel_identity.cc delete mode 100644 tests/.gitignore create mode 100644 tests/peft/alignment/align_test_utils.py create mode 100644 tests/peft/alignment/llama_alignment_tests.ipynb create mode 100644 tests/peft/alignment/opt_alignment_tests.ipynb create mode 100644 tests/peft/hf_finetune.py create mode 100644 tests/peft/hf_serve.py create mode 100644 tests/peft/hf_train.py create mode 100644 tests/peft/hf_utils.py create mode 100644 tests/peft/peft_alignment_test.py create mode 100755 tests/peft_test.sh diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d05856f1a9..ef5961bc87 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -52,13 +52,14 @@ jobs: run: .github/workflows/helpers/free_space_on_runner.sh - name: Install CUDA - uses: Jimver/cuda-toolkit@v0.2.11 + uses: 
Jimver/cuda-toolkit@v0.2.16 if: ${{ matrix.gpu_backend == 'cuda' }} id: cuda-toolkit with: - cuda: "11.8.0" + cuda: "12.1.1" # Disable caching of the CUDA binaries, since it does not give us any significant performance improvement use-github-cache: "false" + log-file-suffix: 'cmake_${{matrix.gpu_backend}}.txt' - name: Install system dependencies run: .github/workflows/helpers/install_dependencies.sh @@ -156,11 +157,12 @@ jobs: run: .github/workflows/helpers/free_space_on_runner.sh - name: Install CUDA - uses: Jimver/cuda-toolkit@v0.2.11 + uses: Jimver/cuda-toolkit@v0.2.16 id: cuda-toolkit with: - cuda: "11.8.0" + cuda: "12.1.1" use-github-cache: "false" + log-file-suffix: 'makefile_${{matrix.gpu_backend}}.txt' - name: Install system dependencies run: .github/workflows/helpers/install_dependencies.sh @@ -169,7 +171,7 @@ jobs: uses: conda-incubator/setup-miniconda@v2 with: activate-environment: flexflow - environment-file: conda/environment.yml + environment-file: conda/flexflow.yml auto-activate-base: false - name: Build FlexFlow diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index c7d0cd72cb..00ca2df603 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -181,6 +181,16 @@ jobs: ../config/config.linux make -j + - name: Run PEFT tests + run: | + export PATH=$CONDA_PREFIX/bin:$PATH + export CUDNN_DIR=/usr/local/cuda + export CUDA_DIR=/usr/local/cuda + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib + + source ./build/set_python_envs.sh + ./tests/peft_test.sh + - name: Run inference tests env: CPP_INFERENCE_TESTS: ${{ vars.CPP_INFERENCE_TESTS }} diff --git a/.github/workflows/helpers/install_cudnn.sh b/.github/workflows/helpers/install_cudnn.sh index 7c11a4a420..73b8e88418 100755 --- a/.github/workflows/helpers/install_cudnn.sh +++ b/.github/workflows/helpers/install_cudnn.sh @@ -5,8 +5,11 @@ set -x # Cd into directory holding this script cd "${BASH_SOURCE[0]%/*}" +ubuntu_version=$(lsb_release -rs) +ubuntu_version=${ubuntu_version//./} + # Install CUDNN -cuda_version=${1:-11.8.0} +cuda_version=${1:-12.1.1} cuda_version=$(echo "${cuda_version}" | cut -f1,2 -d'.') echo "Installing CUDNN for CUDA version: ${cuda_version} ..." 
CUDNN_LINK=http://developer.download.nvidia.com/compute/redist/cudnn/v8.0.5/cudnn-11.1-linux-x64-v8.0.5.39.tgz @@ -44,8 +47,11 @@ elif [[ "$cuda_version" == "11.7" ]]; then elif [[ "$cuda_version" == "11.8" ]]; then CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz CUDNN_TARBALL_NAME=cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz -elif [[ "$cuda_version" == "12.0" ]]; then - echo "CUDNN support for CUDA version 12.0 not yet added" +elif [[ "$cuda_version" == "12.0" || "$cuda_version" == "12.1" || "$cuda_version" == "12.2" || "$cuda_version" == "12.3" || "$cuda_version" == "12.4" || "$cuda_version" == "12.5" ]]; then + CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.8.0/local_installers/12.0/cudnn-local-repo-ubuntu2004-8.8.0.121_1.0-1_amd64.deb + CUDNN_TARBALL_NAME=cudnn-local-repo-ubuntu2004-8.8.0.121_1.0-1_amd64.deb +else + echo "CUDNN support for CUDA version above 12.5 not yet added" exit 1 fi wget -c -q $CUDNN_LINK @@ -55,6 +61,17 @@ if [[ "$cuda_version" == "11.6" || "$cuda_version" == "11.7" || "$cuda_version" sudo cp -r "$CUDNN_EXTRACTED_TARBALL_NAME"/include/* /usr/local/include sudo cp -r "$CUDNN_EXTRACTED_TARBALL_NAME"/lib/* /usr/local/lib rm -rf "$CUDNN_EXTRACTED_TARBALL_NAME" +elif [[ "$CUDNN_TARBALL_NAME" == *.deb ]]; then + wget -c -q "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.1-1_all.deb" + sudo dpkg -i cuda-keyring_1.1-1_all.deb + sudo apt update -y + rm -f cuda-keyring_1.1-1_all.deb + sudo dpkg -i $CUDNN_TARBALL_NAME + sudo cp /var/cudnn-local-repo-ubuntu2004-8.8.0.121/cudnn-local-A9E17745-keyring.gpg /usr/share/keyrings/ + sudo apt update -y + sudo apt install -y libcudnn8 + sudo apt install -y libcudnn8-dev + sudo apt install -y libcudnn8-samples else sudo tar -xzf $CUDNN_TARBALL_NAME -C /usr/local fi diff --git a/.github/workflows/helpers/install_nccl.sh b/.github/workflows/helpers/install_nccl.sh index ca88668d84..ae6793ea2a 100755 --- a/.github/workflows/helpers/install_nccl.sh +++ b/.github/workflows/helpers/install_nccl.sh @@ -8,13 +8,13 @@ cd "${BASH_SOURCE[0]%/*}" # Add NCCL key ring ubuntu_version=$(lsb_release -rs) ubuntu_version=${ubuntu_version//./} -wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.0-1_all.deb" -sudo dpkg -i cuda-keyring_1.0-1_all.deb +wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.1-1_all.deb" +sudo dpkg -i cuda-keyring_1.1-1_all.deb sudo apt update -y -rm -f cuda-keyring_1.0-1_all.deb +rm -f cuda-keyring_1.1-1_all.deb # Install NCCL -cuda_version=${1:-11.8.0} +cuda_version=${1:-12.1.1} cuda_version=$(echo "${cuda_version}" | cut -f1,2 -d'.') echo "Installing NCCL for CUDA version: ${cuda_version} ..." 
diff --git a/.github/workflows/multinode-test.yml b/.github/workflows/multinode-test.yml index 226f953b38..2fc527bf08 100644 --- a/.github/workflows/multinode-test.yml +++ b/.github/workflows/multinode-test.yml @@ -38,7 +38,7 @@ jobs: # 10h timeout, instead of default of 360min (6h) timeout-minutes: 600 container: - image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest + image: ghcr.io/flexflow/flexflow-environment-cuda-12.0:latest options: --gpus all --shm-size=8192m steps: - name: Install updated git version @@ -87,7 +87,7 @@ jobs: runs-on: self-hosted needs: gpu-ci-concierge container: - image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest + image: ghcr.io/flexflow/flexflow-environment-cuda-12.0:latest options: --gpus all --shm-size=8192m # 10h timeout, instead of default of 360min (6h) timeout-minutes: 600 @@ -138,7 +138,7 @@ jobs: runs-on: self-hosted needs: gpu-ci-concierge container: - image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest + image: ghcr.io/flexflow/flexflow-environment-cuda-12.0:latest options: --gpus all --shm-size=8192m steps: - name: Install updated git version diff --git a/.github/workflows/pip-install.yml b/.github/workflows/pip-install.yml index 3562134987..d5acbfc2e1 100644 --- a/.github/workflows/pip-install.yml +++ b/.github/workflows/pip-install.yml @@ -44,10 +44,10 @@ jobs: run: .github/workflows/helpers/free_space_on_runner.sh - name: Install CUDA - uses: Jimver/cuda-toolkit@v0.2.11 + uses: Jimver/cuda-toolkit@v0.2.16 id: cuda-toolkit with: - cuda: "11.8.0" + cuda: "12.1.1" # Disable caching of the CUDA binaries, since it does not give us any significant performance improvement use-github-cache: "false" diff --git a/.github/workflows/prebuild-legion.yml b/.github/workflows/prebuild-legion.yml index 267daaee6b..633fb00eb8 100644 --- a/.github/workflows/prebuild-legion.yml +++ b/.github/workflows/prebuild-legion.yml @@ -23,13 +23,13 @@ jobs: strategy: matrix: gpu_backend: ["cuda", "hip_rocm"] - gpu_backend_version: ["11.8", "5.6"] + gpu_backend_version: ["12.0", "5.6"] python_version: ["3.11"] exclude: - gpu_backend: "cuda" gpu_backend_version: "5.6" - gpu_backend: "hip_rocm" - gpu_backend_version: "11.8" + gpu_backend_version: "12.0" fail-fast: false steps: - name: Checkout Git Repository diff --git a/.gitignore b/.gitignore index 7f6a3c4137..cc34c1a7b6 100644 --- a/.gitignore +++ b/.gitignore @@ -187,4 +187,9 @@ gpt_tokenizer python/flexflow/version.txt inference_tensors +hf_peft_tensors +lora_training_logs + +Untitled-1.ipynb +Untitled-2.ipynb tests/inference/python_test_configs/*.json diff --git a/CMakeLists.txt b/CMakeLists.txt index c82a53644e..f06969ae04 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -567,6 +567,7 @@ if(NOT BUILD_LEGION_ONLY) if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(inference/spec_infer) add_subdirectory(inference/incr_decoding) + add_subdirectory(inference/peft) endif() diff --git a/conda/flexflow.yml b/conda/flexflow.yml index 67ef6b3419..091ba929e4 100644 --- a/conda/flexflow.yml +++ b/conda/flexflow.yml @@ -25,3 +25,10 @@ dependencies: - sentencepiece - einops - requests + - scipy + - bitsandbytes + - datasets + - accelerate + - loralib + - triton + - peft diff --git a/config/config.inc b/config/config.inc index 7d7b2db9cf..6431eaf136 100644 --- a/config/config.inc +++ b/config/config.inc @@ -197,7 +197,7 @@ fi # set ROCM path if [ -n "$ROCM_PATH" ]; then - SET_ROCM_PATH="-DROCM_PATH=${ROCM_PATH}" + SET_ROCM_PATH="-DROCM_PATH=${ROCM_PATH} 
-DHIP_ROOT_DIR=${ROCM_PATH}" fi ADD_ROCM_TO_PATH="" diff --git a/docker/build.sh b/docker/build.sh index 8ecacbc6d4..b68860712f 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -56,15 +56,14 @@ if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; the cuda_version_input=${cuda_version}.3 elif [[ "$cuda_version" == @(11.8) ]]; then cuda_version_input=${cuda_version}.0 + elif [[ "$cuda_version" == @(12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then + # Use CUDA 12.2 for all versions greater or equal to 12.2 for now (the Docker machine with CUDNN is not yet available) + cuda_version=12.2 + cuda_version_input=${cuda_version}.2 else echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}" exit 1 fi - # Use CUDA 12.2 for all versions greater or equal to 12.2 for now (the Docker machine with CUDNN is not yet available) - if [[ "$cuda_version" == @(12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then - cuda_version=12.2 - cuda_version_input=${cuda_version}.2 - fi echo "Building $image docker image with CUDA $cuda_version" ff_environment_base_image="nvidia/cuda:${cuda_version_input}-cudnn8-devel-ubuntu20.04" gpu_backend_version="-${cuda_version}" diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index cef619ad68..3434916d6b 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -94,6 +94,8 @@ RUN conda install -c conda-forge cmake make pillow cmake-build-extension pybind1 RUN conda install pytorch torchvision torchaudio -c pytorch RUN conda install -c conda-forge onnx transformers>=4.31.0 sentencepiece einops RUN pip3 install tensorflow notebook +# PEFT-related +RUN pip3 install scipy bitsandbytes datasets accelerate loralib triton peft # Install Rust RUN curl https://sh.rustup.rs -sSf | sh -s -- -y diff --git a/docker/run.sh b/docker/run.sh index 666c8e1121..cf105a10c8 100755 --- a/docker/run.sh +++ b/docker/run.sh @@ -58,7 +58,7 @@ if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; the fi fi # Check that CUDA version is supported - if [[ "$cuda_version" != @(11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2) ]]; then + if [[ "$cuda_version" != @(11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2|12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}" exit 1 fi diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 009d1c250a..873fed0bdb 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -16,6 +16,7 @@ #pragma once #include "flexflow/ffconst.h" +#include "flexflow/fftype.h" #include "legion.h" #include #include @@ -36,6 +37,18 @@ using BeamSearchBatchConfigFuture = Legion::Future; using TreeVerifyBatchConfigFuture = Legion::Future; using BeamInferenceResultFuture = Legion::Future; +struct OptimizerTasks { + bool compute_gradients = true; + bool reset_gradients_to_zero = false; + bool update_weights = false; + bool save_updated_weights = false; +}; + +void set_optimizer_tasks(OptimizerTasks &tasks, + int max_training_steps, + int completed_training_steps, + int gradient_accumulation_steps); + class BatchConfig { public: using RequestGuid = size_t; @@ -43,6 +56,8 @@ class BatchConfig { BatchConfig(); int num_active_requests() const; int num_active_tokens() const; + int num_active_infr_tokens() const; + int num_active_peft_tokens() const; 
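// [Editorial sketch, not part of this patch] set_optimizer_tasks() is only declared in
// the hunk above; its definition is not shown here. A minimal illustration of how the
// four OptimizerTasks flags could be derived from a standard gradient-accumulation
// schedule (assuming the OptimizerTasks struct declared above and <cassert>) is:
inline void set_optimizer_tasks_sketch(OptimizerTasks &tasks,
                                       int max_training_steps,
                                       int completed_training_steps,
                                       int gradient_accumulation_steps) {
  assert(gradient_accumulation_steps > 0);
  assert(completed_training_steps >= 0 &&
         completed_training_steps < max_training_steps);
  // Gradients are computed on every finetuning step.
  tasks.compute_gradients = true;
  // Zero the accumulated gradients at the start of each accumulation window.
  tasks.reset_gradients_to_zero =
      (completed_training_steps % gradient_accumulation_steps == 0);
  // Apply the optimizer update at the end of each accumulation window.
  tasks.update_weights =
      ((completed_training_steps + 1) % gradient_accumulation_steps == 0);
  // Persist the updated weights only after the final training step.
  tasks.save_updated_weights =
      (completed_training_steps == max_training_steps - 1);
}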
static int max_requests_per_batch(); static int max_tokens_per_batch(); static int max_verify_tokens_per_batch(); @@ -56,26 +71,43 @@ class BatchConfig { // Maximum possible values for different parameters // These maximum values are used for copying BatchConfig // across workers - static int const MAX_NUM_REQUESTS = 64; + static int const MAX_NUM_REQUESTS = 65; static int const MAX_NUM_TOKENS = 1024; static int const MAX_SPEC_TREE_TOKEN_NUM = 64; // Set by update - int num_tokens; + + int num_tokens = 0, num_peft_tokens = 0, num_peft_label_tokens = 0; // number of tokens in prompt phase, start offset of tokens in inc_decoding // phase. num_tokens - num_prompt_tokens = num_generation_tokens; - int num_generation_tokens; + int num_generation_tokens = 0; struct PerRequestInfo { + PerRequestInfo() { + first_token_depth_in_request = 0; + first_token_offset_in_batch = 0; + num_tokens_in_batch = 0; + max_sequence_length = 0; + request_guid = 0; + prompt_phase = false; + batch_config_request_id = -1; + peft_model_id = PEFTModelID::NO_ID; + peft_bwd = false; + optimizer_tasks = {true, false, false, false}; + } int first_token_depth_in_request; int first_token_offset_in_batch; int num_tokens_in_batch; int max_sequence_length; // request id in batch config: - int batch_config_request_id; + int batch_config_request_id = -1; bool prompt_phase = false; RequestGuid request_guid; + // PEFT fields + PEFTModelID peft_model_id; + bool peft_bwd; + OptimizerTasks optimizer_tasks; }; struct PerTokenInfo { int abs_depth_in_request; @@ -102,6 +134,7 @@ class BatchConfig { BitMask causalMask[MAX_NUM_REQUESTS]; PerRequestInfo requestsInfo[MAX_NUM_REQUESTS]; PerTokenInfo tokensInfo[MAX_NUM_TOKENS]; + PerTokenInfo labelsInfo[MAX_NUM_TOKENS]; bool request_completed[MAX_NUM_REQUESTS]; bool request_running[MAX_NUM_REQUESTS]; @@ -129,6 +162,7 @@ class TreeVerifyBatchConfig : public BatchConfig { struct InferenceResult { static int const MAX_NUM_TOKENS = BatchConfig::MAX_NUM_TOKENS; BatchConfig::TokenId token_ids[MAX_NUM_TOKENS]; + float finetuning_loss; }; class BeamSearchBatchConfig : public BatchConfig { diff --git a/include/flexflow/config.h b/include/flexflow/config.h index 2c11ae1131..dd9d657117 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -65,6 +65,25 @@ constexpr ParameterSyncType CHOSEN_SYNC_TYPE = ParameterSyncType::PS; #endif class FFConfig; +class MemoryAllocator; +class PEFTWeightAllocator; + +struct CombinedBatchConfigMetaStruct { + BatchConfig::PerTokenInfo tokens_info[BatchConfig::MAX_NUM_TOKENS]; + BatchConfig::PerRequestInfo requestsInfo[BatchConfig::MAX_NUM_REQUESTS]; + BatchConfig::BitMask causalMask[BatchConfig::MAX_NUM_REQUESTS]; + bool request_completed[BatchConfig::MAX_NUM_REQUESTS]; + + BeamSearchBatchConfig::BeamSearchPerTokenInfo + beamTokenInfo[BeamSearchBatchConfig::MAX_NUM_TOKENS + + BeamSearchBatchConfig::MAX_SPEC_TREE_TOKEN_NUM * + BeamSearchBatchConfig::MAX_NUM_REQUESTS]; + BeamSearchBatchConfig::BeamSearchPerRequestInfo + beamRequestsInfo[BeamSearchBatchConfig::MAX_NUM_REQUESTS]; + + TreeVerifyBatchConfig::CommittedTokensInfo + committed_tokens[TreeVerifyBatchConfig::MAX_NUM_TOKENS]; +}; struct FFHandler { #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) @@ -76,18 +95,18 @@ struct FFHandler { #endif void *workSpace; size_t workSpaceSize; - void *batch_config_metadata; + CombinedBatchConfigMetaStruct *batch_config_metadata; // request info + token info + topolopgy mask info - size_t batch_config_metadata_size = - sizeof(BatchConfig::tokensInfo) + 
sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo) + - sizeof(BeamSearchBatchConfig::beamRequestsInfo) + - sizeof(BatchConfig::causalMask) + - sizeof(TreeVerifyBatchConfig::committed_tokens) + - sizeof(BatchConfig::request_completed); + size_t batch_config_metadata_size = sizeof(CombinedBatchConfigMetaStruct); void *offload_reserve_space; size_t offload_reserve_space_size; + // PEFT related fields + MemoryAllocator *peft_activation_allocator; + size_t peft_activation_reserve_space_size; + PEFTWeightAllocator *peft_weight_allocator; + size_t peft_weight_reserve_space_size; + // Quantization fields DataType quantization_type; bool allowTensorOpMathConversion; #ifdef FF_USE_NCCL @@ -98,6 +117,8 @@ struct FFHandler { struct FFInitInfo { size_t workSpaceSize; size_t offload_reserve_space_size; + size_t peft_activation_reserve_space_size; + size_t peft_weight_reserve_space_size; DataType quantization_type; bool allowTensorOpMathConversion; // int myRank, allRanks; @@ -155,6 +176,10 @@ class FFConfig { bool cpu_offload; size_t offload_reserve_space_size; DataType quantization_type; + // PEFT related fields + bool enable_peft; + size_t peft_activation_reserve_space_size; + size_t peft_weight_reserve_space_size; // Control parallelizable dimensions bool only_data_parallel; bool enable_sample_parallel; diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 512645e624..24b722c36f 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -46,6 +46,12 @@ enum LossType { LOSS_IDENTITY = 54, }; +enum OptimizerType { + OPTIMIZER_TYPE_NONE = 60, + OPTIMIZER_TYPE_SGD = 61, + OPTIMIZER_TYPE_ADAM = 62, +}; + enum CompMode { COMP_MODE_TRAINING = 70, COMP_MODE_INFERENCE = 71, @@ -72,6 +78,11 @@ enum InferenceMode { TREE_VERIFY_MODE = 2003, }; +enum RequestType { + REQ_INFERENCE = 4001, + REQ_FINETUNING = 4002, +}; + // This is consistent with TASO's OpType // https://github.com/jiazhihao/TASO/blob/master/include/taso/ops.h#L75-L138 enum OperatorType { @@ -172,6 +183,8 @@ enum OperatorType { OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION, OP_TREE_INC_MULTIHEAD_SELF_ATTENTION, OP_SAMPLING, + // PEFT Ops + OP_LORA, // Parallel Ops OP_REPARTITION, OP_COMBINE, @@ -179,6 +192,7 @@ enum OperatorType { OP_REDUCTION, OP_PIPELINE, OP_ALLREDUCE, + OP_PARALLEL_IDENTITY, OP_FUSED_PARALLEL, OP_INVALID, }; @@ -193,36 +207,37 @@ enum ModelType { }; enum PMParameter { - PM_OP_TYPE, // AnyOp - PM_NUM_INPUTS, // AnyOp - PM_NUM_OUTPUTS, // AnyOp - PM_GROUP, // Conv2D - PM_KERNEL_H, // Conv2D, Pool2D - PM_KERNEL_W, // Conv2D, Pool2D - PM_STRIDE_H, // Conv2D, Pool2D - PM_STRIDE_W, // Conv2D, Pool2D - PM_PADDING_H, // Conv2D, Pool2D - PM_PADDING_W, // Conv2D, Pool2D - PM_ACTI, // Conv2D, Pool2D - PM_NUMDIM, // Concat, Transpose - PM_AXIS, // Concat, Split - PM_PERM, // Transpose - PM_OUTSHUFFLE, // Transpose - PM_MERGE_GCONV_COUNT, // MergeGConv - PM_AXES, // Squeeze, Unsqueeze, Reduce* - PM_KEEP_DIMS, // Reduce* - PM_EPSILON, // BatchNorm - PM_REPARTITION_DIM, // Repartition - PM_REPARTITION_DEGREE, // Repartition - PM_REPLICATE_DIM, // Replicate - PM_REPLICATE_DEGREE, // Replicate - PM_COMBINE_DIM, // Combine - PM_COMBINE_DEGREE, // Combine - PM_REDUCTION_DIM, // Reduction - PM_REDUCTION_DEGREE, // Reduction - PM_ALLREDUCE_DIM, // AllReduce - PM_SOFTMAX_DIM, // Softmax - PM_NUM_HEADS, // MultiHeadAttention + PM_OP_TYPE, // AnyOp + PM_NUM_INPUTS, // AnyOp + PM_NUM_OUTPUTS, // AnyOp + PM_GROUP, // Conv2D + PM_KERNEL_H, // Conv2D, Pool2D + PM_KERNEL_W, // Conv2D, Pool2D + 
PM_STRIDE_H, // Conv2D, Pool2D + PM_STRIDE_W, // Conv2D, Pool2D + PM_PADDING_H, // Conv2D, Pool2D + PM_PADDING_W, // Conv2D, Pool2D + PM_ACTI, // Conv2D, Pool2D + PM_NUMDIM, // Concat, Transpose + PM_AXIS, // Concat, Split + PM_PERM, // Transpose + PM_OUTSHUFFLE, // Transpose + PM_MERGE_GCONV_COUNT, // MergeGConv + PM_AXES, // Squeeze, Unsqueeze, Reduce* + PM_KEEP_DIMS, // Reduce* + PM_EPSILON, // BatchNorm + PM_REPARTITION_DIM, // Repartition + PM_REPARTITION_DEGREE, // Repartition + PM_REPLICATE_DIM, // Replicate + PM_REPLICATE_DEGREE, // Replicate + PM_COMBINE_DIM, // Combine + PM_COMBINE_DEGREE, // Combine + PM_REDUCTION_DIM, // Reduction + PM_REDUCTION_DEGREE, // Reduction + PM_ALLREDUCE_DIM, // AllReduce + PM_PARALLEL_IDENTITY_DIM, // AllReduce + PM_SOFTMAX_DIM, // Softmax + PM_NUM_HEADS, // MultiHeadAttention PM_INVALID, PM_PARALLEL_DIM, PM_PARALLEL_DEGREE, @@ -268,5 +283,7 @@ enum { TENSOR_GUID_LAST_VALID = 3999999, PARALLEL_TENSOR_GUID_FIRST_VALID = 4000000, NODE_GUID_FIRST_VALID = 5000000, + PEFT_MODEL_ID_FIRST_VALID = 6000000, + PEFT_MODEL_ID_LAST_VALID = 6999999 }; #endif // _FLEXFLOW_CONST_H_ diff --git a/include/flexflow/fftype.h b/include/flexflow/fftype.h index 1cd90fda26..3e482b8d67 100644 --- a/include/flexflow/fftype.h +++ b/include/flexflow/fftype.h @@ -3,6 +3,8 @@ #include "flexflow/ffconst.h" #include +#include +#include namespace FlexFlow { @@ -18,6 +20,29 @@ class LayerID { size_t id, transformer_layer_id, model_id; }; +class PEFTModelID { +public: + static const PEFTModelID NO_ID; + PEFTModelID(); + PEFTModelID(size_t id); + bool is_valid_id() const; + friend bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs); + friend std::ostream &operator<<(std::ostream &os, + PEFTModelID const &peft_model_id); + +public: + size_t id; +}; + }; // namespace FlexFlow +namespace std { +template <> +struct hash { + size_t operator()(FlexFlow::PEFTModelID const &n) const { + return n.id; + } +}; +} // namespace std + #endif // _FF_TYPE_H diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 0b74b7fce4..52b4b3d362 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -55,6 +55,11 @@ FF_NEW_OPAQUE_TYPE(flexflow_inference_manager_t); FF_NEW_OPAQUE_TYPE(flexflow_request_manager_t); FF_NEW_OPAQUE_TYPE(flexflow_file_data_loader_t); FF_NEW_OPAQUE_TYPE(flexflow_generation_result_t); +// FF_NEW_OPAQUE_TYPE(flexflow_lora_optimizer_config_t); +// FF_NEW_OPAQUE_TYPE(flexflow_lora_sgd_optimizer_config_t); +// FF_NEW_OPAQUE_TYPE(flexflow_lora_adam_optimizer_config_t); +FF_NEW_OPAQUE_TYPE(flexflow_lora_linear_config_t); +FF_NEW_OPAQUE_TYPE(flexflow_peft_model_id_t); // ----------------------------------------------------------------------- // FFConfig @@ -270,6 +275,7 @@ flexflow_tensor_t * bool elementwise_affine, float eps, bool use_bias, + bool inplace_residual, char const *name); flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( @@ -281,6 +287,7 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( bool elementwise_affine, float eps, bool use_bias, + bool inplace_residual, char const *name); flexflow_tensor_t @@ -565,6 +572,7 @@ flexflow_tensor_t * const flexflow_tensor_t input2_, float eps, int dim, + bool inplace_residual, char const *name); flexflow_tensor_t flexflow_model_add_arg_top_k(flexflow_model_t handle_, @@ -590,6 +598,9 @@ flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, bool beam_search, char const *name); +flexflow_peft_model_id_t 
flexflow_model_add_lora_layer( + flexflow_model_t handle_, const flexflow_lora_linear_config_t peft_config_); + void flexflow_model_set_sgd_optimizer(flexflow_model_t handle, flexflow_sgd_optimizer_t optimizer); @@ -613,11 +624,16 @@ void flexflow_model_set_transformer_layer_id(flexflow_model_t handle, int id); void flexflow_model_generate(flexflow_model_t handle_, int num_requests, - char const **input_text, - int max_num_chars, - char **output_text, - int max_seq_length, - int **output_length_and_tokens); + enum RequestType *request_types, + char const **input_texts, + char **output_texts, + int *max_seq_lengths, + flexflow_peft_model_id_t *peft_model_ids, + char const **dataset_filepaths, + int *training_steps, + int **output_length_and_tokens, + int *num_finetuning_losses, + float *finetuning_losses); void flexflow_model_set_position_offset(flexflow_model_t handle, int offset); @@ -978,6 +994,9 @@ void flexflow_request_manager_set_max_spec_tree_token_num( void flexflow_request_manager_set_max_sequence_length( flexflow_request_manager_t handle_, int max_seq_length); +void flexflow_request_manager_set_enable_peft_finetuning( + flexflow_request_manager_t handle_, bool enable_peft_finetuning_); + void flexflow_request_manager_register_tokenizer( flexflow_request_manager_t handle_, enum ModelType model_type, @@ -1036,6 +1055,113 @@ void flexflow_file_data_loader_destroy(flexflow_file_data_loader_t handle_); void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_, flexflow_model_t model_handle_); +// // ----------------------------------------------------------------------- +// // LoraSGDOptimizerConfig +// // ----------------------------------------------------------------------- + +// flexflow_lora_sgd_optimizer_config_t +// flexflow_lora_sgd_optimizer_config_create( +// double lr, double momentum, bool nesterov, bool weight_decay); + +// void flexflow_lora_sgd_optimizer_config_destroy( +// flexflow_lora_sgd_optimizer_config_t handle_); + +// // ----------------------------------------------------------------------- +// // LoraAdamOptimizerConfig +// // ----------------------------------------------------------------------- + +// flexflow_lora_adam_optimizer_config_t +// flexflow_lora_adam_optimizer_config_create(double alpha, +// double beta1, +// double beta2, +// double weight_decay, +// double epsilon); + +// void flexflow_lora_adam_optimizer_config_destroy( +// flexflow_lora_adam_optimizer_config_t handle_); + +// ----------------------------------------------------------------------- +// LoraLinearConfig +// ----------------------------------------------------------------------- + +flexflow_lora_linear_config_t + flexflow_lora_linear_config_create(char const *cache_folder_, + char const *peft_model_id_, + bool trainable, + bool init_lora_weights, + char const *base_model_name_or_path, + char const *precision, + int rank, + float lora_alpha, + float lora_dropout, + int num_target_modules, + char const **target_modules_, + enum OptimizerType optimizer_type, + float sgd_learning_rate, + float sgd_momentum, + bool sgd_nesterov, + float sgd_weight_decay, + float adam_alpha, + float adam_beta1, + float adam_beta2, + float adam_weight_decay, + float adam_epsilon); + +void flexflow_lora_linear_config_destroy(flexflow_lora_linear_config_t handle_); + +char const *flexflow_lora_linear_config_get_cache_folder( + flexflow_lora_linear_config_t handle_); + +char const *flexflow_lora_linear_config_get_peft_model_id( + flexflow_lora_linear_config_t handle_); + +int 
flexflow_lora_linear_config_get_rank(flexflow_lora_linear_config_t handle_); + +float flexflow_lora_linear_config_get_lora_alpha( + flexflow_lora_linear_config_t handle_); + +float flexflow_lora_linear_config_get_lora_dropout( + flexflow_lora_linear_config_t handle_); + +bool flexflow_lora_linear_config_get_trainable( + flexflow_lora_linear_config_t handle_); + +bool flexflow_lora_linear_config_get_init_lora_weights( + flexflow_lora_linear_config_t handle_); + +char const **flexflow_lora_linear_config_get_target_modules( + flexflow_lora_linear_config_t handle_, int *num_target_modules); + +char const *flexflow_lora_linear_config_get_base_model_name_or_path( + flexflow_lora_linear_config_t handle_); + +char const *flexflow_lora_linear_config_get_precision( + flexflow_lora_linear_config_t handle_); + +void flexflow_lora_linear_config_set_lora_alpha( + flexflow_lora_linear_config_t handle_, float value); + +void flexflow_lora_linear_config_set_lora_dropout( + flexflow_lora_linear_config_t handle_, float value); + +void flexflow_lora_linear_config_set_trainable( + flexflow_lora_linear_config_t handle_, bool value); + +void flexflow_lora_linear_config_set_init_lora_weights( + flexflow_lora_linear_config_t handle_, bool value); + +// ----------------------------------------------------------------------- +// PEFTModelID +// ----------------------------------------------------------------------- + +flexflow_peft_model_id_t flexflow_peft_model_id_create(); + +flexflow_peft_model_id_t flexflow_peft_model_id_create_id(unsigned long id); + +flexflow_peft_model_id_t flexflow_peft_model_id_no_id(); + +void flexflow_peft_model_id_destroy(flexflow_peft_model_id_t handle_); + #ifdef __cplusplus } #endif diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h index f24a797ffd..ba4101c173 100644 --- a/include/flexflow/inference.h +++ b/include/flexflow/inference.h @@ -40,6 +40,7 @@ struct GenerationResult { std::string output_text; std::vector input_tokens; std::vector output_tokens; + std::vector finetuning_losses; }; #include diff --git a/include/flexflow/layer.h b/include/flexflow/layer.h index 69a57e4e1c..c3dbcac422 100644 --- a/include/flexflow/layer.h +++ b/include/flexflow/layer.h @@ -49,7 +49,7 @@ class Layer { Tensor outputs[MAX_NUM_OUTPUTS]; Tensor inputs[MAX_NUM_INPUTS]; Tensor weights[MAX_NUM_WEIGHTS]; - bool trainableInputs[MAX_NUM_INPUTS]; + // bool trainable_inputs[MAX_NUM_INPUTS]; int numInputs, numWeights, numOutputs; bool profiling; bool inference_debugging; diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 6dda67bbfe..4ad735ef7d 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -108,19 +108,31 @@ enum TaskIDs { LAYERNORM_FWD_TASK_ID, LAYERNORM_INF_TASK_ID, LAYERNORM_BWD_TASK_ID, + LAYERNORM_PEFT_BWD_TASK_ID, RESIDUAL_LAYERNORM_INIT_TASK_ID, RESIDUAL_LAYERNORM_INF_TASK_ID, + RESIDUAL_LAYERNORM_BWD_TASK_ID, + RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, ADD_BIAS_RESIDUAL_LAYERNORM_INIT_TASK_ID, ADD_BIAS_RESIDUAL_LAYERNORM_INF_TASK_ID, + ADD_BIAS_RESIDUAL_LAYERNORM_BWD_TASK_ID, + ADD_BIAS_RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, SIGMOID_SILU_MULTI_INIT_TASK_ID, SIGMOID_SILU_MULTI_INF_TASK_ID, + SIGMOID_SILU_MULTI_BWD_TASK_ID, + SIGMOID_SILU_MULTI_PEFT_BWD_TASK_ID, LINEAR_INIT_TASK_ID, LINEAR_INIT_PARA_TASK_ID, LINEAR_INF_TASK_ID, + LINEAR_PEFT_BWD_TASK_ID, LINEAR_FWD_TASK_ID, LINEAR_BWD_TASK_ID, LINEAR_BWD2_TASK_ID, LINEAR_UPD_TASK_ID, + LORA_LINEAR_INIT_TASK_ID, + LORA_LINEAR_REG_TASK_ID, + LORA_LINEAR_INF_TASK_ID, + 
LORA_LINEAR_PEFT_BWD_TASK_ID, FLAT_INIT_TASK_ID, FLAT_FWD_TASK_ID, FLAT_BWD_TASK_ID, @@ -128,6 +140,7 @@ enum TaskIDs { SOFTMAX_FWD_TASK_ID, SOFTMAX_BWD_TASK_ID, SOFTMAX_INF_TASK_ID, + SOFTMAX_PEFT_BWD_TASK_ID, CONCAT_INIT_TASK_ID, CONCAT_FWD_TASK_ID, CONCAT_BWD_TASK_ID, @@ -163,20 +176,26 @@ enum TaskIDs { RMSNORM_INIT_TASK_ID, RMSNORM_FWD_TASK_ID, RMSNORM_INF_TASK_ID, + RMSNORM_BWD_TASK_ID, + RMSNORM_PEFT_BWD_TASK_ID, RESIDUAL_RMSNORM_INIT_TASK_ID, RESIDUAL_RMSNORM_INF_TASK_ID, + RESIDUAL_RMSNORM_BWD_TASK_ID, + RESIDUAL_RMSNORM_PEFT_BWD_TASK_ID, BEAM_TOPK_INIT_TASK_ID, BEAM_TOPK_INF_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_FWD_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_BWD_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, + INC_MULTIHEAD_SELF_ATTENTION_PEFT_BWD_TASK_ID, SPEC_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, SPEC_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, TREE_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, TREE_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, MSELOSS_BWD_TASK_ID, FUSEDOP_INIT_TASK_ID, + FUSEDOP_PEFT_BWD_TASK_ID, FUSEDOP_FWD_TASK_ID, FUSEDOP_BWD_TASK_ID, FUSEDOP_INF_TASK_ID, @@ -224,10 +243,13 @@ enum TaskIDs { REPARTITION_BWD_TASK_ID, COMBINE_INIT_TASK_ID, COMBINE_FWD_TASK_ID, + COMBINE_INF_TASK_ID, COMBINE_BWD_TASK_ID, + COMBINE_PEFT_BWD_TASK_ID, REPLICATE_INIT_TASK_ID, REPLICATE_FWD_TASK_ID, REPLICATE_BWD_TASK_ID, + REPLICATE_PEFT_BWD_TASK_ID, REDUCTION_INIT_TASK_ID, REDUCTION_FWD_TASK_ID, REDUCTION_BWD_TASK_ID, @@ -235,9 +257,15 @@ enum TaskIDs { PIPELINE_FWD_TASK_ID, PIPELINE_BWD_TASK_ID, ALLREDUCE_INIT_TASK_ID, - ALLREDUCE_INF_TASK_ID, ALLREDUCE_FWD_TASK_ID, ALLREDUCE_BWD_TASK_ID, + ALLREDUCE_INF_TASK_ID, + ALLREDUCE_PEFT_BWD_TASK_ID, + PARALLEL_IDENTITY_INIT_TASK_ID, + PARALLEL_IDENTITY_FWD_TASK_ID, + PARALLEL_IDENTITY_BWD_TASK_ID, + PARALLEL_IDENTITY_INF_TASK_ID, + PARALLEL_IDENTITY_PEFT_BWD_TASK_ID, FUSED_PARALLELOP_INIT_TASK_ID, FUSED_PARALLELOP_FWD_TASK_ID, FUSED_PARALLELOP_BWD_TASK_ID, @@ -327,6 +355,7 @@ class ResidualLayerNorm; class AddBiasResidualLayerNorm; class SigmoidSiluMulti; class Linear; +class LoraLinear; class MultiHeadAttention; class IncMultiHeadSelfAttention; class TreeIncMultiHeadSelfAttention; @@ -349,9 +378,12 @@ class Repartition; class Reduction; class Replicate; class AllReduce; +class ParallelIdentity; class FusedParallelOp; class ParallelOpInfo; +struct Request; + // TODO: Move to an appropriate place /* This is used to create a type that recursively replaces value type @@ -561,6 +593,7 @@ class FFModel { bool elementwise_affine, float eps, bool use_bias = true, + bool inplace_residual = false, DataType data_type = DT_NONE, char const *name = NULL); // Add a add_bias_residual_layer_norm layer @@ -571,6 +604,7 @@ class FFModel { bool elementwise_affine, float eps, bool use_bias = true, + bool inplace_residual = false, DataType data_type = DT_NONE, char const *name = NULL); // Add a sigmoid_silu_multi layer @@ -599,6 +633,7 @@ class FFModel { Tensor *outputs, float eps, int dim, + bool inplace_residual = false, DataType data_type = DT_NONE, char const *name = NULL); // Add a beam search top k layer @@ -808,10 +843,13 @@ class FFModel { bool position_bias = false, char const *name = NULL); // ======================================== + // PEFT Layers + // ======================================== + PEFTModelID *add_lora_layer(LoraLinearConfig const peft_config); + // ======================================== // Inference APIs // ======================================== - std::vector generate(std::vector 
&prompts, - int max_seq_length); + std::vector generate(std::vector const &requests); Tensor create_tensor_legion_ordering(int num_dim, int const dims[], @@ -1103,6 +1141,9 @@ class FFModel { Legion::IndexSpace get_task_is(Legion::Domain const &domain) const; Legion::IndexSpace get_task_is(ParallelConfig const &pc) const; Legion::IndexSpace get_task_is(MachineView const &view) const; + bool need_to_add_combine(int layer_idx) const; + bool need_to_add_allreduce(int layer_idx) const; + bool need_to_add_parallel_identity(int layer_idx) const; bool is_mlp_block(int layer_idx) const; void create_operators_from_layers(); Op *create_operator_from_layer(Layer *layer, @@ -1117,7 +1158,7 @@ class FFModel { void clear_graph_search_cache(); public: - size_t op_global_guid, layer_global_guid; + size_t op_global_guid, layer_global_guid, peft_model_global_guid; size_t tensor_global_guid, parallel_tensor_global_guid, node_global_guid; size_t current_transformer_layer_id; // positional embedding start offset @@ -1137,6 +1178,12 @@ class FFModel { std::vector layers; std::vector operators; std::vector parameters; + // PEFT related + std::unordered_map base_layer_to_peft_layer; + std::unordered_map> peft_layer_to_peft_id; + std::unordered_map peft_configs; + // std::vector peft_operators; + FFHandler handlers[MAX_NUM_WORKERS]; Legion::Future current_metrics; // Cached operators: key: operator hash, value: operator pointer @@ -1195,6 +1242,10 @@ class FFModel { SigmoidSiluMulti *>, std::unordered_map, Linear *>, + std::unordered_map< + std::pair, + LoraLinearParams>, + LoraLinear *>, std::unordered_map, Pool2D *>, std::unordered_map, std::unordered_map, AllReduce *>, + std::unordered_map, + ParallelIdentity *>, std::unordered_map, FusedParallelOp *>> cached_ops; diff --git a/include/flexflow/op_meta.h b/include/flexflow/op_meta.h index 60785a1e29..d31c12b16c 100644 --- a/include/flexflow/op_meta.h +++ b/include/flexflow/op_meta.h @@ -9,7 +9,7 @@ class Op; class OpMeta { public: - OpMeta(FFHandler _handle); + // OpMeta(FFHandler _handle); OpMeta(FFHandler _handle, Op const *op); public: @@ -17,9 +17,11 @@ class OpMeta { bool profiling; // Measure the run time of the task bool inference_debugging; int decoding_step; + int bwd_step; char op_name[MAX_OPNAME]; LayerID layer_guid; - bool trainableInputs[MAX_NUM_INPUTS]; + bool trainable_inputs[MAX_NUM_INPUTS]; + bool reset_input_grads[MAX_NUM_INPUTS]; DataType input_type[MAX_NUM_INPUTS]; DataType weight_type[MAX_NUM_WEIGHTS]; DataType output_type[MAX_NUM_OUTPUTS]; diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 311699d926..1a5af67b36 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -7,7 +7,9 @@ #include "flexflow/machine_view.h" #include "flexflow/parallel_tensor.h" #include "flexflow/utils/dot/record_formatter.h" +#include #include +namespace fs = std::filesystem; #include #include @@ -29,6 +31,11 @@ enum class MappingRecordType { INPUT_OUTPUT, INPUT_WEIGHT }; enum class MappingOperation { PARTITION, REPLICATE }; +fs::path get_dst_folder(std::string const &subdir, + int step_idx = 0, + int shard_idx = 0, + bool before_kernel = false); + /** @brief A class to keep track of a dimension relation between two tensors * used by an operator. 
* @@ -236,11 +243,18 @@ class Op { Legion::FutureMap empty_map; return empty_map; }; + virtual Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) { + assert(false); + } virtual void print_layer(FFModel const &model) = 0; template static std::string get_op_name_without_uid(OpMetaType *m) { std::string op_name_without_uid = std::string(m->op_name); - size_t last_underscore = op_name_without_uid.length() - 1; + size_t last_underscore = op_name_without_uid.length(); for (int i = op_name_without_uid.length() - 1; i > 0; i--) { if (!(std::isdigit(m->op_name[i]) || m->op_name[i] == '_')) { break; @@ -248,7 +262,9 @@ class Op { last_underscore = i; } } - op_name_without_uid.erase(last_underscore); + if (last_underscore < op_name_without_uid.length()) { + op_name_without_uid.erase(last_underscore); + } return op_name_without_uid; } template @@ -259,31 +275,42 @@ class Op { std::vector input_tensors, std::vector weight_tensors, std::vector output_tensors, + bool fwd_pass = true, bool before_kernel = false) { - // Check if output directory exists, and create it if it does not - char const *folder_path = "./inference_tensors"; - struct stat st = {0}; - if (stat(folder_path, &st) == -1) { - // Directory does not exist, create it - mkdir(folder_path, 0700); - } - // output base filepath, shared by all tensors from the same operator + // get operator name and print it std::string op_name_without_uid = get_op_name_without_uid(m); - std::string base_filepath = - "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + - "_decoding-step_" + std::to_string(m->decoding_step) + "_layer-num_" + - std::to_string(m->layer_guid.transformer_layer_id) + "_layer-name_" + - op_name_without_uid + "_shard-id_" + std::to_string(shard_id); - if (before_kernel) { - base_filepath += "_pre"; + std::cout << (fwd_pass ? "INF " : "BWD ") << op_name_without_uid + << std::endl; + // build the path to save the tensor + fs::path dst_filepath; + if (fwd_pass) { + dst_filepath = + get_dst_folder("fwd", m->decoding_step, shard_id, before_kernel); + } else { + dst_filepath = + get_dst_folder("bwd", m->bwd_step, shard_id, before_kernel); + } + if (m->layer_guid.model_id > 0) { + assert(false && "Model ID > 0 not supported yet"); } + std::string layername = "layers." + + std::to_string(m->layer_guid.transformer_layer_id) + + "." + op_name_without_uid; + dst_filepath /= layername; + // save batch config, if passed if (bc != nullptr) { - bc->save_to_file(base_filepath + "_batch-config"); + bc->save_to_file(dst_filepath.string() + ".batch_config"); } + // save all inputs for (int i = 0; i < input_tensors.size(); i++) { - std::string filename = base_filepath + "_input_" + std::to_string(i); + std::string filename = dst_filepath.string() + ".input_"; + if (fwd_pass) { + filename += std::to_string(i); + } else { + filename += "gradient_" + std::to_string(i); + } if (input_tensors[i].data_type == DT_FLOAT) { save_tensor(input_tensors[i].get_float_ptr(), input_tensors[i].domain.get_volume(), @@ -304,10 +331,17 @@ class Op { assert(false && "Tensor data type not supported"); } } - // only dump the weights once - if (m->decoding_step == 0) { + + // only dump the weights in the forward pass, at the first step + // note that we do not save the weight gradients, since we only support + // finetuning LoRA weights, which are not FF tensors. 
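      // [Editorial note, illustration only] With the naming scheme above, a forward-pass
      // dump for a hypothetical operator named "attention" in transformer layer 5 would
      // contain files such as
      //     <fwd dump folder>/layers.5.attention.batch_config
      //     <fwd dump folder>/layers.5.attention.input_0
      //     <fwd dump folder>/layers.5.attention.output_0
      // while the backward pass writes .input_gradient_<i> / .output_gradient_<i>
      // instead, and the weights are saved once under a separate "weights" dump folder
      // as <weights dump folder>/layers.5.attention.weight_<i>. The exact directory
      // layout returned by get_dst_folder() (e.g., whether it encodes the step and
      // shard indices) is not shown in this hunk.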
+ if (fwd_pass && m->decoding_step == 0) { + fs::path dst_filepath_weights = + get_dst_folder("weights", m->decoding_step, shard_id, before_kernel) / + layername; for (int i = 0; i < weight_tensors.size(); i++) { - std::string filename = base_filepath + "_weight_" + std::to_string(i); + std::string filename = + dst_filepath_weights.string() + ".weight_" + std::to_string(i); if (weight_tensors[i].data_type == DT_FLOAT) { save_tensor(weight_tensors[i].get_float_ptr(), weight_tensors[i].domain.get_volume(), @@ -329,9 +363,15 @@ class Op { } } } + // save all outputs for (int i = 0; i < output_tensors.size(); i++) { - std::string filename = base_filepath + "_output_" + std::to_string(i); + std::string filename = dst_filepath.string() + ".output_"; + if (fwd_pass) { + filename += std::to_string(i); + } else { + filename += "gradient_" + std::to_string(i); + } if (output_tensors[i].data_type == DT_FLOAT) { save_tensor(output_tensors[i].get_float_ptr(), output_tensors[i].domain.get_volume(), @@ -354,7 +394,11 @@ class Op { } // increase count of decoding steps if (!before_kernel) { - m->decoding_step++; + if (fwd_pass) { + m->decoding_step++; + } else { + m->bwd_step++; + } } } virtual bool measure_operator_cost(Simulator *sim, @@ -448,7 +492,8 @@ class Op { ParallelTensor outputs[MAX_NUM_OUTPUTS]; ParallelTensor inputs[MAX_NUM_INPUTS]; ParallelParameter weights[MAX_NUM_WEIGHTS]; - bool trainableInputs[MAX_NUM_INPUTS]; + bool trainable_inputs[MAX_NUM_INPUTS]; + bool reset_input_grads[MAX_NUM_INPUTS]; OpMeta *meta[MAX_NUM_WORKERS]; std::map inference_meta; int numInputs, numWeights, numOutputs; diff --git a/include/flexflow/operator_params.h b/include/flexflow/operator_params.h index 5b187839ef..673f78ad46 100644 --- a/include/flexflow/operator_params.h +++ b/include/flexflow/operator_params.h @@ -23,6 +23,7 @@ #include "flexflow/ops/inc_multihead_self_attention_params.h" #include "flexflow/ops/layer_norm_params.h" #include "flexflow/ops/linear_params.h" +#include "flexflow/ops/lora_linear_params.h" #include "flexflow/ops/pool_2d_params.h" #include "flexflow/ops/reduce_params.h" #include "flexflow/ops/reshape_params.h" @@ -40,6 +41,7 @@ #include "flexflow/parallel_ops/allreduce_params.h" #include "flexflow/parallel_ops/combine_params.h" #include "flexflow/parallel_ops/fused_parallel_op_params.h" +#include "flexflow/parallel_ops/parallel_identity_params.h" #include "flexflow/parallel_ops/partition_params.h" #include "flexflow/parallel_ops/reduction_params.h" #include "flexflow/parallel_ops/replicate_params.h" @@ -67,6 +69,7 @@ using OperatorParameters = mp::variant; tl::optional get_op_parameters(Op const *op); diff --git a/include/flexflow/ops/add_bias_residual_layer_norm.h b/include/flexflow/ops/add_bias_residual_layer_norm.h index bb470376c3..9510ac0f28 100644 --- a/include/flexflow/ops/add_bias_residual_layer_norm.h +++ b/include/flexflow/ops/add_bias_residual_layer_norm.h @@ -24,8 +24,10 @@ class AddBiasResidualLayerNorm : public Op { bool _elementwise_affine, bool _use_bias, float _eps, + bool _inplace_residual, bool allocate_weights, char const *name); + void map_output_tensors(FFModel &ff) override; void init(FFModel const &) override; void init_inference(FFModel const &, std::vector const &, @@ -38,6 +40,11 @@ class AddBiasResidualLayerNorm : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = 
nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -61,6 +68,14 @@ class AddBiasResidualLayerNorm : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; @@ -76,21 +91,55 @@ class AddBiasResidualLayerNorm : public Op { T const *gamma_ptr, T const *beta_ptr, ffStream_t stream); - static void inference_kernel_wrapper(AddBiasResidualLayerNormMeta const *m, - int attn_bias_dim, - int residual_volume, + static void inference_kernel_wrapper(AddBiasResidualLayerNormMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input, + GenericTensorAccessorR const &attn_bias, + GenericTensorAccessorR const &residual, GenericTensorAccessorW &added_output, GenericTensorAccessorW &output, - GenericTensorAccessorR const &residual, - GenericTensorAccessorR const &attn_bias, GenericTensorAccessorR const &gamma, GenericTensorAccessorR const &beta); + template + static void backward_kernel(AddBiasResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T const *added_output_ptr, + T *input_grad_ptr, + T *residual_grad_ptr, + T *attn_bias_grad_ptr, + T const *gamma_ptr, + T *gamma_grad_ptr, + T *beta_grad_ptr, + ffStream_t stream); + static void + backward_kernel_wrapper(AddBiasResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR &added_output, + GenericTensorAccessorW &input_grad, + GenericTensorAccessorW const &residual_grad, + GenericTensorAccessorW const &attn_bias_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad); + template + static void peft_bwd_kernel(AddBiasResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T *residual_grad_ptr, + T const *gamma_ptr, + ffStream_t stream); + static void + peft_bwd_kernel_wrapper(AddBiasResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW &input_grad, + GenericTensorAccessorW const &residual_grad, + GenericTensorAccessorR const &gamma); public: bool elementwise_affine, use_bias; int64_t effective_batch_size, effective_num_elements; float eps; + bool inplace_residual; std::vector axes; }; @@ -105,8 +154,12 @@ class AddBiasResidualLayerNormMeta : public OpMeta { bool elementwise_affine, use_bias; int64_t effective_batch_size, effective_num_elements; float eps; + bool inplace_residual; void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; Realm::RegionInstance reserveInst; + // PEFT related fields + void *input_activation; + size_t allocated_peft_buffer_size = 0; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/add_bias_residual_layer_norm_params.h b/include/flexflow/ops/add_bias_residual_layer_norm_params.h index 87fe2fb562..840f521b01 100644 --- a/include/flexflow/ops/add_bias_residual_layer_norm_params.h +++ b/include/flexflow/ops/add_bias_residual_layer_norm_params.h @@ -12,6 +12,7 @@ struct AddBiasResidualLayerNormParams { bool elementwise_affine; float eps; bool use_bias; + bool inplace_residual; char name[MAX_OPNAME]; bool is_valid( std::pair const &) const; diff --git 
a/include/flexflow/ops/aggregate.h b/include/flexflow/ops/aggregate.h index 3ba4f414d1..283e9a4290 100644 --- a/include/flexflow/ops/aggregate.h +++ b/include/flexflow/ops/aggregate.h @@ -11,9 +11,11 @@ namespace FlexFlow { #define AGGREGATE_MAX_BATCH_SIZE 64 #define AGGREGATE_MAX_N 128 +class Aggregate; + class AggregateMeta : public OpMeta { public: - AggregateMeta(FFHandler handle, int n); + AggregateMeta(FFHandler handle, Aggregate const *aggr); ~AggregateMeta(void); float **dev_exp_preds; float **dev_exp_grads; diff --git a/include/flexflow/ops/aggregate_spec.h b/include/flexflow/ops/aggregate_spec.h index 4302dd0733..a9f651b620 100644 --- a/include/flexflow/ops/aggregate_spec.h +++ b/include/flexflow/ops/aggregate_spec.h @@ -11,9 +11,11 @@ namespace FlexFlow { #define AGGREGATE_SPEC_MAX_BATCH_SIZE 32 #define AGGREGATE_SPEC_MAX_N 12 +class AggregateSpec; + class AggregateSpecMeta : public OpMeta { public: - AggregateSpecMeta(FFHandler handle, int n); + AggregateSpecMeta(FFHandler handle, AggregateSpec const *agg); ~AggregateSpecMeta(void); float **dev_region_ptrs; }; diff --git a/include/flexflow/ops/argmax.h b/include/flexflow/ops/argmax.h index 298059e3ed..eca9943d20 100644 --- a/include/flexflow/ops/argmax.h +++ b/include/flexflow/ops/argmax.h @@ -17,6 +17,7 @@ class ArgMaxMeta : public OpMeta { size_t temp_storage_bytes = 0; int *d_offsets; void *d_out; + float *d_loss; Realm::RegionInstance reserveInst; ArgMaxMeta(FFHandler handler, Op const *op, @@ -89,18 +90,22 @@ class ArgMax : public Op { CostMetrics &cost_metrics) const override; template static void forward_kernel(ArgMaxMeta const *m, - DT *input_ptr, + BatchConfig const *bc, + DT const *input_ptr, int *indices_ptr, float *prob_ptr, int *parent_ptr, int length, int batch_size, + float *loss, ffStream_t stream); static void forward_kernel_wrapper(ArgMaxMeta const *m, - GenericTensorAccessorW const &input, + BatchConfig const *bc, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &indices, GenericTensorAccessorW const &parent, - int batch_size); + int batch_size, + float *loss); Params get_params() const; public: diff --git a/include/flexflow/ops/cache.h b/include/flexflow/ops/cache.h index 1fbb1fa059..4f0b94ee5c 100644 --- a/include/flexflow/ops/cache.h +++ b/include/flexflow/ops/cache.h @@ -5,9 +5,11 @@ namespace FlexFlow { +class Cache; + class CacheMeta : public OpMeta { public: - CacheMeta(FFHandler handle); + CacheMeta(FFHandler handle, Cache const *c); float cache_score; }; diff --git a/include/flexflow/ops/element_unary.h b/include/flexflow/ops/element_unary.h index ddef59549c..043b5d19a7 100644 --- a/include/flexflow/ops/element_unary.h +++ b/include/flexflow/ops/element_unary.h @@ -12,9 +12,11 @@ namespace FlexFlow { +class ElementUnary; + class ElementUnaryMeta : public OpMeta { public: - ElementUnaryMeta(FFHandler handle); + ElementUnaryMeta(FFHandler handle, ElementUnary const *unary); #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) cudnnTensorDescriptor_t inputTensor, outputTensor; cudnnActivationDescriptor_t actiDesc; diff --git a/include/flexflow/ops/embedding.h b/include/flexflow/ops/embedding.h index ed89fcf37a..c90e1773e0 100644 --- a/include/flexflow/ops/embedding.h +++ b/include/flexflow/ops/embedding.h @@ -60,6 +60,11 @@ class Embedding : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = 
nullptr) override; // void update(const FFModel&); void print_layer(FFModel const &model) override { assert(0); diff --git a/include/flexflow/ops/experts.h b/include/flexflow/ops/experts.h index d68957d890..1ed4678a5b 100644 --- a/include/flexflow/ops/experts.h +++ b/include/flexflow/ops/experts.h @@ -6,20 +6,11 @@ namespace FlexFlow { +class Experts; + class ExpertsMeta : public OpMeta { public: - ExpertsMeta(FFHandler handler, - int _num_experts, - int _experts_start_idx, - int _data_dim, - int _out_dim, - int _experts_num_layers, - int _experts_internal_dim_size, - int _effective_batch_size, - int _num_chosen_experts, - float _alpha, - bool _use_bias, - ActiMode _activation); + ExpertsMeta(FFHandler handler, Experts const *e); ~ExpertsMeta(void); // Thrust helper arrays @@ -138,7 +129,7 @@ class Experts : public Op { float *output, float const *weights, float const *biases, - int num_active_tokens, + int num_active_infr_tokens, int chosen_experts, int batch_size, int out_dim); diff --git a/include/flexflow/ops/fused.h b/include/flexflow/ops/fused.h index a8326e9ab4..02ab1db7b5 100644 --- a/include/flexflow/ops/fused.h +++ b/include/flexflow/ops/fused.h @@ -49,6 +49,11 @@ class FusedOp : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -60,6 +65,10 @@ class FusedOp : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void forward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, diff --git a/include/flexflow/ops/groupby.h b/include/flexflow/ops/groupby.h index ec6cdfb9ab..73025216cd 100644 --- a/include/flexflow/ops/groupby.h +++ b/include/flexflow/ops/groupby.h @@ -8,9 +8,11 @@ namespace FlexFlow { +class Group_by; + class GroupByMeta : public OpMeta { public: - GroupByMeta(FFHandler handle, int n, float _alpha); + GroupByMeta(FFHandler handle, Group_by const *gb); ~GroupByMeta(void); float alpha; float **dev_region_ptrs; diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index 43dc527bc8..f77df7c456 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -96,6 +96,11 @@ class IncMultiHeadSelfAttention : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -109,17 +114,27 @@ class IncMultiHeadSelfAttention : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const override; - - static void inference_kernel_wrapper(IncMultiHeadSelfAttentionMeta const *m, + static void inference_kernel_wrapper(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int 
shard_id, GenericTensorAccessorR const &input, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &output, GenericTensorAccessorR const &bias); + static void peft_bwd_kernel_wrapper(IncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + int shard_id, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &bias); Params get_params() const; public: @@ -204,6 +219,10 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { // typedef hipFloatComplex attFloatComplex; hipFloatComplex *complex_input; #endif + // PEFT specific fields + void *softmax_activation_buffer; + void *query_activation_buffer; + size_t allocated_peft_buffer_size1 = 0, allocated_peft_buffer_size2 = 0; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/kernels/batch_matmul_kernels.h b/include/flexflow/ops/kernels/batch_matmul_kernels.h index 4de774ee06..c3923c4d4b 100644 --- a/include/flexflow/ops/kernels/batch_matmul_kernels.h +++ b/include/flexflow/ops/kernels/batch_matmul_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class BatchMatmul; + class BatchMatmulMeta : public OpMeta { public: - BatchMatmulMeta(FFHandler handler); + BatchMatmulMeta(FFHandler handler, BatchMatmul const *bmm); int a_seq_length_dim, b_seq_length_dim; }; diff --git a/include/flexflow/ops/kernels/cast_kernels.h b/include/flexflow/ops/kernels/cast_kernels.h index 3001d913ca..d601601ea2 100644 --- a/include/flexflow/ops/kernels/cast_kernels.h +++ b/include/flexflow/ops/kernels/cast_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Cast; + class CastMeta : public OpMeta { public: - CastMeta(FFHandler handle); + CastMeta(FFHandler handle, Cast const *cast); DataType input_data_type, output_data_type; }; diff --git a/include/flexflow/ops/kernels/concat_kernels.h b/include/flexflow/ops/kernels/concat_kernels.h index 4da6aaf5e2..4562ae871a 100644 --- a/include/flexflow/ops/kernels/concat_kernels.h +++ b/include/flexflow/ops/kernels/concat_kernels.h @@ -8,9 +8,11 @@ namespace FlexFlow { +class Concat; + class ConcatMeta : public OpMeta { public: - ConcatMeta(FFHandler handle) : OpMeta(handle){}; + ConcatMeta(FFHandler handle, Concat const *cc); int legion_axis; }; diff --git a/include/flexflow/ops/kernels/conv_2d_kernels.h b/include/flexflow/ops/kernels/conv_2d_kernels.h index 7b2a0fe135..f83e4687d7 100644 --- a/include/flexflow/ops/kernels/conv_2d_kernels.h +++ b/include/flexflow/ops/kernels/conv_2d_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Conv2D; + class Conv2DMeta : public OpMeta { public: - Conv2DMeta(FFHandler handler); + Conv2DMeta(FFHandler handler, Conv2D const *conv); #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) cudnnTensorDescriptor_t inputTensor, biasTensor, outputTensor; cudnnFilterDescriptor_t filterDesc; diff --git a/include/flexflow/ops/kernels/flat_kernels.h b/include/flexflow/ops/kernels/flat_kernels.h index caf817512d..6aa5a13b42 100644 --- a/include/flexflow/ops/kernels/flat_kernels.h +++ b/include/flexflow/ops/kernels/flat_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Flat; + class FlatMeta : public OpMeta { public: - FlatMeta(FFHandler handle) : OpMeta(handle){}; + FlatMeta(FFHandler handle, Flat const *flat); }; namespace Kernels { diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh index d1e0e050b2..3d122d4bc5 100644 --- 
a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh @@ -385,6 +385,25 @@ inline __device__ void zero(T &dst) { dst = tmp.raw; } +template +__device__ __forceinline__ T WARP_SHFL(unsigned mask, T var, int srcLane, int width=warpSize) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_sync(mask, var, srcLane, width); +#else + return __shfl(var, srcLane, width); +#endif +} + +template +__device__ __forceinline__ T WARP_SHFL_XOR(unsigned mask, T var, int laneMask, int width=warpSize) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_xor_sync(mask, var, laneMask, width); +#else + return __shfl_xor(var, laneMask, width); +#endif +} + + template inline __device__ float qk_dot_(K_vec const (&q)[N], K_vec const (&k)[N]) { // use float32 to get better accuracy @@ -401,7 +420,7 @@ inline __device__ float qk_dot_(K_vec const (&q)[N], K_vec const (&k)[N]) { float qk = sum(qk_vec); #pragma unroll for (int mask = THREADS_PER_KEY / 2; mask >= 1; mask /= 2) { - qk += __shfl_xor_sync(uint32_t(-1), qk, mask); + qk += WARP_SHFL_XOR(uint32_t(-1), qk, mask); } return qk; } @@ -423,7 +442,7 @@ inline __device__ float block_sum(float *red_smem, float sum) { // Compute the sum per warp. #pragma unroll for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { - sum += __shfl_xor_sync(uint32_t(-1), sum, mask); + sum += WARP_SHFL_XOR(uint32_t(-1), sum, mask); } // Warp leaders store the data to shared memory. @@ -442,11 +461,11 @@ inline __device__ float block_sum(float *red_smem, float sum) { // Parallel reduction inside the warp. #pragma unroll for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { - sum += __shfl_xor_sync(uint32_t(-1), sum, mask); + sum += WARP_SHFL_XOR(uint32_t(-1), sum, mask); } // Broadcast to other threads. 
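  // [Editorial note] WARP_SHFL / WARP_SHFL_XOR above are portability shims: on CUDA
  // they forward to __shfl_sync / __shfl_xor_sync with an explicit participation mask,
  // while on HIP they fall back to the legacy, mask-free __shfl / __shfl_xor, so the
  // mask argument is simply ignored on that path.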
- return __shfl_sync(uint32_t(-1), sum, 0); + return WARP_SHFL(uint32_t(-1), sum, 0); } template diff --git a/include/flexflow/ops/kernels/linear_kernels.h b/include/flexflow/ops/kernels/linear_kernels.h index a5fdc7c602..90e50a0c9a 100644 --- a/include/flexflow/ops/kernels/linear_kernels.h +++ b/include/flexflow/ops/kernels/linear_kernels.h @@ -35,6 +35,9 @@ class LinearMeta : public OpMeta { float kernel_reg_lambda; bool use_bias, add_bias_only_once; Realm::RegionInstance reserveInst; + // PEFT related fields + void *output_activation_buffer; + size_t allocated_peft_buffer_size = 0; }; namespace Kernels { @@ -48,6 +51,23 @@ void forward_kernel_wrapper(LinearMeta const *m, int in_dim, int out_dim, int batch_size); +void inference_kernel_wrapper(LinearMeta *m, + BatchConfig const *bc, + void const *input_ptr, + void *output_ptr, + void const *filter_ptr, + void const *bias_ptr, + int in_dim, + int out_dim, + int batch_size); +void peft_bwd_kernel_wrapper(LinearMeta const *m, + void *input_grad_ptr, + void *output_grad_ptr, + void const *kernel_ptr, + int in_dim, + int out_dim, + int num_infr_tokens, + int num_peft_tokens); void backward_kernel_wrapper(LinearMeta const *m, void const *input_ptr, void *input_grad_ptr, @@ -73,6 +93,16 @@ void forward_kernel(LinearMeta const *m, int batch_size, ffStream_t stream); template +void peft_bwd_kernel(LinearMeta const *m, + void *input_grad_ptr, + void *output_grad_ptr, + void const *kernel_ptr, + int in_dim, + int out_dim, + int num_infr_tokens, + int num_peft_tokens, + ffStream_t stream); +template void backward_kernel(LinearMeta const *m, void const *input_ptr, void *input_grad_ptr, @@ -85,6 +115,7 @@ void backward_kernel(LinearMeta const *m, int out_dim, int batch_size, ffStream_t stream); + template __global__ void build_one_ptr(DT *one_ptr, int batch_size); } // namespace Internal diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h new file mode 100644 index 0000000000..5360b5f8ea --- /dev/null +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -0,0 +1,77 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_LORA_LINEAR_KERNELS_H +#define _FLEXFLOW_OPS_KERNELS_LORA_LINEAR_KERNELS_H + +#include "flexflow/accessor.h" +#include "flexflow/device.h" +#include "flexflow/fftype.h" +#include "flexflow/op_meta.h" +#include "flexflow/ops/lora_linear.h" + +namespace FlexFlow { + +struct LoraLinearWeight { + // weights + void *w0_ptr, *w1_ptr; + // gradients + void *w0_grad_ptr, *w1_grad_ptr; + // v values for SGD optimizer (when using momentum) + void *w0_v_values_ptr, *w1_v_values_ptr; + int in_dim, out_dim, rank, num_shards; +}; + +struct LoraLinearModelState { + LoraLinearWeight weights; + LoraOptimizerConfig const *optimizer_config; + float lora_alpha; + std::string cache_folder; + // Huggingface model ID (for download and/or upload) + std::string peft_model_id; +}; + +class LoraLinearMeta : public OpMeta { +public: + LoraLinearMeta(FFHandler handle, LoraLinear const *li); + ~LoraLinearMeta(void); + // PEFT related fields + void *low_rank_activation; + void *input_activation; + std::unordered_map model_state; + size_t allocated_peft_buffer_size1 = 0, allocated_peft_buffer_size2 = 0; +}; + +namespace Kernels { +namespace LoraLinear { +void init_kernel_wrapper(LoraLinearMeta *m, int seed); +void inference_kernel_wrapper(LoraLinearMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); +void peft_bwd_kernel_wrapper(LoraLinearMeta *m, + 
BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad); + +namespace Internal { +template +void init_kernel(LoraLinearMeta *m, int seed, ffStream_t stream); +template +void inference_kernel(LoraLinearMeta *m, + BatchConfig const *bc, + DT const *input_ptr, + DT *output_ptr, + int in_dim, + int out_dim, + ffStream_t stream); +template +void peft_bwd_kernel(LoraLinearMeta *m, + BatchConfig const *bc, + DT *input_grad_ptr, + DT const *output_grad_ptr, + int in_dim, + int out_dim, + ffStream_t stream); +} // namespace Internal +} // namespace LoraLinear +} // namespace Kernels +} // namespace FlexFlow +#endif // _FLEXFLOW_OPS_KERNELS_LORA_LINEAR_KERNELS_H diff --git a/include/flexflow/ops/kernels/pool_2d_kernels.h b/include/flexflow/ops/kernels/pool_2d_kernels.h index 7f73a8295d..c5a954763e 100644 --- a/include/flexflow/ops/kernels/pool_2d_kernels.h +++ b/include/flexflow/ops/kernels/pool_2d_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Pool2D; + class Pool2DMeta : public OpMeta { public: - Pool2DMeta(FFHandler handle); + Pool2DMeta(FFHandler handle, Pool2D const *pool); ffTensorDescriptor_t inputTensor, outputTensor; ffActivationDescriptor_t actiDesc; ffPoolingDescriptor_t poolDesc; diff --git a/include/flexflow/ops/kernels/reshape_kernels.h b/include/flexflow/ops/kernels/reshape_kernels.h index e6c8c4d569..5b6fa5be19 100644 --- a/include/flexflow/ops/kernels/reshape_kernels.h +++ b/include/flexflow/ops/kernels/reshape_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Reshape; + class ReshapeMeta : public OpMeta { public: - ReshapeMeta(FFHandler handler); + ReshapeMeta(FFHandler handler, Reshape const *reshape); DataType data_type; }; @@ -44,4 +46,4 @@ void backward_kernel(T *input_grad_ptr, } // namespace Kernels } // namespace FlexFlow -#endif // _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H \ No newline at end of file +#endif // _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H diff --git a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h index 0eef4ca72b..fd4e0ecf1d 100644 --- a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_OPS_KERNELS_RESIDUAL_RMSNORM_KERNELS_H #include "flexflow/accessor.h" +#include "flexflow/batch_config.h" #include "flexflow/device.h" #include "flexflow/fftype.h" #include "flexflow/op_meta.h" @@ -31,13 +32,14 @@ class ResidualRMSNormMeta : public OpMeta { void *rms_ptr; void *norm_ptr; - float alpha; - float beta; - + bool inplace_residual; int in_dim; int batch_size; int num_elements; Realm::RegionInstance reserveInst; + // PEFT related fields + void *input_activation; + size_t allocated_peft_buffer_size = 0; }; namespace Kernels { @@ -48,6 +50,28 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &residual_output, GenericTensorAccessorW const &output); +void inference_kernel_wrapper(ResidualRMSNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input1, + GenericTensorAccessorR const &input2, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &residual_output, + GenericTensorAccessorW const &output); +void backward_kernel_wrapper( + ResidualRMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &residual_output_rms_input, + GenericTensorAccessorW const &residual_input0_grad, + 
GenericTensorAccessorW const &residual_input1_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &weight_grad); +void peft_bwd_kernel_wrapper(ResidualRMSNormMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad_0, + GenericTensorAccessorR const &output_grad_1, + GenericTensorAccessorW const &input_grad_0, + GenericTensorAccessorW const &input_grad_1, + GenericTensorAccessorR const &weight); } // namespace ResidualRMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/include/flexflow/ops/kernels/rms_norm_kernels.h b/include/flexflow/ops/kernels/rms_norm_kernels.h index 35c5aa69fa..475b6d94ed 100644 --- a/include/flexflow/ops/kernels/rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/rms_norm_kernels.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_OPS_KERNELS_RMSNORM_KERNELS_H #include "flexflow/accessor.h" +#include "flexflow/batch_config.h" #include "flexflow/device.h" #include "flexflow/fftype.h" #include "flexflow/op_meta.h" @@ -31,13 +32,13 @@ class RMSNormMeta : public OpMeta { void *rms_ptr; void *norm_ptr; - float alpha; - float beta; - int in_dim; int batch_size; int num_elements; Realm::RegionInstance reserveInst; + // PEFT related fields + void *input_activation; + size_t allocated_peft_buffer_size = 0; }; namespace Kernels { @@ -46,6 +47,22 @@ void forward_kernel_wrapper(RMSNormMeta const *m, GenericTensorAccessorR const &input, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &output); +void inference_kernel_wrapper(RMSNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &output); +void backward_kernel_wrapper(RMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &weight_grad); +void peft_bwd_kernel_wrapper(RMSNormMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight); } // namespace RMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/include/flexflow/ops/kernels/softmax_kernels.h b/include/flexflow/ops/kernels/softmax_kernels.h index 8cfaf3c586..0b7f1090f6 100644 --- a/include/flexflow/ops/kernels/softmax_kernels.h +++ b/include/flexflow/ops/kernels/softmax_kernels.h @@ -23,20 +23,30 @@ class SoftmaxMeta : public OpMeta { bool profiling; bool inference_debugging; int dim; - DataType input_type, output_type; }; namespace Kernels { namespace Softmax { -template + void forward_kernel_wrapper(SoftmaxMeta const *m, - DT const *input_ptr, - DT *output_ptr); -template + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + void backward_kernel_wrapper(SoftmaxMeta const *m, - DT *input_grad_ptr, - DT const *output_grad_ptr, - size_t num_elements); + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad); + +void inference_kernel_wrapper(SoftmaxMeta const *m, + BatchConfig const *bc, + bool is_last_op, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorW const &output_grad); + +void peft_bwd_kernel_wrapper(SoftmaxMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad); namespace Internal { template @@ -46,10 +56,28 @@ void 
forward_kernel(SoftmaxMeta const *m, ffStream_t stream); template -void backward_kernel(DT *input_grad_ptr, +void backward_kernel(SoftmaxMeta const *m, + DT *input_grad_ptr, DT const *output_grad_ptr, size_t num_elements, ffStream_t stream); + +template +void inference_kernel(SoftmaxMeta const *m, + BatchConfig const *bc, + DT const *input_ptr, + DT *output_ptr, + int num_classes, + ffStream_t stream); + +template +void peft_bwd_kernel(SoftmaxMeta const *m, + BatchConfig const *bc, + DT *input_grad_ptr, + DT const *output_grad_ptr, + int num_classes, + ffStream_t stream); + } // namespace Internal } // namespace Softmax } // namespace Kernels diff --git a/include/flexflow/ops/kernels/transpose_kernels.h b/include/flexflow/ops/kernels/transpose_kernels.h index 7ff6163b30..a2c8ff0483 100644 --- a/include/flexflow/ops/kernels/transpose_kernels.h +++ b/include/flexflow/ops/kernels/transpose_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Transpose; + class TransposeMeta : public OpMeta { public: - TransposeMeta(FFHandler handler) : OpMeta(handler){}; + TransposeMeta(FFHandler handler, Transpose const *transpose); int num_dim; int perm[MAX_TENSOR_DIM]; }; diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h index 9e48d81190..b5e9538ea6 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -37,6 +37,11 @@ class LayerNorm : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -67,6 +72,10 @@ class LayerNorm : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void backward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, @@ -81,11 +90,6 @@ class LayerNorm : public Op { T const *gamma_ptr, T const *beta_ptr, ffStream_t stream); - static void forward_kernel_wrapper(LayerNormMeta const *m, - GenericTensorAccessorR const &input, - GenericTensorAccessorW &output, - GenericTensorAccessorR const &gamma, - GenericTensorAccessorR const &beta); template static void backward_kernel(LayerNormMeta const *m, T const *output_grad_ptr, @@ -96,13 +100,34 @@ class LayerNorm : public Op { T *beta_grad_ptr, ffStream_t stream); template + static void peft_bwd_kernel(LayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T const *gamma_ptr, + ffStream_t stream); + + static void forward_kernel_wrapper(LayerNormMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta); static void backward_kernel_wrapper(LayerNormMeta const *m, - T const *output_grad_ptr, - T const *input_ptr, - T *input_grad_ptr, - T const *gamma_ptr, - T *gamma_grad_ptr, - T *beta_grad_ptr); + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad); + static void inference_kernel_wrapper(LayerNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW 
&output, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta); + static void peft_bwd_kernel_wrapper(LayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &gamma); public: bool elementwise_affine, use_bias; @@ -124,6 +149,9 @@ class LayerNormMeta : public OpMeta { float eps; void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; Realm::RegionInstance reserveInst; + // PEFT related fields + void *input_activation; + size_t allocated_peft_buffer_size = 0; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/linear.h b/include/flexflow/ops/linear.h index a32df80537..ed2fad580f 100644 --- a/include/flexflow/ops/linear.h +++ b/include/flexflow/ops/linear.h @@ -52,6 +52,11 @@ class Linear : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override; bool get_int_parameter(PMParameter, int *) const override; static Op * @@ -66,6 +71,10 @@ class Linear : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void forward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, diff --git a/include/flexflow/ops/lora_linear.h b/include/flexflow/ops/lora_linear.h new file mode 100644 index 0000000000..9e83c3f90e --- /dev/null +++ b/include/flexflow/ops/lora_linear.h @@ -0,0 +1,99 @@ +#ifndef _FLEXFLOW_LORA_LINEAR_FIRST_H +#define _FLEXFLOW_LORA_LINEAR_FIRST_H + +#include "flexflow/inference.h" +#include "flexflow/node.h" +#include "flexflow/operator.h" +#include "flexflow/ops/lora_linear_params.h" +#include "flexflow/utils/memory_allocator.h" + +namespace FlexFlow { + +class FFModel; +class Layer; + +class LoraLinear : public Op { +public: + using Params = LoraLinearParams; + using Input = std::pair; + + LoraLinear( + FFModel &model, + LayerID const &layer_guid, + OperatorType type, + ParallelTensor const input, + ParallelTensor const output, + std::unordered_map const &_peft_configs, + char const *name = nullptr); + LoraLinear(FFModel &model, + LoraLinear const &other, + ParallelTensor const input, + ParallelTensor const output); + LoraLinear(FFModel &model, + Params const ¶ms, + Input const &inputs, + char const *name = nullptr); + + void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void forward(FFModel const &) override; + void backward(FFModel const &) override; + Legion::FutureMap inference(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void print_layer(FFModel const &model) override; + void map_output_tensors(FFModel &model) override; + static Op * + create_operator_from_layer(FFModel &model, + Layer const *layer, + std::vector const &inputs); + static OpMeta *init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + 
Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void forward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + bool measure_operator_cost(Simulator *sim, + MachineView const &pc, + CostMetrics &cost_metrics) const override; + void serialize(Legion::Serializer &) const override; + static PCG::Node deserialize(FFModel &ff, + Legion::Deserializer &d, + ParallelTensor inputs[], + int num_inputs); + Op *materialize(FFModel &ff, + ParallelTensor inputs[], + int num_inputs) const override; + // size_t get_params_hash() const override; + LoraLinearParams get_params() const; + + std::unordered_map peft_configs; +}; + +}; // namespace FlexFlow + +#endif // _FLEXLOW_LORA_LINEAR_FIRST_H diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h new file mode 100644 index 0000000000..70539271f2 --- /dev/null +++ b/include/flexflow/ops/lora_linear_params.h @@ -0,0 +1,150 @@ +#ifndef _FLEXFLOW_LORA_LINEAR_PARAMS_H +#define _FLEXFLOW_LORA_LINEAR_PARAMS_H + +#include "flexflow/ffconst.h" +#include "flexflow/fftype.h" +#include "flexflow/inference.h" +#include "flexflow/op_meta.h" +#include "flexflow/operator.h" +#include "flexflow/parallel_tensor.h" +#include +#include +#include +#include + +namespace FlexFlow { + +class LoraOptimizerConfig { +public: + LoraOptimizerConfig(); + virtual ~LoraOptimizerConfig() {} +}; + +class LoraSGDOptimizerConfig : public LoraOptimizerConfig { +public: + LoraSGDOptimizerConfig(); + LoraSGDOptimizerConfig(double lr_, + double momentum_ = 0.0f, + bool nesterov_ = false, + bool weight_decay_ = 0.0f); + friend std::ostream &operator<<(std::ostream &os, + LoraSGDOptimizerConfig const &llc); + + NLOHMANN_DEFINE_TYPE_INTRUSIVE( + LoraSGDOptimizerConfig, lr, momentum, nesterov, weight_decay) + +public: + double lr = 0.001f; + double momentum = 0.0f; + bool nesterov = false; + double weight_decay = 0.0f; +}; + +class LoraAdamOptimizerConfig : public LoraOptimizerConfig { +public: + LoraAdamOptimizerConfig(); + LoraAdamOptimizerConfig(double alpha_, + double beta1_ = 0.9f, + double beta2_ = 0.999f, + double weight_decay_ = 0.0f, + double epsilon_ = 1e-8); + friend std::ostream &operator<<(std::ostream &os, + LoraAdamOptimizerConfig const &llc); + + NLOHMANN_DEFINE_TYPE_INTRUSIVE( + LoraAdamOptimizerConfig, alpha, beta1, beta2, weight_decay, epsilon) + +public: + // Adam + double alpha = 0.001f; + double beta1 = 0.9f; + double beta2 = 0.999f; + double weight_decay = 0.0f; + double epsilon = 1e-8; +}; + +// Serialization helpers +template +void serialize_to_json_file(T const &obj, fs::path const &filepath); + +// Function to deserialize JSON from file and create object +template +std::unique_ptr deserialize_from_json_file(fs::path const &filepath); + +class LoraLinearConfig { +public: + static const LoraLinearConfig EmptyConfig; + LoraLinearConfig(std::string const &cache_folder_, + std::string const &peft_model_id_, + bool trainable_ = false, + LoraOptimizerConfig *optimizer_config_ = nullptr, + bool init_lora_weights_ = false, + std::string const &base_model_name_or_path_ = "", + std::string const 
&precision_ = "fp16", + int rank_ = 8, + float lora_alpha_ = 8.0f, + float lora_dropout_ = 0.0f, + std::vector const &target_modules_ = {}); + // constructor used to support std::unordered_map + LoraLinearConfig(); + friend bool operator==(LoraLinearConfig const &lhs, + LoraLinearConfig const &rhs); + friend std::ostream &operator<<(std::ostream &os, + LoraLinearConfig const &llc); + + NLOHMANN_DEFINE_TYPE_INTRUSIVE(LoraLinearConfig, + cache_folder, + peft_model_id, + rank, + lora_alpha, + lora_dropout, + target_modules, + trainable, + init_lora_weights, + base_model_name_or_path, + precision) + + std::string cache_folder; + // Huggingface model ID (for download and/or upload) + std::string peft_model_id; + // Lora parameters + int rank; + float lora_alpha; + float lora_dropout; + std::vector target_modules; + // Training parameters + // whether the weights are trainable (fine-tuning scenario) or not + // (inference-only). If set to true, allocate space for the gradients + bool trainable = false; + LoraOptimizerConfig *optimizer_config; + // whether to initialize weights randomly (instead of attempting to load them + // from file) + bool init_lora_weights; + // parameters only used to upload model after finetuning + std::string base_model_name_or_path; + std::string precision; +}; + +class LoraLinearParams { +public: + LayerID layer_guid; + OperatorType type; + std::unordered_map peft_configs; + char name[MAX_OPNAME]; + + bool is_valid(std::pair const + &input_shape) const; + friend bool operator==(LoraLinearParams const &lhs, + LoraLinearParams const &rhs); +}; + +} // namespace FlexFlow + +namespace std { +template <> +struct hash { + size_t operator()(FlexFlow::LoraLinearParams const &) const; +}; +} // namespace std + +#endif // _FLEXFLOW_LORA_LINEAR_PARAMS_H diff --git a/include/flexflow/ops/residual_layer_norm.h b/include/flexflow/ops/residual_layer_norm.h index 0e9be82125..33a8e8be51 100644 --- a/include/flexflow/ops/residual_layer_norm.h +++ b/include/flexflow/ops/residual_layer_norm.h @@ -26,8 +26,10 @@ class ResidualLayerNorm : public Op { bool _elementwise_affine, bool _use_bias, float _eps, + bool inplace_residual, bool allocate_weights, char const *name); + void map_output_tensors(FFModel &ff) override; void init(FFModel const &) override; void init_inference(FFModel const &, std::vector const &, @@ -40,6 +42,11 @@ class ResidualLayerNorm : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -65,6 +72,14 @@ class ResidualLayerNorm : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; @@ -78,7 +93,8 @@ class ResidualLayerNorm : public Op { T const *gamma_ptr, T const *beta_ptr, ffStream_t stream); - static void inference_kernel_wrapper(ResidualLayerNormMeta const *m, + static void inference_kernel_wrapper(ResidualLayerNormMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input, 
GenericTensorAccessorR const &residual1, GenericTensorAccessorR const &residual2, @@ -86,11 +102,30 @@ class ResidualLayerNorm : public Op { GenericTensorAccessorW &output, GenericTensorAccessorR const &gamma, GenericTensorAccessorR const &beta); + static void + backward_kernel_wrapper(ResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &added_output, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorW const &residual1_grad, + GenericTensorAccessorW const &residual2_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad); + + static void + peft_bwd_kernel_wrapper(ResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorW const &residual1_grad, + GenericTensorAccessorW const &residual2_grad, + GenericTensorAccessorR const &gamma); public: bool elementwise_affine, use_bias, use_two_residuals; int64_t effective_batch_size, effective_num_elements; float eps; + bool inplace_residual; std::vector axes; }; @@ -105,8 +140,12 @@ class ResidualLayerNormMeta : public OpMeta { bool elementwise_affine, use_bias, use_two_residuals; int64_t effective_batch_size, effective_num_elements; float eps; + bool inplace_residual; void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; Realm::RegionInstance reserveInst; + // PEFT related fields + void *input_activation; + size_t allocated_peft_buffer_size = 0; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/residual_layer_norm_params.h b/include/flexflow/ops/residual_layer_norm_params.h index 949ae0c799..166d4b2b4e 100644 --- a/include/flexflow/ops/residual_layer_norm_params.h +++ b/include/flexflow/ops/residual_layer_norm_params.h @@ -13,6 +13,7 @@ struct ResidualLayerNormParams { float eps; bool use_bias; bool use_two_residuals; + bool inplace_residual; char name[MAX_OPNAME]; bool is_valid(std::tuple const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -74,6 +81,14 @@ class ResidualRMSNorm : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; @@ -82,6 +97,7 @@ class ResidualRMSNorm : public Op { float eps; int effective_batch_size; int dim, data_dim; + bool inplace_residual; }; } // namespace FlexFlow #endif // _FLEXFLOW_RESIDUAL_RMS_NORM_H diff --git a/include/flexflow/ops/residual_rms_norm_params.h b/include/flexflow/ops/residual_rms_norm_params.h index a4e4de59ab..8b8f666dc1 100644 --- a/include/flexflow/ops/residual_rms_norm_params.h +++ b/include/flexflow/ops/residual_rms_norm_params.h @@ -11,6 +11,7 @@ struct ResidualRMSNormParams { LayerID layer_guid; float eps; int dim; + bool inplace_residual; char name[MAX_OPNAME]; bool is_valid( std::pair const &input) const; diff --git a/include/flexflow/ops/rms_norm.h b/include/flexflow/ops/rms_norm.h index 
1dc940ebd3..384404d8a0 100644 --- a/include/flexflow/ops/rms_norm.h +++ b/include/flexflow/ops/rms_norm.h @@ -34,6 +34,11 @@ class RMSNorm : public Op { void init(FFModel const &) override; void forward(FFModel const &) override; void backward(FFModel const &) override; + Legion::FutureMap peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) override; void init_inference(FFModel const &, std::vector const &, std::vector const &, @@ -73,6 +78,14 @@ class RMSNorm : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; diff --git a/include/flexflow/ops/sigmoid_silu_multi.h b/include/flexflow/ops/sigmoid_silu_multi.h index 604438260a..ac60ff15dd 100644 --- a/include/flexflow/ops/sigmoid_silu_multi.h +++ b/include/flexflow/ops/sigmoid_silu_multi.h @@ -1,5 +1,6 @@ #pragma once +#include "flexflow/batch_config.h" #include "flexflow/inference.h" #include "flexflow/model.h" #include "flexflow/utils/memory_allocator.h" @@ -27,6 +28,11 @@ class SigmoidSiluMulti : public Op { MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void backward(FFModel const &) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; Legion::FutureMap inference(FFModel const &, BatchConfigFuture const &, std::vector const &, @@ -55,6 +61,14 @@ class SigmoidSiluMulti : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; @@ -65,10 +79,24 @@ class SigmoidSiluMulti : public Op { T const *input2_ptr, T *output_ptr, ffStream_t stream); - static void inference_kernel_wrapper(SigmoidSiluMultiMeta const *m, + static void inference_kernel_wrapper(SigmoidSiluMultiMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input1, GenericTensorAccessorR const &input2, GenericTensorAccessorW const &output); + static void + backward_kernel_wrapper(SigmoidSiluMultiMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input1, + GenericTensorAccessorR const &input2, + GenericTensorAccessorW const &input1_grad, + GenericTensorAccessorW const &input2_grad); + static void + peft_bwd_kernel_wrapper(SigmoidSiluMultiMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input1_grad, + GenericTensorAccessorW const &input2_grad); }; class SigmoidSiluMultiMeta : public OpMeta { @@ -80,6 +108,9 @@ class SigmoidSiluMultiMeta : public OpMeta { public: Realm::RegionInstance reserveInst; + // PEFT related fields + void *input_activation; + size_t allocated_peft_buffer_size = 0; }; }; // namespace FlexFlow 
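A pattern worth noting across these headers: each op's meta gains an `input_activation` pointer and an `allocated_peft_buffer_size` counter, and each op gains a `peft_bwd` entry point. During the inference pass the operator stashes the activations that a later PEFT backward pass will need; `peft_bwd` then reads that cached copy instead of the live forward tensors. The sketch below only illustrates this caching idiom; `ToyOpMeta` and `cache_peft_activations` are made-up names, and in the actual kernels the buffer size comes from the `BatchConfig` while the allocation presumably goes through FlexFlow's own allocators rather than raw `cudaMalloc`.

```cpp
#include <cstddef>
#include <cuda_runtime.h>

// Illustrative stand-ins for the fields this patch adds to each OpMeta.
struct ToyOpMeta {
  void *input_activation = nullptr;
  size_t allocated_peft_buffer_size = 0;
};

// Inference-time half of the idiom: if the batch contains a PEFT request
// whose backward pass runs later, grow the cache if needed and stash a copy
// of this operator's input activations.
void cache_peft_activations(ToyOpMeta *m,
                            void const *input,   // device pointer
                            size_t peft_tokens,  // tokens of the PEFT request
                            size_t hidden_dim,
                            size_t dtype_size,
                            cudaStream_t stream) {
  size_t needed = peft_tokens * hidden_dim * dtype_size;
  if (needed > m->allocated_peft_buffer_size) {
    if (m->input_activation != nullptr) {
      cudaFree(m->input_activation);
    }
    cudaMalloc(&m->input_activation, needed);
    m->allocated_peft_buffer_size = needed;
  }
  cudaMemcpyAsync(m->input_activation, input, needed,
                  cudaMemcpyDeviceToDevice, stream);
}
```

The matching `peft_bwd_kernel_wrapper` can then read `m->input_activation` when computing input gradients, even though the forward tensors have since been overwritten by other requests in the batch.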
diff --git a/include/flexflow/ops/softmax.h b/include/flexflow/ops/softmax.h index 61094f7361..82aff53766 100644 --- a/include/flexflow/ops/softmax.h +++ b/include/flexflow/ops/softmax.h @@ -33,6 +33,11 @@ class Softmax : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void backward(FFModel const &) override; bool get_int_parameter(PMParameter, int *) const override; void print_layer(FFModel const &model) override { @@ -58,6 +63,10 @@ class Softmax : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; diff --git a/include/flexflow/ops/topk.h b/include/flexflow/ops/topk.h index 47144bf6d7..4b67692032 100644 --- a/include/flexflow/ops/topk.h +++ b/include/flexflow/ops/topk.h @@ -8,9 +8,11 @@ namespace FlexFlow { +class TopK; + class TopKMeta : public OpMeta { public: - TopKMeta(FFHandler handle); + TopKMeta(FFHandler handle, TopK const *topk); bool sorted; }; diff --git a/include/flexflow/ops/transpose.h b/include/flexflow/ops/transpose.h index 3e6fb575c0..bca0b83460 100644 --- a/include/flexflow/ops/transpose.h +++ b/include/flexflow/ops/transpose.h @@ -6,6 +6,8 @@ namespace FlexFlow { +class TransposeMeta; + class Transpose : public Op { public: using Params = TransposeParams; diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h index 02df0c0137..168ad5f618 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h @@ -144,7 +144,7 @@ class TreeIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { ~TreeIncMultiHeadSelfAttentionMeta(void); public: - int num_active_tokens; + int num_active_infr_tokens; Realm::RegionInstance committed_token_reserve_inst; TreeVerifyBatchConfig::CommittedTokensInfo *committed_token_infos; bool *request_completed; diff --git a/include/flexflow/parallel_ops/allreduce.h b/include/flexflow/parallel_ops/allreduce.h index 045f9b36a0..7e0e4362e2 100644 --- a/include/flexflow/parallel_ops/allreduce.h +++ b/include/flexflow/parallel_ops/allreduce.h @@ -34,12 +34,17 @@ class AllReduce : public ParallelOp { std::vector const &, MachineView const *mv = nullptr) override; void forward(FFModel const &) override; + void backward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, BatchConfigFuture const &bc, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; - void backward(FFModel const &) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &bc, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; bool get_int_parameter(PMParameter, int *) const override; bool append_parallel_op_info( std::vector ¶llel_ops) const override; @@ -47,10 +52,6 @@ class AllReduce : public ParallelOp { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); - static void inference_task(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); static void 
forward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, @@ -59,6 +60,14 @@ class AllReduce : public ParallelOp { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; diff --git a/include/flexflow/parallel_ops/combine.h b/include/flexflow/parallel_ops/combine.h index 2e4fdb86a9..1db776f59d 100644 --- a/include/flexflow/parallel_ops/combine.h +++ b/include/flexflow/parallel_ops/combine.h @@ -40,6 +40,11 @@ class Combine : public ParallelOp { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &bc, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void backward(FFModel const &) override; bool get_int_parameter(PMParameter, int *) const override; bool append_parallel_op_info( @@ -52,10 +57,18 @@ class Combine : public ParallelOp { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void backward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); template static void forward_task_with_type(Legion::Task const *task, diff --git a/include/flexflow/parallel_ops/kernels/allreduce_kernels.h b/include/flexflow/parallel_ops/kernels/allreduce_kernels.h index bdf7aae501..a4ccbee8a5 100644 --- a/include/flexflow/parallel_ops/kernels/allreduce_kernels.h +++ b/include/flexflow/parallel_ops/kernels/allreduce_kernels.h @@ -17,11 +17,6 @@ class AllReduceMeta : public OpMeta { namespace Kernels { namespace AllReduce { -void inference_kernel_wrapper(AllReduceMeta const *m, - BatchConfig const *bc, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output); - void forward_kernel_wrapper(AllReduceMeta const *m, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); @@ -30,6 +25,15 @@ void backward_kernel_wrapper(AllReduceMeta const *m, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output_grad); +void inference_kernel_wrapper(AllReduceMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void peft_bwd_kernel_wrapper(AllReduceMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad); } // namespace AllReduce } // namespace Kernels } // namespace FlexFlow diff --git a/include/flexflow/parallel_ops/kernels/combine_kernels.h b/include/flexflow/parallel_ops/kernels/combine_kernels.h index 456013cd81..4b2227b178 100644 --- a/include/flexflow/parallel_ops/kernels/combine_kernels.h +++ b/include/flexflow/parallel_ops/kernels/combine_kernels.h @@ -8,9 +8,11 @@ namespace FlexFlow { +class Combine; + class CombineMeta : public OpMeta { public: - CombineMeta(FFHandler handle); + CombineMeta(FFHandler handle, Combine const 
*comb); DataType data_type; }; diff --git a/include/flexflow/parallel_ops/kernels/parallel_identity_kernels.h b/include/flexflow/parallel_ops/kernels/parallel_identity_kernels.h new file mode 100644 index 0000000000..fd6778a37f --- /dev/null +++ b/include/flexflow/parallel_ops/kernels/parallel_identity_kernels.h @@ -0,0 +1,41 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_PARALLEL_IDENTITY_KERNELS_H +#define _FLEXFLOW_OPS_KERNELS_PARALLEL_IDENTITY_KERNELS_H + +#include "flexflow/batch_config.h" +#include "flexflow/device.h" +#include "flexflow/fftype.h" +#include "flexflow/op_meta.h" +#include "flexflow/parallel_ops/parallel_identity.h" + +namespace FlexFlow { + +class ParallelIdentityMeta : public OpMeta { +public: + ParallelIdentityMeta(FFHandler handle, ParallelIdentity const *reduct); +}; + +namespace Kernels { +namespace ParallelIdentity { + +void forward_kernel_wrapper(ParallelIdentityMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void backward_kernel_wrapper(ParallelIdentityMeta const *m, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad); + +void inference_kernel_wrapper(ParallelIdentityMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void peft_bwd_kernel_wrapper(ParallelIdentityMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad); +} // namespace ParallelIdentity +} // namespace Kernels +} // namespace FlexFlow + +#endif // _FLEXFLOW_OPS_KERNELS_PARALLEL_IDENTITY_KERNELS_H diff --git a/include/flexflow/parallel_ops/kernels/partition_kernels.h b/include/flexflow/parallel_ops/kernels/partition_kernels.h index 81b190603a..1e77090d11 100644 --- a/include/flexflow/parallel_ops/kernels/partition_kernels.h +++ b/include/flexflow/parallel_ops/kernels/partition_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Repartition; + class RepartitionMeta : public OpMeta { public: - RepartitionMeta(FFHandler handle); + RepartitionMeta(FFHandler handle, Repartition const *repart); DataType data_type; }; diff --git a/include/flexflow/parallel_ops/parallel_identity.h b/include/flexflow/parallel_ops/parallel_identity.h new file mode 100644 index 0000000000..b3ca789f08 --- /dev/null +++ b/include/flexflow/parallel_ops/parallel_identity.h @@ -0,0 +1,83 @@ +#ifndef _FLEXFLOW_PARALLEL_IDENTITY_H +#define _FLEXFLOW_PARALLEL_IDENTITY_H + +#include "flexflow/layer.h" +#include "flexflow/node.h" +#include "flexflow/op_meta.h" +#include "flexflow/operator.h" +#include "flexflow/parallel_ops/parallel_identity_params.h" +#include "parallel_op.h" + +namespace FlexFlow { + +class ParallelIdentity : public ParallelOp { +public: + using Params = ParallelIdentityParams; + using Input = ParallelTensor; + + ParallelIdentity(FFModel &model, + const ParallelTensor input, + int parallel_identity_legion_dim, + char const *name = NULL); + ParallelIdentity(FFModel &model, + Params const ¶ms, + Input const input, + char const *name = nullptr); + void create_input_partition(FFModel &model) override; + void create_input_partition_inference( + FFModel &model, + std::vector const &batch_inputs, + std::vector const &batch_outputs) override; + void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void forward(FFModel const &) override; + void backward(FFModel const &) override; + 
Legion::FutureMap inference(FFModel const &, + BatchConfigFuture const &bc, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &bc, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + bool get_int_parameter(PMParameter, int *) const override; + bool append_parallel_op_info( + std::vector ¶llel_ops) const override; + static OpMeta *init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void forward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + bool measure_operator_cost(Simulator *sim, + MachineView const &pc, + CostMetrics &cost_metrics) const override; + + Params get_params() const; + +public: + int parallel_identity_dim; +}; + +}; // namespace FlexFlow + +#endif // _FLEXFLOW_PARALLEL_IDENTITY_H diff --git a/include/flexflow/parallel_ops/parallel_identity_params.h b/include/flexflow/parallel_ops/parallel_identity_params.h new file mode 100644 index 0000000000..6eeed662ec --- /dev/null +++ b/include/flexflow/parallel_ops/parallel_identity_params.h @@ -0,0 +1,22 @@ +#ifndef _FLEXFLOW_PARALLEL_IDENTITY_PARAMS_H +#define _FLEXFLOW_PARALLEL_IDENTITY_PARAMS_H + +namespace FlexFlow { + +struct ParallelIdentityParams { + int parallel_identity_legion_dim; + char name[MAX_OPNAME]; + bool is_valid(ParallelTensorShape const &) const; +}; +bool operator==(ParallelIdentityParams const &, ParallelIdentityParams const &); + +} // namespace FlexFlow + +namespace std { +template <> +struct hash { + size_t operator()(FlexFlow::ParallelIdentityParams const &) const; +}; +} // namespace std + +#endif // _FLEXFLOW_PARALLEL_IDENTITY_PARAMS_H diff --git a/include/flexflow/parallel_ops/parallel_op.h b/include/flexflow/parallel_ops/parallel_op.h index 0bf573996c..39324c2a51 100644 --- a/include/flexflow/parallel_ops/parallel_op.h +++ b/include/flexflow/parallel_ops/parallel_op.h @@ -41,7 +41,7 @@ class ParallelOp : public Op { public: Legion::LogicalPartition input_lp, output_grad_lp; std::unordered_map - inference_input_lps; + inference_input_lps, inference_output_grad_lps; }; }; // namespace FlexFlow diff --git a/include/flexflow/parallel_ops/replicate.h b/include/flexflow/parallel_ops/replicate.h index 65d69d8564..c27616634f 100644 --- a/include/flexflow/parallel_ops/replicate.h +++ b/include/flexflow/parallel_ops/replicate.h @@ -54,10 +54,19 @@ class Replicate : public ParallelOp { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &bc, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; static void backward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void forward_kernel_wrapper(ReplicateMeta const *m, 
GenericTensorAccessorR const &input, GenericTensorAccessorW const &output, diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index a38a3b2671..f0fab957ee 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -39,6 +39,7 @@ class InferenceManager { Legion::FutureMap inference(FFModel *model, int index, BatchConfig const &bc); Legion::FutureMap inference(FFModel *model, int index, BatchConfigFuture const &bc); + void peft_bwd(FFModel *model, int index, BatchConfigFuture const &bc); void load_input_tokens_from_batch_config(FFModel *model, BatchConfigFuture const &bc, ParallelTensor const input, @@ -65,15 +66,34 @@ struct Request { FINISHING = 104, // finishing request, but not yet verified }; BatchConfig::RequestGuid guid; - int max_sequence_length; + PEFTModelID peft_model_id = PEFTModelID::NO_ID; + int max_sequence_length = 128; int initial_len; int ssm_cache_size = 0; int llm_cache_size = 0; Status status = PENDING; std::vector tokens; - + std::string prompt; std::vector beam_trees; + // PEFT field + RequestType req_type = REQ_INFERENCE; + size_t processed_finetuning_tokens = 0; + int completed_training_steps = 0; + int dataset_entry_processed_tokens = 0; + int max_training_steps = 1; + // how many gradient accumulation steps to do before updating the weights. if + // left as -1, it will be set to the number of entries in the dataset + int gradient_accumulation_steps = -1; + int benchmarking_tokens = -1; + std::vector finetuning_tokens_per_batch; + bool warmup = false; + std::string dataset_filepath; + std::vector, + std::vector>> + dataset; + std::vector finetuning_losses; + friend std::ostream &operator<<(std::ostream &os, Request const &req); }; // store the result of beam search @@ -120,6 +140,8 @@ class RequestManager { void set_max_sequence_length(int max_seq_length); void push_spec_infer_tree_width(int tree_width); int get_max_sequence_length(); + void set_enable_peft_finetuning(bool enable_peft_finetuning_); + static void set_inference_finished(bool finished = true); int register_ssm_model(FFModel *model); void register_tokenizer(ModelType model_type, int bos_token_id, @@ -143,10 +165,9 @@ class RequestManager { void serve_incr_decoding(FFModel *model); void serve_spec_infer(FFModel *model); GenerationResult get_generation_result(RequestGuid const &guid); - RequestGuid register_new_request(std::string const &prompt, - int max_sequence_length); - RequestGuid register_new_request(std::vector const &prompt, - int max_sequence_length); + RequestGuid register_new_request(Request const &request_); + RequestGuid register_new_peft_request(Request const &request_); + // Methods to start and terminate request manager's background task void start_background_server(FFModel *model); bool is_background_server_terminated(); @@ -156,6 +177,8 @@ class RequestManager { bool is_request_completed(RequestGuid const &guid); void trigger_request_completion_future(RequestGuid const &guid); // Methods for preparing next batches + bool check_inf_req_completion(BatchConfig const &old_bc, int i); + void check_batch(BatchConfig const &old_bc, BatchConfig const &new_bc); BatchConfig prepare_next_batch(BatchConfig const &bc, InferenceResult const &result); BatchConfigFuture prepare_next_batch(BatchConfigFuture const &bc, @@ -265,6 +288,10 @@ class RequestManager { int max_sequence_length; Status request_manager_status; + // peft benchmarking + bool enable_peft_finetuning = false; + static bool inference_finished; + // tree width in each 
speculative step, if not specified 1 std::vector spec_infer_tree_width; @@ -275,7 +302,8 @@ class RequestManager { int bos_token_id; int eos_token_id; std::string output_filepath; - std::queue pending_request_queue; + std::queue pending_infr_request_queue; + std::queue pending_peft_request_queue; std::unordered_map all_requests; std::unordered_map request_generation_results; std::mutex request_queue_mutex; @@ -304,6 +332,8 @@ class RequestManager { int llm_decoding_steps; int ssm_decoding_steps; double start_time, finish_time; + double registration_time, first_token_time; + bool first_token_time_set = false; }; std::unordered_map profiling_requests; double total_request_run_time; diff --git a/include/flexflow/simulator.h b/include/flexflow/simulator.h index e410f66325..6cda96aa8b 100644 --- a/include/flexflow/simulator.h +++ b/include/flexflow/simulator.h @@ -33,21 +33,21 @@ namespace FlexFlow { #define MOD(a, b) ((a) % (b)) < 0 ? ((a) % (b)) + (b) : ((a) % (b)) -class Conv2DMeta; -class LinearMeta; -class Pool2DMeta; -class ElementUnaryMeta; -class ElementBinaryMeta; -class LayerNormMeta; -// class EmbeddingMeta; -// class SoftmaxMeta; -class BatchMatmulMeta; -// class BatchNormMeta; -class ConcatMeta; -// class DropoutMeta; -class TransposeMeta; -class Op; -class FFModel; +// class Conv2DMeta; +// class LinearMeta; +// class Pool2DMeta; +// class ElementUnaryMeta; +// class ElementBinaryMeta; +// class LayerNormMeta; +// class EmbeddingMeta; +// class SoftmaxMeta; +// class BatchMatmulMeta; +// class BatchNormMeta; +// class ConcatMeta; +// class DropoutMeta; +// class TransposeMeta; +// class Op; +// class FFModel; /** * @brief Costs of an operator. @@ -751,19 +751,19 @@ class Simulator { strict_hash_to_operator_cost; public: - Conv2DMeta *conv2d_meta; - LinearMeta *linear_meta; - Pool2DMeta *pool2d_meta; - ElementUnaryMeta *ele_unary_meta; - LayerNormMeta *layernorm_meta; - // ElementBinaryMeta *ele_binary_meta; - // EmbeddingMeta *embedding_meta; - // SoftmaxMeta *softmax_meta; - BatchMatmulMeta *batch_matmul_meta; - // BatchNormMeta *batch_norm_meta; - ConcatMeta *concat_meta; - // DropoutMeta *dropout_meta; - TransposeMeta *transpose_meta; + // Conv2DMeta *conv2d_meta; + // LinearMeta *linear_meta; + // Pool2DMeta *pool2d_meta; + // ElementUnaryMeta *ele_unary_meta; + // LayerNormMeta *layernorm_meta; + // ElementBinaryMeta *ele_binary_meta; + // EmbeddingMeta *embedding_meta; + // SoftmaxMeta *softmax_meta; + // BatchMatmulMeta *batch_matmul_meta; + // BatchNormMeta *batch_norm_meta; + // ConcatMeta *concat_meta; + // DropoutMeta *dropout_meta; + // TransposeMeta *transpose_meta; int segment_size; int max_num_segments; // simulation could be slow if the number of segments // are too large diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index f8bf67b3e1..486a65eb3d 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ -75,8 +75,8 @@ inline int GET_BLOCKS(int const N) { return (ret > BLOCK_SIZE_LIMIT) ? 
BLOCK_SIZE_LIMIT : ret; } -__global__ void - scale_kernel(float *ptr, Legion::coord_t size, float a, float b); +template +__global__ void scale_kernel(DT *ptr, Legion::coord_t size, DT a, DT b); __global__ void ones_kernel(float *ptr, Legion::coord_t size); @@ -156,10 +156,13 @@ template void save_tensor(T const *ptr, size_t num_elements, char const *file_name); template -T *download_tensor(T const *ptr, size_t num_elements); +T *copy_tensor_dev_to_host(T const *ptr, size_t num_elements); + +template +void copy_tensor_dev_to_host(T const *ptr, T *dst, size_t num_elements); template -bool download_tensor(T const *ptr, T *dst, size_t num_elements); +void copy_tensor_host_to_dev(T *dst, T const *src, size_t num_elements); cudnnStatus_t cudnnSetTensorDescriptorFromDomain(cudnnTensorDescriptor_t tensor, Legion::Domain domain, @@ -179,3 +182,5 @@ ncclDataType_t ff_to_nccl_datatype(DataType type); cudaDataType_t cudnn_to_cuda_datatype(cudnnDataType_t type); cudnnDataType_t cuda_to_cudnn_datatype(cudaDataType_t type); #endif +void check_device_vs_host_ptr(void const *maybe_devicePtr); +void check_ptr_alignment(void const *ptr); diff --git a/include/flexflow/utils/hip_helper.h b/include/flexflow/utils/hip_helper.h index 5d3c831d4f..805cc46b4c 100644 --- a/include/flexflow/utils/hip_helper.h +++ b/include/flexflow/utils/hip_helper.h @@ -75,8 +75,8 @@ inline int GET_BLOCKS(int const N) { return (ret > BLOCK_SIZE_LIMIT) ? BLOCK_SIZE_LIMIT : ret; } -__global__ void - scale_kernel(float *ptr, Legion::coord_t size, float a, float b); +template +__global__ void scale_kernel(DT *ptr, Legion::coord_t size, DT a, DT b); __global__ void ones_kernel(float *ptr, Legion::coord_t size); @@ -86,6 +86,12 @@ __global__ void assign_kernel(DT *ptr, Legion::coord_t size, DT value); template __global__ void copy_kernel(DT *dst, const DT *src, Legion::coord_t size); +template +__global__ void copy_kernel_discrete(DT *dst, + const DT *src, + Legion::coord_t size, + size_t *index); + template __global__ void add_kernel(T *data_ptr, T const *grad_ptr, size_t size); @@ -135,16 +141,28 @@ __host__ void updateGAS(float *para_ptr, float learning_rate); template -void print_tensor(T const *ptr, size_t num_elements, char const *prefix); +void print_tensor(T const *ptr, + size_t num_elements, + char const *prefix, + int shard_id = 0); +template +void print_beam_tensor(T const *ptr, + size_t num_elements, + int skip, + int channel, + char const *prefix); template void save_tensor(T const *ptr, size_t num_elements, char const *file_name); template -T *download_tensor(T const *ptr, size_t num_elements); +T *copy_tensor_dev_to_host(T const *ptr, size_t num_elements); + +template +void copy_tensor_dev_to_host(T const *ptr, T *dst, size_t num_elements); template -bool download_tensor(T const *ptr, T *dst, size_t num_elements); +void copy_tensor_host_to_dev(T *dst, T const *src, size_t num_elements); miopenStatus_t cudnnSetTensorDescriptorFromDomain(miopenTensorDescriptor_t tensor, @@ -153,7 +171,8 @@ miopenStatus_t miopenStatus_t cudnnSetTensorDescriptorFromDomain4SoftMax(miopenTensorDescriptor_t tensor, - Legion::Domain domain); + Legion::Domain domain, + DataType data_type = DT_FLOAT); hipblasDatatype_t ff_to_cuda_datatype(DataType type); @@ -164,3 +183,5 @@ ncclDataType_t ff_to_nccl_datatype(DataType type); void handle_unimplemented_hip_kernel(OperatorType op_type); #endif +void check_device_vs_host_ptr(void const *maybe_devicePtr); +void check_ptr_alignment(void const *ptr); diff --git a/include/flexflow/utils/memory_allocator.h 
b/include/flexflow/utils/memory_allocator.h
index 7091b159b2..fad7630770 100644
--- a/include/flexflow/utils/memory_allocator.h
+++ b/include/flexflow/utils/memory_allocator.h
@@ -54,6 +54,11 @@ class MemoryAllocator {
     return static_cast<DT *>
(ptr); } + inline void free_all() { + reserved_allocated_size = 0; + instance_allocated_size = 0; + } + public: Legion::Memory memory; void *reserved_ptr; diff --git a/include/flexflow/utils/peft_weight_allocator.h b/include/flexflow/utils/peft_weight_allocator.h new file mode 100644 index 0000000000..dae46a8af1 --- /dev/null +++ b/include/flexflow/utils/peft_weight_allocator.h @@ -0,0 +1,92 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _FLEXFLOW_UTILS_PEFT_WEIGHT_ALLOCATOR_H_ +#define _FLEXFLOW_UTILS_PEFT_WEIGHT_ALLOCATOR_H_ + +#include "flexflow/config.h" +#include + +namespace FlexFlow { + +class PEFTWeightAllocator { +public: + PEFTWeightAllocator(void *_base_ptr, size_t _total_size) + : base_ptr(_base_ptr), total_size(_total_size), sync_offset(0), + local_offset(_total_size) {} + + inline void *allocate_sync_weights_untyped(PEFTModelID const &peft_model_id, + size_t datalen) { + const std::lock_guard lock(peft_weight_allocator_mutex); + void *ptr = static_cast(base_ptr) + sync_offset; + off_t model_sync_weights_offset = sync_offset; + size_t model_sync_weights_size = datalen; + if (sync_weights.find(peft_model_id) != sync_weights.end()) { + // Assert that sync weights for each PEFT model is consecutive + std::pair offset_and_size = sync_weights[peft_model_id]; + assert(sync_offset == offset_and_size.first + offset_and_size.second); + model_sync_weights_offset = offset_and_size.first; + model_sync_weights_size = offset_and_size.second + datalen; + } + sync_offset += datalen; + assert(sync_offset < local_offset); + sync_weights[peft_model_id] = + std::make_pair(model_sync_weights_offset, model_sync_weights_size); + return ptr; + } + + std::pair + get_sync_weights_ptr_and_size(PEFTModelID const &peft_model_id) { + const std::lock_guard lock(peft_weight_allocator_mutex); + assert(sync_weights.find(peft_model_id) != sync_weights.end()); + std::pair offset_and_size = sync_weights[peft_model_id]; + return std::make_pair(static_cast(base_ptr) + offset_and_size.first, + offset_and_size.second); + } + + inline void *allocate_local_weights_untyped(PEFTModelID const &peft_model_id, + size_t datalen) { + const std::lock_guard lock(peft_weight_allocator_mutex); + local_offset -= datalen; + assert(sync_offset < local_offset); + void *ptr = static_cast(base_ptr) + local_offset; + return ptr; + } + + template + inline DT *allocate_sync_weights(PEFTModelID const &peft_model_id, + size_t count) { + return static_cast
<DT *>(
+        allocate_sync_weights_untyped(peft_model_id, sizeof(DT) * count));
+  }
+
+  template <typename DT>
+  inline DT *allocate_local_weights(PEFTModelID const &peft_model_id,
+                                    size_t count) {
+    return static_cast<DT *>
( + allocate_local_weights_untyped(peft_model_id, sizeof(DT) * count)); + } + +public: + void *base_ptr; + size_t total_size; + off_t sync_offset, local_offset; + std::unordered_map> sync_weights; + std::mutex peft_weight_allocator_mutex; +}; + +}; // namespace FlexFlow + +#endif // _FLEXFLOW_UTILS_PEFT_WEIGHT_ALLOCATOR_H_ diff --git a/inference/MODEL_WEIGHTS.md b/inference/MODEL_WEIGHTS.md deleted file mode 100644 index d78fb37be9..0000000000 --- a/inference/MODEL_WEIGHTS.md +++ /dev/null @@ -1,28 +0,0 @@ -To convert the weights of a HuggingFace LLM to SpecInfer's weight format, we first load the model and modify the tensor names to match SpecInfer's convention, and then convert these tensors to numpy arrays to store them in binary files. - -```python -from transformers import AutoModelForCausalLM -model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf") - -for name, params in model.named_parameters(): - for name, params in model.named_parameters(): - name = ( - name.replace(".", "_") - .replace("self_attn", "attention") - .replace("q_proj", "wq") - .replace("k_proj", "wk") - .replace("v_proj", "wv") - .replace("o_proj", "wo") - .replace("mlp", "feed_forward") - .replace("gate_proj", "w1") - .replace("down_proj", "w2") - .replace("up_proj", "w3") - .replace("input_layernorm", "attention_norm") - .replace("post_attention_layernorm", "ffn_norm") - .replace("embed_tokens", "tok_embeddings") - .replace("lm_head", "output") - .replace("model_", "") - ) - params.detach().cpu().numpy().tofile('weights/llama_7B_weights/' + name) -``` - diff --git a/inference/README.md b/inference/README.md new file mode 100644 index 0000000000..14c94e22ac --- /dev/null +++ b/inference/README.md @@ -0,0 +1,42 @@ +# Inference Examples +This folder contains the code to run inference examples in FlexFlow + +To create a sample prompt, call (from the `build` folder): + +```bash +mkdir -p ../inference/prompt +echo '["San Francisco is a "]' > ../inference/prompt/test.json +``` + +To download a model for use in C++, call: +```bash +huggingface-cli login # if needed +python ../inference/utils/download_hf_model.py meta-llama/Llama-2-7b-hf --half-precision-only +``` + +To run the incremental decoding example in C++, call: + +```bash +./inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../inference/prompt/test.json -tensor-parallelism-degree 4 +``` + +To run the speculative inference example in C++, call: + +```bash +./inference/spec_infer/spec_infer -ll:cpu 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../inference/prompt/test.json -tensor-parallelism-degree 4 +``` + +To run a PEFT model example in C++, call: + +```bash +./inference/peft/peft \ + -ll:gpu 4 -ll:cpu 4 -ll:util 4 \ + -tensor-parallelism-degree 4 \ + -ll:fsize 8192 -ll:zsize 12000 \ + -llm-model JackFram/llama-160m \ + -finetuning-dataset ../inference/prompt/peft_dataset.json \ + -peft-model goliaro/llama-160m-lora \ + -enable-peft \ + --use-full-precision \ + --inference-debugging +``` \ No newline at end of file diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index ec3dda3158..c9ffff5c07 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -264,15 +264,18 @@ void FlexFlow::top_level_task(Task const *task, /*parser_callback_t */ nullptr, /*allow_exceptions */ true, 
/*ignore_comments */ true); - std::vector prompts; + + std::vector requests; for (auto &prompt : prompt_json) { std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); + Request inference_req; + inference_req.prompt = text; + inference_req.max_sequence_length = 128; + requests.push_back(inference_req); total_num_requests++; - prompts.push_back(text); } - std::vector result = - model.generate(prompts, 128 /*max_sequence_length*/); + std::vector result = model.generate(requests); } // terminate the request manager by stopping the background thread diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index a529411ddb..195d6ba7e3 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -76,7 +76,7 @@ void FALCON::create_falcon_model(FFModel &ff, falcon_config.layer_norm_epsilon, true, DT_NONE, - std::string("layers_" + std::to_string(i) + "_input_layernorm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); } else { ff.residual_layer_norm( @@ -89,8 +89,9 @@ void FALCON::create_falcon_model(FFModel &ff, true, falcon_config.layer_norm_epsilon, true, + false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_input_layernorm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); token = res_ln_outputs[0]; att_norm = res_ln_outputs[1]; @@ -116,7 +117,7 @@ void FALCON::create_falcon_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attention") .c_str() /*name*/ ); break; @@ -141,7 +142,7 @@ void FALCON::create_falcon_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attention") .c_str() /*name*/ ); break; @@ -166,7 +167,7 @@ void FALCON::create_falcon_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attention") .c_str() /*name*/ ); break; @@ -187,7 +188,7 @@ void FALCON::create_falcon_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_dense_h_to_4h") + std::string("layers." + std::to_string(i) + ".mlp.dense_h_to_4h") .c_str()); dense_h_to_4h = ff.gelu(dense_h_to_4h); @@ -203,7 +204,7 @@ void FALCON::create_falcon_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_dense_4h_to_h") + std::string("layers." + std::to_string(i) + ".mlp.dense_4h_to_h") .c_str()); } // final normalization and linear @@ -216,6 +217,7 @@ void FALCON::create_falcon_model(FFModel &ff, true, falcon_config.layer_norm_epsilon, true, + false, DT_NONE, "ln_f"); Tensor ln_f = res_ln_outputs[1]; diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 517f534438..cf26194597 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -58,7 +58,7 @@ void LLAMA::create_llama_model(FFModel &ff, use_full_precision ? 
DT_FLOAT : DT_HALF, NULL, embed_init, - "tok_embeddings"); + "embed_tokens"); Tensor w2 = nullptr; @@ -75,7 +75,7 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.rms_norm_eps, llama_config.hidden_size, DT_NONE, - std::string("layers_" + std::to_string(i) + "_attention_norm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); } else { ff.residual_rms_norm( @@ -84,8 +84,9 @@ void LLAMA::create_llama_model(FFModel &ff, token_att_norm, llama_config.rms_norm_eps, llama_config.hidden_size, + false, // inplace_residual DT_NONE, - std::string("layers_" + std::to_string(i) + "_attention_norm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); token = token_att_norm[0]; att_norm = token_att_norm[1]; @@ -94,10 +95,11 @@ void LLAMA::create_llama_model(FFModel &ff, Tensor mha; switch (mode) { case BEAM_SEARCH_MODE: { - mha = ff.spec_inc_multihead_self_attention( + mha = ff.spec_inc_multiquery_self_attention( att_norm, llama_config.hidden_size, llama_config.num_attention_heads, + llama_config.num_key_value_heads, llama_config.hidden_size / llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, 0.0f, /*dropout*/ @@ -111,16 +113,17 @@ void LLAMA::create_llama_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; } case TREE_VERIFY_MODE: { - mha = ff.inc_multihead_self_attention_verify( + mha = ff.inc_multiquery_self_attention_verify( att_norm, llama_config.hidden_size, llama_config.num_attention_heads, + llama_config.num_key_value_heads, llama_config.hidden_size / llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, 0.0f, /*dropout*/ @@ -134,16 +137,17 @@ void LLAMA::create_llama_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; } case INC_DECODING_MODE: { - mha = ff.inc_multihead_self_attention( + mha = ff.inc_multiquery_self_attention( att_norm, llama_config.hidden_size, llama_config.num_attention_heads, + llama_config.num_key_value_heads, llama_config.hidden_size / llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, 0.0f, /*dropout*/ @@ -157,7 +161,7 @@ void LLAMA::create_llama_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -175,54 +179,56 @@ void LLAMA::create_llama_model(FFModel &ff, token_ff_norm, llama_config.rms_norm_eps, llama_config.hidden_size, + false, // inplace_residual DT_NONE, - std::string("layers_" + std::to_string(i) + "_ffn_norm").c_str()); + std::string("layers." 
+ std::to_string(i) + ".post_attention_layernorm") + .c_str()); token = token_ff_norm[0]; Tensor ff_norm = token_ff_norm[1]; - Tensor w1 = - ff.dense(ff_norm, - llama_config.intermediate_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - std::string("layers_" + std::to_string(i) + "_feed_forward_w1") - .c_str()); + Tensor w1 = ff.dense( + ff_norm, + llama_config.intermediate_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." + std::to_string(i) + ".mlp.gate_proj").c_str()); - Tensor w3 = - ff.dense(ff_norm, - llama_config.intermediate_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - std::string("layers_" + std::to_string(i) + "_feed_forward_w3") - .c_str()); + Tensor w3 = ff.dense( + ff_norm, + llama_config.intermediate_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." + std::to_string(i) + ".mlp.up_proj").c_str()); Tensor multi = ff.sigmoid_silu_multi(w1, w3); - w2 = - ff.dense(multi, - llama_config.hidden_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - std::string("layers_" + std::to_string(i) + "_feed_forward_w2") - .c_str()); + w2 = ff.dense( + multi, + llama_config.hidden_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." + std::to_string(i) + ".mlp.down_proj").c_str()); + // Low-Rank Adapter (LoRA) for the second linear layer + // ff.lora_linear(std::string("down_proj"), std::string("layers." + + // std::to_string(i) + ".mlp.down_proj.lora").c_str()); } // final normalization and linear Tensor final_rms_norm_output[2] = {nullptr, nullptr}; @@ -231,6 +237,7 @@ void LLAMA::create_llama_model(FFModel &ff, final_rms_norm_output, llama_config.rms_norm_eps, llama_config.hidden_size, + false, // inplace_residual DT_NONE, "norm"); @@ -244,7 +251,7 @@ void LLAMA::create_llama_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - "output"); + "lm_head"); Tensor output; if (mode == BEAM_SEARCH_MODE) { @@ -261,7 +268,8 @@ void LLAMA::create_llama_model(FFModel &ff, output = ff.sampling(softmax, generation_config.topp); } else { // output = ff.arg_top_k(dense, /*k=*/1, false); - output = ff.argmax(dense, /*beam_Search*/ false); + Tensor softmax = ff.softmax(dense, -1); + output = ff.argmax(softmax, /*beam_Search*/ false); } } @@ -269,7 +277,7 @@ void LLAMA::create_llama_model(FFModel &ff, "", weight_file_path, llama_config.num_attention_heads, - llama_config.num_attention_heads, + llama_config.num_key_value_heads, llama_config.hidden_size, llama_config.hidden_size / llama_config.num_attention_heads, ff.config.tensor_parallelism_degree, diff --git a/inference/models/llama.h b/inference/models/llama.h index ba1f0236f9..edb78f1300 100644 --- a/inference/models/llama.h +++ b/inference/models/llama.h @@ -36,6 +36,11 @@ class LLAMA { num_hidden_layers = model_config["num_hidden_layers"]; vocab_size = model_config["vocab_size"]; num_attention_heads = model_config["num_attention_heads"]; + if (model_config.find("num_key_value_heads") != model_config.end()) { + num_key_value_heads = model_config["num_key_value_heads"]; + } else { + num_key_value_heads = num_attention_heads; + } hidden_size = model_config["hidden_size"]; rms_norm_eps = model_config["rms_norm_eps"]; intermediate_size = model_config["intermediate_size"]; @@ -61,6 +66,8 @@ 
class LLAMA { std::cout << "\tvocab_size: " << vocab_size << std::endl; std::cout << "\tnum_attention_heads: " << num_attention_heads << std::endl; + std::cout << "\tnum_key_value_heads: " << num_key_value_heads + << std::endl; std::cout << "\thidden_size: " << hidden_size << std::endl; std::cout << "\trms_norm_eps: " << rms_norm_eps << std::endl; std::cout << "\tintermediate_size: " << intermediate_size << std::endl; @@ -73,8 +80,8 @@ class LLAMA { // int max_seq_len, max_num_tokens; int max_beam_width, max_beam_depth; - int num_hidden_layers, vocab_size, num_attention_heads, hidden_size, - intermediate_size; + int num_hidden_layers, vocab_size, num_attention_heads, num_key_value_heads, + hidden_size, intermediate_size; float rms_norm_eps; }; diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index 70e2b5e9c5..e4a7e0056d 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -58,7 +58,7 @@ void MPT::create_mpt_model(FFModel &ff, use_full_precision ? DT_FLOAT : DT_HALF, NULL, embed_init, - "transformer_wte"); + "wte"); Tensor intermediate_output = nullptr, layernorm_output = nullptr; Tensor res_ln_outputs[2] = {nullptr, nullptr}; @@ -74,7 +74,7 @@ void MPT::create_mpt_model(FFModel &ff, 1e-05, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_norm_1").c_str()); + std::string("layers." + std::to_string(i) + ".norm_1").c_str()); } else { ff.residual_layer_norm( intermediate_output, @@ -86,8 +86,9 @@ void MPT::create_mpt_model(FFModel &ff, true, 1e-05, false, + false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_norm_1").c_str()); + std::string("layers." + std::to_string(i) + ".norm_1").c_str()); hidden_states = res_ln_outputs[0]; layernorm_output = res_ln_outputs[1]; } @@ -113,7 +114,7 @@ void MPT::create_mpt_model(FFModel &ff, pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), /*qk_prod_scaling*/ false, /*position_bias*/ true, - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn") .c_str() /*name*/ ); break; @@ -137,7 +138,7 @@ void MPT::create_mpt_model(FFModel &ff, pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), /*qk_prod_scaling*/ false, /*position_bias*/ true, - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn") .c_str() /*name*/ ); break; @@ -161,7 +162,7 @@ void MPT::create_mpt_model(FFModel &ff, pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), /*qk_prod_scaling*/ false, /*position_bias*/ true, - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn") .c_str() /*name*/ ); break; @@ -181,8 +182,9 @@ void MPT::create_mpt_model(FFModel &ff, true, 1e-05, false, + false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_norm_2").c_str()); + std::string("layers." + std::to_string(i) + ".norm_2").c_str()); hidden_states = res_ln_outputs[0]; layernorm_output = res_ln_outputs[1]; @@ -198,7 +200,7 @@ void MPT::create_mpt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_ffn_up_proj").c_str()); + std::string("layers." + std::to_string(i) + ".ffn.up_proj").c_str()); layernorm_output = ff.gelu(layernorm_output); intermediate_output = ff.dense( layernorm_output, @@ -211,7 +213,7 @@ void MPT::create_mpt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_ffn_down_proj").c_str()); + std::string("layers." 
+ std::to_string(i) + ".ffn.down_proj").c_str()); } // final @@ -224,8 +226,9 @@ void MPT::create_mpt_model(FFModel &ff, true, 1e-05, false, + false, DT_NONE, - "transformer_norm_f"); + "norm_f"); Tensor all_final_norm = res_ln_outputs[1]; Tensor lm_head = ff.dense(all_final_norm, diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 5677d5658e..b3f2ef4e17 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -94,8 +94,9 @@ void OPT::create_opt_model(FFModel &ff, opt_config.layer_norm_elementwise_affine, 1e-05, true, + false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_attention_layer_norm") + std::string("layers." + std::to_string(i) + ".self_attn_layer_norm") .c_str()); Tensor residual = res_ln_outputs[0]; Tensor hidden_states = res_ln_outputs[1]; @@ -121,7 +122,7 @@ void OPT::create_opt_model(FFModel &ff, -0.5), /*scaling factor*/ false, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -145,7 +146,7 @@ void OPT::create_opt_model(FFModel &ff, -0.5), /*scaling factor*/ false, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -169,7 +170,7 @@ void OPT::create_opt_model(FFModel &ff, -0.5), /*scaling factor*/ false, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -186,9 +187,10 @@ void OPT::create_opt_model(FFModel &ff, opt_config.layer_norm_elementwise_affine, 1e-05, true, + false, DT_NONE, - std::string("layers_" + std::to_string(i) + - "_add_bias_residual_layer_norm") + std::string("layers." + std::to_string(i) + + ".add_bias_residual_layer_norm") .c_str()); added = res_ln_outputs[0]; Tensor final_norm = res_ln_outputs[1]; @@ -205,7 +207,7 @@ void OPT::create_opt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_fc1").c_str()); + std::string("layers." + std::to_string(i) + ".fc1").c_str()); fc2 = ff.dense(fc1, opt_config.hidden_size, AC_MODE_NONE, @@ -216,7 +218,10 @@ void OPT::create_opt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_fc2").c_str()); + std::string("layers." + std::to_string(i) + ".fc2").c_str()); + // Low-Rank Adapter (LoRA) for the second linear layer + // ff.lora_linear(std::string("fc2"), std::string("layers." 
+ + // std::to_string(i) + ".fc2.lora").c_str()); } // final @@ -229,6 +234,7 @@ void OPT::create_opt_model(FFModel &ff, opt_config.layer_norm_elementwise_affine, 1e-05, true, + false, DT_NONE, "final_layer_norm"); Tensor all_final_norm = res_ln_outputs[1]; @@ -243,7 +249,7 @@ void OPT::create_opt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - "embed_tokens_weight_lm_head"); + "lm_head"); Tensor output; if (mode == BEAM_SEARCH_MODE) { @@ -252,7 +258,8 @@ void OPT::create_opt_model(FFModel &ff, output = ff.argmax(softmax, /*beam_Search*/ true); } else { // output = ff.arg_top_k(lm_head, /*k=*/1, false); - output = ff.argmax(lm_head, /*beam_Search*/ false); + Tensor softmax = ff.softmax(lm_head, -1); + output = ff.argmax(softmax, /*beam_Search*/ false); } FileDataLoader *fileloader = new FileDataLoader( diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index 8b0dc1098c..cd8bf3a9a7 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -66,7 +66,7 @@ void STARCODER::create_starcoder_model( use_full_precision ? DT_FLOAT : DT_HALF, NULL, embed_init, - "transformer_wte"); + "wte"); Tensor positional_embedding = ff.embedding(position_input, @@ -76,7 +76,7 @@ void STARCODER::create_starcoder_model( use_full_precision ? DT_FLOAT : DT_HALF, NULL, embed_init, - "transformer_wpe"); + "wpe"); Tensor residual = nullptr, c_proj = nullptr; Tensor res_ln_outputs[2] = {nullptr, nullptr}; @@ -96,8 +96,9 @@ void STARCODER::create_starcoder_model( true, startcoder_config.layer_norm_epsilon, true, + false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_ln_1").c_str()); + std::string("layers." + std::to_string(i) + ".ln_1").c_str()); Tensor hidden_states = res_ln_outputs[0]; Tensor ln_1 = res_ln_outputs[1]; @@ -124,7 +125,7 @@ void STARCODER::create_starcoder_model( 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn.c_attn") .c_str() /*name*/ ); break; @@ -144,8 +145,9 @@ void STARCODER::create_starcoder_model( true, startcoder_config.layer_norm_epsilon, true, + false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_ln_2").c_str()); + std::string("layers." + std::to_string(i) + ".ln_2").c_str()); residual = res_ln_outputs[0]; Tensor l2_norm = res_ln_outputs[1]; @@ -161,7 +163,7 @@ void STARCODER::create_starcoder_model( nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_c_fc").c_str()); + std::string("layers." + std::to_string(i) + ".mlp.c_fc").c_str()); c_fc = ff.gelu(c_fc); @@ -176,7 +178,7 @@ void STARCODER::create_starcoder_model( nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_c_proj").c_str()); + std::string("layers." 
+ std::to_string(i) + ".mlp.c_proj").c_str()); } // final normalization and linear ff.residual_layer_norm(residual, @@ -188,8 +190,9 @@ void STARCODER::create_starcoder_model( true, startcoder_config.layer_norm_epsilon, true, + false, DT_NONE, - "transformer_ln_f"); + "ln_f"); Tensor ln_f = res_ln_outputs[1]; Tensor lm_head = ff.dense(ln_f, diff --git a/inference/peft/CMakeLists.txt b/inference/peft/CMakeLists.txt new file mode 100644 index 0000000000..e0bad79cab --- /dev/null +++ b/inference/peft/CMakeLists.txt @@ -0,0 +1,139 @@ +cmake_minimum_required(VERSION 3.10) + +project(FlexFlow_Peft) + +# Normal PEFT +set(project_target1 peft) +set(CPU_SRC1 + ${FLEXFLOW_CPP_DRV_SRC} + peft.cc + ../models/llama.cc + ../models/opt.cc + ../models/falcon.cc + ../models/starcoder.cc + ../models/mpt.cc) + +if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + cuda_add_executable(${project_target1} ${CPU_SRC1}) + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + target_compile_definitions(${project_target1} PRIVATE __HIP_PLATFORM_NVIDIA__) + endif() +elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC1} PROPERTIES LANGUAGE HIP) + hip_add_executable(${project_target1} ${CPU_SRC1}) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is empty!") + endif() + set_property(TARGET ${project_target1} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + target_compile_definitions(${project_target1} PRIVATE __HIP_PLATFORM_AMD__) +else() + message(FATAL_ERROR "Compilation of ${project_target1} for ${FF_GPU_BACKEND} backend not yet supported") +endif() + +target_include_directories(${project_target1} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target1} PRIVATE ${CMAKE_SOURCE_DIR}/inference) +target_link_libraries(${project_target1} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) +set(BIN_DEST "bin") +install(TARGETS ${project_target1} DESTINATION ${BIN_DEST}) + +# FWD benchmark +set(project_target2 peft_fwd_benchmark) +set(CPU_SRC2 + ${FLEXFLOW_CPP_DRV_SRC} + peft_fwd_benchmark.cc + ../models/llama.cc + ../models/opt.cc + ../models/falcon.cc + ../models/starcoder.cc + ../models/mpt.cc) + +if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + cuda_add_executable(${project_target2} ${CPU_SRC2}) + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + target_compile_definitions(${project_target2} PRIVATE __HIP_PLATFORM_NVIDIA__) + endif() +elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC2} PROPERTIES LANGUAGE HIP) + hip_add_executable(${project_target2} ${CPU_SRC2}) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is empty!") + endif() + set_property(TARGET ${project_target2} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + target_compile_definitions(${project_target2} PRIVATE __HIP_PLATFORM_AMD__) +else() + message(FATAL_ERROR "Compilation of ${project_target2} for ${FF_GPU_BACKEND} backend not yet supported") +endif() + +target_include_directories(${project_target2} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target2} PRIVATE ${CMAKE_SOURCE_DIR}/inference) +target_link_libraries(${project_target2} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) +set(BIN_DEST "bin") +install(TARGETS ${project_target2} DESTINATION ${BIN_DEST}) + +# BWD benchmark +set(project_target3 peft_bwd_benchmark) +set(CPU_SRC3 + ${FLEXFLOW_CPP_DRV_SRC} + 
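+  # benchmark driver source, followed by the model definitions shared with the other PEFT targets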
peft_bwd_benchmark.cc + ../models/llama.cc + ../models/opt.cc + ../models/falcon.cc + ../models/starcoder.cc + ../models/mpt.cc) + +if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + cuda_add_executable(${project_target3} ${CPU_SRC3}) + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + target_compile_definitions(${project_target3} PRIVATE __HIP_PLATFORM_NVIDIA__) + endif() +elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC3} PROPERTIES LANGUAGE HIP) + hip_add_executable(${project_target3} ${CPU_SRC3}) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is empty!") + endif() + set_property(TARGET ${project_target3} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + target_compile_definitions(${project_target3} PRIVATE __HIP_PLATFORM_AMD__) +else() + message(FATAL_ERROR "Compilation of ${project_target3} for ${FF_GPU_BACKEND} backend not yet supported") +endif() + +target_include_directories(${project_target3} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target3} PRIVATE ${CMAKE_SOURCE_DIR}/inference) +target_link_libraries(${project_target3} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) +set(BIN_DEST "bin") +install(TARGETS ${project_target3} DESTINATION ${BIN_DEST}) + +# Online peft +set(project_target4 req_rate_benchmark) +set(CPU_SRC4 + ${FLEXFLOW_CPP_DRV_SRC} + req_rate_benchmark.cc + ../models/llama.cc + ../models/opt.cc + ../models/falcon.cc + ../models/starcoder.cc + ../models/mpt.cc) + +if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + cuda_add_executable(${project_target4} ${CPU_SRC4}) + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + target_compile_definitions(${project_target4} PRIVATE __HIP_PLATFORM_NVIDIA__) + endif() +elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC4} PROPERTIES LANGUAGE HIP) + hip_add_executable(${project_target4} ${CPU_SRC4}) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is empty!") + endif() + set_property(TARGET ${project_target4} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + target_compile_definitions(${project_target4} PRIVATE __HIP_PLATFORM_AMD__) +else() + message(FATAL_ERROR "Compilation of ${project_target4} for ${FF_GPU_BACKEND} backend not yet supported") +endif() + +target_include_directories(${project_target4} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target4} PRIVATE ${CMAKE_SOURCE_DIR}/inference) +target_link_libraries(${project_target4} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) +set(BIN_DEST "bin") +install(TARGETS ${project_target4} DESTINATION ${BIN_DEST}) diff --git a/inference/peft/Makefile b/inference/peft/Makefile new file mode 100644 index 0000000000..0e4b79f51f --- /dev/null +++ b/inference/peft/Makefile @@ -0,0 +1,37 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Flags for directing the runtime makefile what to include +DEBUG ?= 0 # Include debugging symbols +MAX_DIM ?= 4 # Maximum number of dimensions +OUTPUT_LEVEL ?= LEVEL_DEBUG # Compile time logging level +USE_CUDA ?= 1 # Include CUDA support (requires CUDA) +USE_GASNET ?= 0 # Include GASNet support (requires GASNet) +USE_HDF ?= 1 # Include HDF5 support (requires HDF5) +ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended) + +# Put the binary file name here +OUTFILE ?= llama_pipeline +# List all the application source files here +ifndef CUDA_HOME +CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc | head -1)) +endif + + +ifndef FF_HOME +$(error FF_HOME variable is not defined, aborting build) +endif + +include $(FF_HOME)/FlexFlow.mk diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc new file mode 100644 index 0000000000..c55f2c0bfd --- /dev/null +++ b/inference/peft/peft.cc @@ -0,0 +1,387 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/inference.h" +#include "flexflow/request_manager.h" +#include "models/falcon.h" +#include "models/llama.h" +#include "models/mpt.h" +#include "models/opt.h" +#include "models/starcoder.h" +#include + +#include + +using namespace FlexFlow; +using namespace Legion; +using json = nlohmann::json; + +Legion::Logger log_app("llama"); + +struct FilePaths { + std::string cache_folder_path; + std::string prompt_file_path; + std::string dataset_file_path; + std::string output_file_path; +}; + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + std::string &llm_model_name, + std::string &peft_model_name, + bool &use_full_precision, + bool &verbose, + bool &do_sample, + bool &enable_peft, + float &temperature, + float &topp, + int &max_requests_per_batch, + int &max_tokens_per_batch, + int &max_sequence_length) { + for (int i = 1; i < argc; i++) { + // llm model type + if (!strcmp(argv[i], "-llm-model")) { + llm_model_name = std::string(argv[++i]); + for (char &c : llm_model_name) { + c = std::tolower(c); + } + continue; + } + if (!strcmp(argv[i], "-enable-peft")) { + enable_peft = true; + continue; + } + if (!strcmp(argv[i], "-peft-model")) { + peft_model_name = std::string(argv[++i]); + for (char &c : peft_model_name) { + c = std::tolower(c); + } + continue; + } + // cache folder + if (!strcmp(argv[i], "-cache-folder")) { + paths.cache_folder_path = std::string(argv[++i]); + continue; + } + // prompts + if (!strcmp(argv[i], "-prompt")) { + paths.prompt_file_path = std::string(argv[++i]); + continue; + } + // dataset for finetuning + if (!strcmp(argv[i], "-finetuning-dataset")) { + paths.dataset_file_path = std::string(argv[++i]); + continue; + } + // output file + if (!strcmp(argv[i], "-output-file")) { + paths.output_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], 
"--use-full-precision")) { + use_full_precision = true; + continue; + } + // verbose logging to stdout + if (!strcmp(argv[i], "--verbose")) { + verbose = true; + continue; + } + if (!strcmp(argv[i], "--do-sample")) { + do_sample = true; + continue; + } + if (!strcmp(argv[i], "--temperature")) { + temperature = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--topp")) { + topp = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-requests-per-batch")) { + max_requests_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-batch")) { + max_tokens_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-sequence-length")) { + max_sequence_length = std::stoi(argv[++i]); + continue; + } + } + if (paths.cache_folder_path.empty()) { + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + paths.cache_folder_path = ff_cache_path ? std::string(ff_cache_path) + : std::string("~/.cache/flexflow"); + } + // Expand ~ to the home directory if needed + wordexp_t p; + wordexp(paths.cache_folder_path.c_str(), &p, 0); + paths.cache_folder_path = p.we_wordv[0]; + wordfree(&p); +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + if (ffconfig.cpu_offload == false && ffconfig.quantization_type != DT_NONE) { + assert(false && "Doesn't support quantization in non-offload mode"); + } + FilePaths file_paths; + std::string llm_model_name, peft_model_name; + bool use_full_precision = false; + bool verbose = false; + bool do_sample = false; + bool enable_peft = false; + float temperature = 0.0f; + float topp = 0.0f; + int max_requests_per_batch = 1; + int max_tokens_per_batch = 128; + int max_sequence_length = 256; + bool enable_peft_finetuning = true; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, + argc, + file_paths, + llm_model_name, + peft_model_name, + use_full_precision, + verbose, + do_sample, + enable_peft, + temperature, + topp, + max_requests_per_batch, + max_tokens_per_batch, + max_sequence_length); + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * + ffconfig.pipeline_parallelism_degree == + ffconfig.numNodes * ffconfig.workersPerNode); + + std::string config_filepath = join_path( + {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"}); + std::string tokenizer_filepath = + join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name}); + std::string weights_filepath = + join_path({file_paths.cache_folder_path, + "weights", + llm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + std::ifstream config_file_handle(config_filepath); + if (!config_file_handle.good()) { + std::cout << "Model config file " << config_filepath << " not found." 
+ << std::endl; + assert(false); + } + if (enable_peft && peft_model_name.empty()) { + std::cout << "PEFT enabled, but no PEFT model id passed" << std::endl; + assert(false); + } else if (!enable_peft && !peft_model_name.empty()) { + std::cout << "PEFT model id passed, but PEFT is not enabled" << std::endl; + assert(false); + } + + json model_config = json::parse(config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + ModelType model_type = ModelType::UNKNOWN; + auto architectures = model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") { + model_type = ModelType::FALCON; + break; + } else if (str == "GPTBigCodeForCausalLM") { + model_type = ModelType::STARCODER; + break; + } else if (str == "MPTForCausalLM") { + model_type = ModelType::MPT; + break; + } + } + int bos_token_id = model_config.find("bos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("bos_token_id"); + int eos_token_id = model_config.find("eos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("eos_token_id"); + + assert(model_type != ModelType::UNKNOWN && + "Invalid LLM model type passed (or no type was passed)."); + + // load PEFT config + LoraLinearConfig peft_config = + peft_model_name.empty() + ? LoraLinearConfig::EmptyConfig + : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); + + LoraOptimizerConfig *optim_config = nullptr; + if (enable_peft_finetuning) { + // float sgd_learning_rate = 2e-1; + float sgd_learning_rate = 1.0f; + optim_config = new LoraSGDOptimizerConfig(sgd_learning_rate); + } + LoraLinearConfig peft_config_finetuning = + peft_model_name.empty() + ? LoraLinearConfig::EmptyConfig + : LoraLinearConfig(file_paths.cache_folder_path, + peft_model_name, + true /*trainable*/, + optim_config, + false /*init_lora_weights*/, + llm_model_name, + use_full_precision ? 
"fp32" : "fp16"); + + GenerationConfig generationConfig(do_sample, temperature, topp); + RequestManager *rm = RequestManager::get_request_manager(); + rm->set_max_requests_per_batch( + max_requests_per_batch + + (int)enable_peft_finetuning); // add one slot for finetuning if needed + rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_sequence_length(max_sequence_length); + rm->register_tokenizer( + model_type, bos_token_id, eos_token_id, tokenizer_filepath); + rm->register_output_filepath(file_paths.output_file_path); + rm->set_enable_peft_finetuning(enable_peft_finetuning); + + FFModel model(ffconfig, ffconfig.cpu_offload); + if (model_type == ModelType::LLAMA) { + LLAMA::create_llama_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::OPT) { + OPT::create_opt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::FALCON) { + FALCON::create_falcon_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::STARCODER) { + STARCODER::create_starcoder_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::MPT) { + MPT::create_mpt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else { + assert(false && "unknow model type"); + } + + // Add PEFT layer + PEFTModelID *peft_model_id = nullptr, *peft_model_id_finetuning = nullptr; + if (!peft_model_name.empty()) { + peft_model_id = model.add_lora_layer(peft_config); + if (enable_peft_finetuning) { + peft_model_id_finetuning = model.add_lora_layer(peft_config_finetuning); + } + } + + // Start background server + rm->start_background_server(&model); + + // Run workload + { + std::vector requests; + + // Add inference requests + if (!file_paths.prompt_file_path.empty()) { + using json = nlohmann::json; + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + int total_num_requests = 0; + for (auto &prompt : prompt_json) { + std::string text = prompt.get(); + printf("Inference prompt[%d]: %s\n", total_num_requests, text.c_str()); + Request inference_req; + inference_req.prompt = text; + inference_req.max_sequence_length = 128; + inference_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + requests.push_back(inference_req); + total_num_requests++; + } + } + + // Add fine-tuning request + if (enable_peft_finetuning) { + assert(!file_paths.dataset_file_path.empty() && + "Dataset file path is required for fine-tuning."); + printf("Finetuning request with dataset %s\n", + file_paths.dataset_file_path.c_str()); + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.peft_model_id = (peft_model_id_finetuning != nullptr) + ? 
*peft_model_id_finetuning + : PEFTModelID::NO_ID; + fine_tuning_req.dataset_filepath = file_paths.dataset_file_path; + fine_tuning_req.max_training_steps = 2; + requests.push_back(fine_tuning_req); + } + std::vector result = model.generate(requests); + } + + // terminate the request manager by stopping the background thread + rm->terminate_background_server(); + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + if (peft_model_id != nullptr) { + free(peft_model_id); + } + + std::cout << "----------inference finished--------------" << std::endl; + + // free tokenizer space in memory +} + +void FlexFlow::register_custom_tasks() {} diff --git a/inference/peft/peft_bwd_benchmark.cc b/inference/peft/peft_bwd_benchmark.cc new file mode 100644 index 0000000000..86d6d8cbbf --- /dev/null +++ b/inference/peft/peft_bwd_benchmark.cc @@ -0,0 +1,391 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/inference.h" +#include "flexflow/request_manager.h" +#include "models/falcon.h" +#include "models/llama.h" +#include "models/mpt.h" +#include "models/opt.h" +#include "models/starcoder.h" +#include + +#include + +using namespace FlexFlow; +using namespace Legion; +using json = nlohmann::json; + +Legion::Logger log_app("llama"); + +struct FilePaths { + std::string cache_folder_path; + std::string prompt_file_path; + std::string output_file_path; +}; + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + std::string &llm_model_name, + std::string &peft_model_name, + bool &use_full_precision, + bool &verbose, + bool &do_sample, + bool &enable_peft, + float &temperature, + float &topp, + int &max_requests_per_batch, + int &max_tokens_per_batch, + int &max_sequence_length, + int &max_requests_to_run) { + for (int i = 1; i < argc; i++) { + // llm model type + if (!strcmp(argv[i], "-llm-model")) { + llm_model_name = std::string(argv[++i]); + for (char &c : llm_model_name) { + c = std::tolower(c); + } + continue; + } + if (!strcmp(argv[i], "-enable-peft")) { + enable_peft = true; + continue; + } + if (!strcmp(argv[i], "-peft-model")) { + peft_model_name = std::string(argv[++i]); + for (char &c : peft_model_name) { + c = std::tolower(c); + } + continue; + } + // cache folder + if (!strcmp(argv[i], "-cache-folder")) { + paths.cache_folder_path = std::string(argv[++i]); + continue; + } + // prompts + if (!strcmp(argv[i], "-prompt")) { + paths.prompt_file_path = std::string(argv[++i]); + continue; + } + // output file + if (!strcmp(argv[i], "-output-file")) { + paths.output_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--use-full-precision")) { + use_full_precision = true; + continue; + } + // verbose logging to stdout + if (!strcmp(argv[i], "--verbose")) { + verbose = true; + continue; + } + if (!strcmp(argv[i], "--do-sample")) { + do_sample = true; + continue; + } + if (!strcmp(argv[i], 
"--temperature")) { + temperature = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--topp")) { + topp = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-requests-per-batch")) { + max_requests_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-batch")) { + max_tokens_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-sequence-length")) { + max_sequence_length = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-requests-to-run")) { + max_requests_to_run = std::stoi(argv[++i]); + continue; + } + } + if (paths.cache_folder_path.empty()) { + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + paths.cache_folder_path = ff_cache_path ? std::string(ff_cache_path) + : std::string("~/.cache/flexflow"); + } + // Expand ~ to the home directory if needed + wordexp_t p; + wordexp(paths.cache_folder_path.c_str(), &p, 0); + paths.cache_folder_path = p.we_wordv[0]; + wordfree(&p); +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + if (ffconfig.cpu_offload == false && ffconfig.quantization_type != DT_NONE) { + assert(false && "Doesn't support quantization in non-offload mode"); + } + FilePaths file_paths; + std::string llm_model_name, peft_model_name; + bool use_full_precision = false; + bool verbose = false; + bool do_sample = false; + bool enable_peft = false; + float temperature = 0.0f; + float topp = 0.0f; + int max_requests_per_batch = 8; + int max_tokens_per_batch = 128; + int max_sequence_length = 256; + int max_requests_to_run = 1000000000; + bool enable_peft_finetuning = false; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, + argc, + file_paths, + llm_model_name, + peft_model_name, + use_full_precision, + verbose, + do_sample, + enable_peft, + temperature, + topp, + max_requests_per_batch, + max_tokens_per_batch, + max_sequence_length, + max_requests_to_run); + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * + ffconfig.pipeline_parallelism_degree == + ffconfig.numNodes * ffconfig.workersPerNode); + + std::string config_filepath = join_path( + {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"}); + std::string tokenizer_filepath = + join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name}); + std::string weights_filepath = + join_path({file_paths.cache_folder_path, + "weights", + llm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + std::ifstream config_file_handle(config_filepath); + if (!config_file_handle.good()) { + std::cout << "Model config file " << config_filepath << " not found." 
+ << std::endl; + assert(false); + } + if (enable_peft && peft_model_name.empty()) { + std::cout << "PEFT enabled, but no PEFT model id passed" << std::endl; + assert(false); + } else if (!enable_peft && !peft_model_name.empty()) { + std::cout << "PEFT model id passed, but PEFT is not enabled" << std::endl; + assert(false); + } + + json model_config = json::parse(config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + ModelType model_type = ModelType::UNKNOWN; + auto architectures = model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") { + model_type = ModelType::FALCON; + break; + } else if (str == "GPTBigCodeForCausalLM") { + model_type = ModelType::STARCODER; + break; + } else if (str == "MPTForCausalLM") { + model_type = ModelType::MPT; + break; + } + } + int bos_token_id = model_config.find("bos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("bos_token_id"); + int eos_token_id = model_config.find("eos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("eos_token_id"); + + assert(model_type != ModelType::UNKNOWN && + "Invalid LLM model type passed (or no type was passed)."); + + // load PEFT config + LoraLinearConfig peft_config = + peft_model_name.empty() + ? LoraLinearConfig::EmptyConfig + : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); + + GenerationConfig generationConfig(do_sample, temperature, topp); + RequestManager *rm = RequestManager::get_request_manager(); + rm->set_max_requests_per_batch( + max_requests_per_batch + + (int)enable_peft_finetuning); // add one slot for finetuning if needed + rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_sequence_length(max_sequence_length); + rm->register_tokenizer( + model_type, bos_token_id, eos_token_id, tokenizer_filepath); + rm->register_output_filepath(file_paths.output_file_path); + rm->set_enable_peft_finetuning(enable_peft_finetuning); + + FFModel model(ffconfig, ffconfig.cpu_offload); + if (model_type == ModelType::LLAMA) { + LLAMA::create_llama_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::OPT) { + OPT::create_opt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::FALCON) { + FALCON::create_falcon_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::STARCODER) { + STARCODER::create_starcoder_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::MPT) { + MPT::create_mpt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else { + assert(false && "unknow model type"); + } + + // Add PEFT layer + PEFTModelID *peft_model_id = nullptr; + if (!peft_model_name.empty()) { + peft_model_id = model.add_lora_layer(peft_config); + } + + // Start background server + rm->start_background_server(&model); + + // Warmup stage + { + std::vector requests; + for (int i = 0; i < 100; i++) { + Request inference_req; + 
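+      // synthetic warmup request: the dummy token counts below stand in for a real prompt (benchmarking mode)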
inference_req.benchmarking_tokens = 128; + inference_req.max_sequence_length = 256; + inference_req.warmup = true; + inference_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + requests.push_back(inference_req); + } + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.benchmarking_tokens = 1024; + fine_tuning_req.max_sequence_length = 1024; + fine_tuning_req.warmup = true; + fine_tuning_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + fine_tuning_req.max_training_steps = 1; + requests.push_back(fine_tuning_req); + std::vector result = model.generate(requests); + } + + rm->set_inference_finished(false); // reset inference finished flag + std::cout << "----------warmup finished--------------" << std::endl; + + // Run workload + { + std::vector requests; + + // Add inference requests + using json = nlohmann::json; + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + std::vector lengths; + int index = 0; + for (auto &entry : prompt_json) { + if (index == max_requests_to_run) { + break; + } + int prompt_length = entry.get(); + assert(prompt_length > 0 && "Prompt length must be greater than 0."); + assert(prompt_length <= 1024 && + "Prompt length must be less than or equal to 1024."); + lengths.push_back(prompt_length); + index++; + } + printf("Total number of finetuning requests: %ld", lengths.size()); + + // Add fine-tuning requests + for (int i = 0; i < lengths.size(); i++) { + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.benchmarking_tokens = lengths[i]; + fine_tuning_req.max_sequence_length = lengths[i]; + fine_tuning_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + fine_tuning_req.max_training_steps = 1; + requests.push_back(fine_tuning_req); + } + std::vector result = model.generate(requests); + } + + // terminate the request manager by stopping the background thread + rm->terminate_background_server(); + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + if (peft_model_id != nullptr) { + free(peft_model_id); + } + + std::cout << "----------finetuning finished--------------" << std::endl; + + // free tokenizer space in memory +} + +void FlexFlow::register_custom_tasks() {} diff --git a/inference/peft/peft_fwd_benchmark.cc b/inference/peft/peft_fwd_benchmark.cc new file mode 100644 index 0000000000..9ff042c157 --- /dev/null +++ b/inference/peft/peft_fwd_benchmark.cc @@ -0,0 +1,363 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/inference.h" +#include "flexflow/request_manager.h" +#include "models/falcon.h" +#include "models/llama.h" +#include "models/mpt.h" +#include "models/opt.h" +#include "models/starcoder.h" +#include + +#include + +using namespace FlexFlow; +using namespace Legion; +using json = nlohmann::json; + +Legion::Logger log_app("llama"); + +struct FilePaths { + std::string cache_folder_path; + std::string prompt_file_path; + std::string output_file_path; +}; + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + std::string &llm_model_name, + std::string &peft_model_name, + bool &use_full_precision, + bool &verbose, + bool &do_sample, + bool &enable_peft, + float &temperature, + float &topp, + int &max_requests_per_batch, + int &max_tokens_per_batch, + int &max_sequence_length, + int &max_requests_to_run) { + for (int i = 1; i < argc; i++) { + // llm model type + if (!strcmp(argv[i], "-llm-model")) { + llm_model_name = std::string(argv[++i]); + for (char &c : llm_model_name) { + c = std::tolower(c); + } + continue; + } + if (!strcmp(argv[i], "-enable-peft")) { + enable_peft = true; + continue; + } + if (!strcmp(argv[i], "-peft-model")) { + peft_model_name = std::string(argv[++i]); + for (char &c : peft_model_name) { + c = std::tolower(c); + } + continue; + } + // cache folder + if (!strcmp(argv[i], "-cache-folder")) { + paths.cache_folder_path = std::string(argv[++i]); + continue; + } + // prompts + if (!strcmp(argv[i], "-prompt")) { + paths.prompt_file_path = std::string(argv[++i]); + continue; + } + // output file + if (!strcmp(argv[i], "-output-file")) { + paths.output_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--use-full-precision")) { + use_full_precision = true; + continue; + } + // verbose logging to stdout + if (!strcmp(argv[i], "--verbose")) { + verbose = true; + continue; + } + if (!strcmp(argv[i], "--do-sample")) { + do_sample = true; + continue; + } + if (!strcmp(argv[i], "--temperature")) { + temperature = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--topp")) { + topp = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-requests-per-batch")) { + max_requests_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-batch")) { + max_tokens_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-sequence-length")) { + max_sequence_length = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-requests-to-run")) { + max_requests_to_run = std::stoi(argv[++i]); + continue; + } + } + if (paths.cache_folder_path.empty()) { + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + paths.cache_folder_path = ff_cache_path ? 
std::string(ff_cache_path) + : std::string("~/.cache/flexflow"); + } + // Expand ~ to the home directory if needed + wordexp_t p; + wordexp(paths.cache_folder_path.c_str(), &p, 0); + paths.cache_folder_path = p.we_wordv[0]; + wordfree(&p); +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + if (ffconfig.cpu_offload == false && ffconfig.quantization_type != DT_NONE) { + assert(false && "Doesn't support quantization in non-offload mode"); + } + FilePaths file_paths; + std::string llm_model_name, peft_model_name; + bool use_full_precision = false; + bool verbose = false; + bool do_sample = false; + bool enable_peft = false; + float temperature = 0.0f; + float topp = 0.0f; + int max_requests_per_batch = 8; + int max_tokens_per_batch = 128; + int max_sequence_length = 256; + int max_requests_to_run = 1000000000; + bool enable_peft_finetuning = false; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, + argc, + file_paths, + llm_model_name, + peft_model_name, + use_full_precision, + verbose, + do_sample, + enable_peft, + temperature, + topp, + max_requests_per_batch, + max_tokens_per_batch, + max_sequence_length, + max_requests_to_run); + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * + ffconfig.pipeline_parallelism_degree == + ffconfig.numNodes * ffconfig.workersPerNode); + + std::string config_filepath = join_path( + {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"}); + std::string tokenizer_filepath = + join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name}); + std::string weights_filepath = + join_path({file_paths.cache_folder_path, + "weights", + llm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + std::ifstream config_file_handle(config_filepath); + if (!config_file_handle.good()) { + std::cout << "Model config file " << config_filepath << " not found." + << std::endl; + assert(false); + } + if (enable_peft && peft_model_name.empty()) { + std::cout << "PEFT enabled, but no PEFT model id passed" << std::endl; + assert(false); + } else if (!enable_peft && !peft_model_name.empty()) { + std::cout << "PEFT model id passed, but PEFT is not enabled" << std::endl; + assert(false); + } + + json model_config = json::parse(config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + ModelType model_type = ModelType::UNKNOWN; + auto architectures = model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") { + model_type = ModelType::FALCON; + break; + } else if (str == "GPTBigCodeForCausalLM") { + model_type = ModelType::STARCODER; + break; + } else if (str == "MPTForCausalLM") { + model_type = ModelType::MPT; + break; + } + } + int bos_token_id = model_config.find("bos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("bos_token_id"); + int eos_token_id = model_config.find("eos_token_id") == model_config.end() + ? 
-1 + : (int)model_config.at("eos_token_id"); + + assert(model_type != ModelType::UNKNOWN && + "Invalid LLM model type passed (or no type was passed)."); + + // load PEFT config + LoraLinearConfig peft_config = + peft_model_name.empty() + ? LoraLinearConfig::EmptyConfig + : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); + + GenerationConfig generationConfig(do_sample, temperature, topp); + RequestManager *rm = RequestManager::get_request_manager(); + rm->set_max_requests_per_batch( + max_requests_per_batch + + (int)enable_peft_finetuning); // add one slot for finetuning if needed + rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_sequence_length(max_sequence_length); + rm->register_tokenizer( + model_type, bos_token_id, eos_token_id, tokenizer_filepath); + rm->register_output_filepath(file_paths.output_file_path); + rm->set_enable_peft_finetuning(enable_peft_finetuning); + + FFModel model(ffconfig, ffconfig.cpu_offload); + if (model_type == ModelType::LLAMA) { + LLAMA::create_llama_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::OPT) { + OPT::create_opt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::FALCON) { + FALCON::create_falcon_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::STARCODER) { + STARCODER::create_starcoder_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::MPT) { + MPT::create_mpt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else { + assert(false && "unknow model type"); + } + + // Add PEFT layer + PEFTModelID *peft_model_id = nullptr; + if (!peft_model_name.empty()) { + peft_model_id = model.add_lora_layer(peft_config); + } + + // Start background server + rm->start_background_server(&model); + + // Run workload + { + std::vector requests; + + // Add inference requests + using json = nlohmann::json; + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + std::vector> prompts; + int index = 0; + for (auto &entry : prompt_json) { + if (index >= max_requests_to_run) { + break; + } + int prompt_length = entry["human"]; + int sequence_length = entry["gpt"]; + assert(prompt_length + sequence_length <= max_sequence_length && + "Prompt + sequence length exceeds max sequence length"); + prompts.push_back(std::make_pair(prompt_length, sequence_length)); + index++; + } + printf("Total number of prompts: %ld", prompts.size()); + for (auto &prompt : prompts) { + // printf("Prompt length: %d, sequence length: %d\n", prompt_length, + // sequence_length); + Request inference_req; + inference_req.benchmarking_tokens = prompt.first; + inference_req.max_sequence_length = prompt.second + prompt.first; + inference_req.peft_model_id = + (peft_model_id != nullptr) ? 
*peft_model_id : PEFTModelID::NO_ID; + requests.push_back(inference_req); + } + + std::vector result = model.generate(requests); + } + + // terminate the request manager by stopping the background thread + rm->terminate_background_server(); + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + if (peft_model_id != nullptr) { + free(peft_model_id); + } + + std::cout << "----------inference finished--------------" << std::endl; + + // free tokenizer space in memory +} + +void FlexFlow::register_custom_tasks() {} diff --git a/inference/peft/req_rate_benchmark.cc b/inference/peft/req_rate_benchmark.cc new file mode 100644 index 0000000000..43008e74fe --- /dev/null +++ b/inference/peft/req_rate_benchmark.cc @@ -0,0 +1,518 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/inference.h" +#include "flexflow/request_manager.h" +#include "inference/models/falcon.h" +#include "inference/models/llama.h" +#include "inference/models/mpt.h" +#include "inference/models/opt.h" +#include "inference/models/starcoder.h" +#include +#include +#include +#include + +#include + +using namespace FlexFlow; +using namespace Legion; +using json = nlohmann::json; + +Legion::Logger log_app("llama"); + +class ConcurrentQueue { +public: + std::queue inf_queue; + std::queue peft_queue; + std::mutex request_queue_mutex; + bool producer_finished = false; +}; + +ConcurrentQueue *common_guids_singleton = nullptr; +int nb_millisecs = 1000; // Default bucket timeframe is 1 second + +ConcurrentQueue *get_common_guids_queue() { + if (common_guids_singleton == nullptr) { + common_guids_singleton = new ConcurrentQueue(); + } + return common_guids_singleton; +} + +void consume() { + RequestManager *rm = RequestManager::get_request_manager(); + ConcurrentQueue *guids = get_common_guids_queue(); + bool producer_is_finished = false; + bool queue_is_empty = false; + // int i=0; + while (!producer_is_finished || !queue_is_empty) { + RequestManager::RequestGuid guid = RequestManager::INVALID_GUID; + { + const std::lock_guard lock(guids->request_queue_mutex); + queue_is_empty = guids->inf_queue.empty(); + producer_is_finished = guids->producer_finished; + if (!queue_is_empty) { + guid = guids->inf_queue.front(); + guids->inf_queue.pop(); + } + } + if (guid != RequestManager::INVALID_GUID) { + GenerationResult result = rm->get_generation_result(guid); + } else { + std::this_thread::sleep_for(std::chrono::milliseconds(nb_millisecs)); + } + // i++; + // cout << "Iteration " << i; + } + rm->set_inference_finished(); + + while (guids->peft_queue.size() > 0) { + GenerationResult result = + rm->get_generation_result(guids->peft_queue.front()); + guids->peft_queue.pop(); + } +} + +struct FilePaths { + std::string cache_folder_path; + std::string prompt_file_path; + std::string output_file_path; +}; + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + 
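
The request-rate benchmark above decouples request submission from result collection: the main thread registers requests and pushes their GUIDs into a mutex-protected `ConcurrentQueue`, while the `consume()` thread pops GUIDs and blocks on `get_generation_result` until the producer marks `producer_finished`. A minimal Python sketch of the same producer/consumer hand-off (standard library only; the names are illustrative and not part of the FlexFlow API) is:

```python
import queue
import threading
import time

def consume(guid_queue, results, done):
    # Drain the queue until the producer signals completion and no GUIDs remain,
    # mirroring the producer_finished / inf_queue checks in consume() above.
    while not (done.is_set() and guid_queue.empty()):
        try:
            guid = guid_queue.get(timeout=1.0)
        except queue.Empty:
            continue
        # Stand-in for rm->get_generation_result(guid), which blocks until the
        # request has finished generating.
        results[guid] = f"result-for-{guid}"

def produce(guid_queue, done, n):
    for guid in range(n):
        guid_queue.put(guid)   # like guids->inf_queue.push(guid)
        time.sleep(0.01)       # requests arrive over time
    done.set()                 # like guids->producer_finished = true

if __name__ == "__main__":
    q = queue.Queue()
    finished = threading.Event()
    out = {}
    consumer = threading.Thread(target=consume, args=(q, out, finished))
    consumer.start()
    produce(q, finished, n=5)
    consumer.join()            # like consumer.join() in the C++ benchmark
    print(sorted(out))
```
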
std::string &llm_model_name, + std::string &peft_model_name, + bool &use_full_precision, + bool &verbose, + bool &do_sample, + bool &enable_peft, + float &temperature, + float &topp, + int &max_requests_per_batch, + int &max_tokens_per_batch, + int &max_sequence_length, + int &max_buckets_to_run, + int &bucket_timeframe) { + for (int i = 1; i < argc; i++) { + // llm model type + if (!strcmp(argv[i], "-llm-model")) { + llm_model_name = std::string(argv[++i]); + for (char &c : llm_model_name) { + c = std::tolower(c); + } + continue; + } + if (!strcmp(argv[i], "-enable-peft")) { + enable_peft = true; + continue; + } + if (!strcmp(argv[i], "-peft-model")) { + peft_model_name = std::string(argv[++i]); + for (char &c : peft_model_name) { + c = std::tolower(c); + } + continue; + } + // cache folder + if (!strcmp(argv[i], "-cache-folder")) { + paths.cache_folder_path = std::string(argv[++i]); + continue; + } + // prompts + if (!strcmp(argv[i], "-prompt")) { + paths.prompt_file_path = std::string(argv[++i]); + continue; + } + // output file + if (!strcmp(argv[i], "-output-file")) { + paths.output_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--use-full-precision")) { + use_full_precision = true; + continue; + } + // verbose logging to stdout + if (!strcmp(argv[i], "--verbose")) { + verbose = true; + continue; + } + if (!strcmp(argv[i], "--do-sample")) { + do_sample = true; + continue; + } + if (!strcmp(argv[i], "--temperature")) { + temperature = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--topp")) { + topp = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-requests-per-batch")) { + max_requests_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-batch")) { + max_tokens_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-sequence-length")) { + max_sequence_length = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-buckets-to-run")) { + max_buckets_to_run = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--bucket-timeframe")) { + bucket_timeframe = std::stoi(argv[++i]); + continue; + } + } + if (paths.cache_folder_path.empty()) { + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + paths.cache_folder_path = ff_cache_path ? 
std::string(ff_cache_path) + : std::string("~/.cache/flexflow"); + } + // Expand ~ to the home directory if needed + wordexp_t p; + wordexp(paths.cache_folder_path.c_str(), &p, 0); + paths.cache_folder_path = p.we_wordv[0]; + wordfree(&p); +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + if (ffconfig.cpu_offload == false && ffconfig.quantization_type != DT_NONE) { + assert(false && "Doesn't support quantization in non-offload mode"); + } + FilePaths file_paths; + std::string llm_model_name, peft_model_name; + bool use_full_precision = false; + bool verbose = false; + bool do_sample = false; + bool enable_peft = false; + float temperature = 0.0f; + float topp = 0.0f; + int max_requests_per_batch = 8; + int max_tokens_per_batch = 128; + int max_sequence_length = 256; + int max_buckets_to_run = 1000000000; + bool enable_peft_finetuning = false; + int bucket_timespan = 1; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, + argc, + file_paths, + llm_model_name, + peft_model_name, + use_full_precision, + verbose, + do_sample, + enable_peft, + temperature, + topp, + max_requests_per_batch, + max_tokens_per_batch, + max_sequence_length, + max_buckets_to_run, + bucket_timespan); + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * + ffconfig.pipeline_parallelism_degree == + ffconfig.numNodes * ffconfig.workersPerNode); + + std::string config_filepath = join_path( + {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"}); + std::string tokenizer_filepath = + join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name}); + std::string weights_filepath = + join_path({file_paths.cache_folder_path, + "weights", + llm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + std::ifstream config_file_handle(config_filepath); + if (!config_file_handle.good()) { + std::cout << "Model config file " << config_filepath << " not found." + << std::endl; + assert(false); + } + if (enable_peft && peft_model_name.empty()) { + std::cout << "PEFT enabled, but no PEFT model id passed" << std::endl; + assert(false); + } else if (!enable_peft && !peft_model_name.empty()) { + std::cout << "PEFT model id passed, but PEFT is not enabled" << std::endl; + assert(false); + } + + json model_config = json::parse(config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + ModelType model_type = ModelType::UNKNOWN; + auto architectures = model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") { + model_type = ModelType::FALCON; + break; + } else if (str == "GPTBigCodeForCausalLM") { + model_type = ModelType::STARCODER; + break; + } else if (str == "MPTForCausalLM") { + model_type = ModelType::MPT; + break; + } + } + int bos_token_id = model_config.find("bos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("bos_token_id"); + int eos_token_id = model_config.find("eos_token_id") == model_config.end() + ? 
-1 + : (int)model_config.at("eos_token_id"); + + assert(model_type != ModelType::UNKNOWN && + "Invalid LLM model type passed (or no type was passed)."); + + // load PEFT config + LoraLinearConfig peft_config = + peft_model_name.empty() + ? LoraLinearConfig::EmptyConfig + : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); + + GenerationConfig generationConfig(do_sample, temperature, topp); + RequestManager *rm = RequestManager::get_request_manager(); + rm->set_max_requests_per_batch( + max_requests_per_batch + + (int)enable_peft_finetuning); // add one slot for finetuning if needed + rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_sequence_length(max_sequence_length); + rm->register_tokenizer( + model_type, bos_token_id, eos_token_id, tokenizer_filepath); + rm->register_output_filepath(file_paths.output_file_path); + rm->set_enable_peft_finetuning(enable_peft_finetuning); + + FFModel model(ffconfig, ffconfig.cpu_offload); + if (model_type == ModelType::LLAMA) { + LLAMA::create_llama_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::OPT) { + OPT::create_opt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::FALCON) { + FALCON::create_falcon_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::STARCODER) { + STARCODER::create_starcoder_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::MPT) { + MPT::create_mpt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else { + assert(false && "unknow model type"); + } + + // Add PEFT layer + PEFTModelID *peft_model_id = nullptr; + if (!peft_model_name.empty()) { + peft_model_id = model.add_lora_layer(peft_config); + } + + rm->start_background_server(&model); + + // Warmup stage + { + std::vector requests; + for (int i = 0; i < 100; i++) { + Request inference_req; + inference_req.benchmarking_tokens = 128; + inference_req.max_sequence_length = 256; + inference_req.warmup = true; + inference_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + requests.push_back(inference_req); + } + + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.benchmarking_tokens = 1024; + fine_tuning_req.max_sequence_length = 1024; + fine_tuning_req.warmup = true; + fine_tuning_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + fine_tuning_req.max_training_steps = 1; + requests.push_back(fine_tuning_req); + std::vector result = model.generate(requests); + } + + rm->set_inference_finished(false); // reset inference finished flag + std::cout << "----------warmup finished--------------" << std::endl; + + // Now run online workload! 
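
The loop that follows replays an online trace: the prompt file is a list of per-second arrival buckets, and every request in bucket `i` is submitted at `start_time + i * bucket_timeframe` (scaled to milliseconds through `nb_millisecs`). A rough Python sketch of that replay logic, assuming the same JSON layout of `{"human": prompt_tokens, "gpt": output_tokens}` entries used by the C++ code, is:

```python
import json
import time

def replay_trace(prompt_file, bucket_timeframe_s, submit):
    """Replay per-second arrival buckets: bucket i is submitted at
    start + i * bucket_timeframe_s. `submit` is a stand-in for
    rm->register_new_request(); names here are illustrative only."""
    with open(prompt_file) as f:
        buckets = json.load(f)  # list of lists of {"human": ..., "gpt": ...}
    start = time.monotonic()
    for i, bucket in enumerate(buckets):
        if not bucket:
            continue  # empty buckets only advance the clock
        # Sleep until this bucket's scheduled arrival time.
        target = start + i * bucket_timeframe_s
        time.sleep(max(0.0, target - time.monotonic()))
        for entry in bucket:
            prompt_len, output_len = entry["human"], entry["gpt"]
            submit(prompt_len, prompt_len + output_len)

# Example usage (prints instead of submitting real requests):
# replay_trace("trace.json", 1.0, lambda p, m: print(f"prompt={p}, max_len={m}"))
```
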
+ + nb_millisecs = nb_millisecs * bucket_timespan; + int total_num_requests = 0; + int num_arrival_buckets = 0; + ConcurrentQueue *guids = get_common_guids_queue(); + std::thread consumer{consume}; + { + + // Load all requests in advance + using json = nlohmann::json; + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + + auto const &lists = prompt_json.get>>(); + std::vector bucket_arrival_times_s; + std::vector>> buckets; + + size_t index = 0; + for (auto const &list : lists) { + if (!list.empty()) { + bucket_arrival_times_s.push_back(index); + std::vector> prompts; + for (auto const &dict : list) { + int prompt_length = dict["human"]; + int sequence_length = dict["gpt"]; + assert(prompt_length + sequence_length <= max_sequence_length && + "Prompt + sequence length exceeds max sequence length"); + prompts.push_back(std::make_pair(prompt_length, sequence_length)); + } + buckets.push_back(prompts); + } + index++; + } + assert(bucket_arrival_times_s.size() == buckets.size() && + "Bucket arrival times and buckets are not the same size"); + // for (int i=0; i<10; i++) { + // printf("bucket_arrival_times_s[%i]: %i\n", i, + // bucket_arrival_times_s[i]); printf("bucket[%i]: %i\n", i, + // buckets[i].size()); for (const auto& prompt : buckets[i]) { + // printf("\tprompt: %i, %i\n", prompt.first, prompt.second); + // } + // } + + // Add fine-tuning request + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.benchmarking_tokens = 1024; + fine_tuning_req.max_sequence_length = 1024; + fine_tuning_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + fine_tuning_req.max_training_steps = 1000000000; + RequestManager::RequestGuid ft_guid = + rm->register_new_peft_request(fine_tuning_req); + if (ft_guid != RequestManager::INVALID_GUID) { + const std::lock_guard lock(guids->request_queue_mutex); + guids->peft_queue.push(ft_guid); + } + + // Replay the trace of inference requests + auto start_time = std::chrono::steady_clock::now(); + for (int i = 0; i < bucket_arrival_times_s.size(); i++) { + if (bucket_arrival_times_s[i] >= max_buckets_to_run) { + break; + } + // sleep until bucket arrives + auto bucket_arrival_time = + start_time + + std::chrono::milliseconds(bucket_arrival_times_s[i] * nb_millisecs); + std::this_thread::sleep_until(bucket_arrival_time); + + // create inference requests for the bucket + std::vector requests; + for (auto const &prompt : buckets[i]) { + // printf("Prompt length: %d, sequence length: %d\n", prompt_length, + // sequence_length); + Request inference_req; + inference_req.benchmarking_tokens = prompt.first; + inference_req.max_sequence_length = prompt.second + prompt.first; + inference_req.peft_model_id = + (peft_model_id != nullptr) ? 
*peft_model_id : PEFTModelID::NO_ID; + requests.push_back(inference_req); + } + + { + const std::lock_guard lock(guids->request_queue_mutex); + for (int i = 0; i < requests.size(); i++) { + RequestManager::RequestGuid guid = + rm->register_new_request(requests.at(i)); + if (guid != RequestManager::INVALID_GUID) { + guids->inf_queue.push(guid); + } + } + } + } + + { // Notify the consumer that no more requests are incoming + const std::lock_guard lock(guids->request_queue_mutex); + guids->producer_finished = true; + } + } + + // Wait for consumer to finish + consumer.join(); + + // terminate the request manager by stopping the background thread + rm->terminate_background_server(); + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + // float* data + std::cout << "----------inference finished--------------" << std::endl; + + // free tokenizer space in memory +} + +void FlexFlow::register_custom_tasks() {} diff --git a/inference/python/ff_peft.py b/inference/python/ff_peft.py new file mode 100644 index 0000000000..a7d38a66b6 --- /dev/null +++ b/inference/python/ff_peft.py @@ -0,0 +1,189 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import flexflow.serve as ff +import argparse, json, os +from types import SimpleNamespace + + +def get_configs(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-config-file", + help="The path to a JSON file with the configs. 
If omitted, a sample model and configs will be used instead.", + type=str, + default="", + ) + args = parser.parse_args() + + # Load configs from JSON file (if specified) + if len(args.config_file) > 0: + if not os.path.isfile(args.config_file): + raise FileNotFoundError(f"Config file {args.config_file} not found.") + try: + with open(args.config_file) as f: + return json.load(f) + except json.JSONDecodeError as e: + print("JSON format error:") + print(e) + else: + # Define sample configs + ff_init_configs = { + # required parameters + "num_gpus": 2, + "memory_per_gpu": 14000, + "zero_copy_memory_per_node": 10000, + # optional parameters + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 2, + "pipeline_parallelism_degree": 1, + "offload": False, + "offload_reserve_space_size": 8 * 1024, # 8GB + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "enable_peft": True, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB + "profiling": False, + "inference_debugging": True, + "fusion": False, + } + model_configs = { + # required parameters + "base_model": "JackFram/llama-160m", + "inference_peft_model_id": "goliaro/llama-160m-lora", + "finetuning_peft_model_id": "goliaro/llama-160m-lora", + # "base_model": "meta-llama/Meta-Llama-3-8B", + # "inference_peft_model_id": "goliaro/llama-3-8b-lora", + # "finetuning_peft_model_id": "goliaro/llama-3-8b-lora-dolly", + # optional parameters + "cache_path": os.environ.get("FF_CACHE_PATH", ""), + "refresh_cache": False, + "full_precision": True, + "prompt": "", + "finetuning_dataset": os.path.join( + os.path.dirname(os.path.abspath(__file__)), + "../prompt/peft_dataset.json", + ), + "output_file": "", + } + # Merge dictionaries + ff_init_configs.update(model_configs) + return ff_init_configs + + +def main(): + configs_dict = get_configs() + configs = SimpleNamespace(**configs_dict) + + # Initialize the FlexFlow runtime. 
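
For reference, the `-config-file` path above expects a single JSON object that merges the runtime and model keys shown in the sample dictionaries. A hypothetical minimal file (key names taken from the sample configs above; the values are placeholders, not tuned recommendations) could be produced like this:

```python
import json

# Hypothetical minimal config for: python ff_peft.py -config-file peft_config.json
example_config = {
    "num_gpus": 1,
    "memory_per_gpu": 14000,
    "zero_copy_memory_per_node": 10000,
    "enable_peft": True,
    "base_model": "JackFram/llama-160m",
    "inference_peft_model_id": "goliaro/llama-160m-lora",
    "finetuning_peft_model_id": "goliaro/llama-160m-lora",
    "prompt": "",
    "finetuning_dataset": "../prompt/peft_dataset.json",
    "output_file": "",
}

with open("peft_config.json", "w") as f:
    json.dump(example_config, f, indent=2)
```
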
ff.init() takes a dictionary or the path to a JSON file with the configs + ff.init(configs_dict) + + # Create the FlexFlow LLM + ff_data_type = ( + ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + ) + llm = ff.LLM( + configs.base_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + output_file=configs.output_file, + ) + # Add inference and/or finetuning lora + lora_inference_config = None + lora_finetuning_config = None + if len(configs.prompt) > 0: + lora_inference_config = ff.LoraLinearConfig( + llm.cache_path, + configs.inference_peft_model_id, + base_model_name_or_path=configs.base_model, + ) + llm.add_peft(lora_inference_config) + if len(configs.finetuning_dataset) > 0: + # lora_finetuning_config = ff.LoraLinearConfig( + # llm.cache_path, + # configs.finetuning_peft_model_id, + # target_modules=["down_proj"], + # rank=16, + # lora_alpha=16, + # trainable=True, + # init_lora_weights=True, + # optimizer_type=ff.OptimizerType.OPTIMIZER_TYPE_SGD, + # ) + lora_finetuning_config = ff.LoraLinearConfig( + llm.cache_path, + configs.inference_peft_model_id, + trainable=True, + base_model_name_or_path=configs.base_model, + optimizer_type=ff.OptimizerType.OPTIMIZER_TYPE_SGD, + optimizer_kwargs={ + "learning_rate": 0.001, + "momentum": 0.0, + "weight_decay": 0.0, + "nesterov": False, + }, + ) + llm.add_peft(lora_finetuning_config) + + # Compile the LLM for inference and load the weights into memory + generation_config = ff.GenerationConfig( + do_sample=False, temperature=0.9, topp=0.8, topk=1 + ) + enable_peft_finetuning = len(configs.finetuning_dataset) > 0 + llm.compile( + generation_config, + enable_peft_finetuning=enable_peft_finetuning, + max_requests_per_batch=1 if not enable_peft_finetuning else 2, + max_seq_length=256, + max_tokens_per_batch=128, + ) + + llm.start_server() + + requests = [] + # Serving + if len(configs.prompt) > 0: + prompts = [s for s in json.load(open(configs.prompt))] + inference_requests = [ + ff.Request( + ff.RequestType.REQ_INFERENCE, + prompt=prompt, + max_sequence_length=128, + peft_model_id=llm.get_ff_peft_id(lora_inference_config), + ) + for prompt in prompts + ] + requests += inference_requests + # Finetuning + if len(configs.finetuning_dataset) > 0: + finetuning_request = ff.Request( + ff.RequestType.REQ_FINETUNING, + max_sequence_length=128, + peft_model_id=llm.get_ff_peft_id(lora_finetuning_config), + dataset_filepath=configs.finetuning_dataset, + max_training_steps=2, + ) + requests.append(finetuning_request) + + results = llm.generate(requests) + + llm.stop_server() + + +if __name__ == "__main__": + print("flexflow PEFT example") + main() diff --git a/inference/python/incr_decoding.py b/inference/python/incr_decoding.py index 05599ea6b9..f888982f2c 100644 --- a/inference/python/incr_decoding.py +++ b/inference/python/incr_decoding.py @@ -51,9 +51,12 @@ def get_configs(): "tensor_parallelism_degree": 1, "pipeline_parallelism_degree": 2, "offload": False, - "offload_reserve_space_size": 1024**2, + "offload_reserve_space_size": 8 * 1024, # 8GB "use_4bit_quantization": False, "use_8bit_quantization": False, + "enable_peft": False, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "benchmarking": False, "inference_debugging": False, diff --git a/inference/python/peft_demo/INSTRUCTIONS.md b/inference/python/peft_demo/INSTRUCTIONS.md new file mode 100644 index 0000000000..9b2a7a53b2 --- /dev/null +++ 
b/inference/python/peft_demo/INSTRUCTIONS.md @@ -0,0 +1,25 @@ +## Peft Demo +* `git clone -b peft --recursive https://github.com/flexflow/FlexFlow.git` +* `cd FlexFlow/` + +* If you wish to run the demo by installing FlexFlow + * `conda env create -f conda/flexflow.yml` + * `conda activate flexflow` + +* If you wish to run the demo using a Docker container + * `export FF_CUDA_ARCH=all && export cuda_version=12.0 && ./docker/build.sh flexflow && ./docker/run.sh flexflow` + +* Then, install the Llama2 model (the `meta-llama/Llama-2-7b-hf` model is gated, so make sure to add your HF access token) + + * `export HUGGINGFACE_TOKEN="[Your token]"` + * `huggingface-cli login --token "$HUGGINGFACE_TOKEN"` + * `python3 inference/utils/download_peft_model.py "goliaro/llama-2-7b-lora-full" --base_model_name "meta-llama/Llama-2-7b-hf"` + +* Run the demo + ``` + mkdir inference/output + cd inference/python/peft_demo/ + python3 demo.py -config-file demo_config.json + ``` + + diff --git a/inference/python/peft_demo/demo.ipynb b/inference/python/peft_demo/demo.ipynb new file mode 100644 index 0000000000..dfb5193a1d --- /dev/null +++ b/inference/python/peft_demo/demo.ipynb @@ -0,0 +1,1907 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# FlexFlow Co-Serving Demo\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "import json, random, subprocess, os\n", + "from datasets import load_dataset\n", + "from types import SimpleNamespace\n", + "from huggingface_hub import HfFolder\n", + "import flexflow.serve as ff\n", + "import matplotlib.pyplot as plt\n", + "\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def create_datasets(finetune_dataset_size=2, inference_file_path='inference_dataset.json', finetuning_file_path='finetuning_dataset.json'):\n", + " \"\"\"Creates the inference and finetuning datasets according to the data from https://huggingface.co/datasets/databricks/databricks-dolly-15k.\n", + " Only the 'open_qa' and 'closed_qa' prompts without context are kept.\n", + " The datasets are saved into the files given as arguments.\n", + "\n", + " Keyword arguments:\n", + " dataset_size -- the number of prompts to consider\n", + " inference_file_path -- the file in which to save the inference data\n", + " finetuning_file_path -- the file in which to save the finetuning data\n", + " \"\"\"\n", + " dataset = load_dataset(\"databricks/databricks-dolly-15k\", split=\"train\")\n", + " inference_data = []\n", + " finetuning_data = []\n", + " for row in dataset:\n", + " if len(finetuning_data) == finetune_dataset_size:\n", + " break\n", + " if (\"open_qa\" in row['category'] or \"closed_qa\" in row['category']) and len(row['context']) == 0:\n", + " inference_data.append(row['instruction'])\n", + " finetuning_data.append(row['instruction'] + \" \" + row['response'])\n", + " with open(inference_file_path, 'w') as file:\n", + " json.dump(inference_data[:1], file)\n", + " with open(finetuning_file_path, 'w') as file:\n", + " json.dump(finetuning_data[:1], file, indent=2, separators=(',', ': '))" + ] + }, + { + 
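
The `create_datasets` helper above writes two small JSON files: a list of raw instructions for inference and a list of instruction-plus-response strings for finetuning. Assuming a single Dolly-style open_qa row, their contents have roughly the following shape (the strings below are made up for illustration):

```python
import json

# Shape of the files written by create_datasets() above (illustrative values).
inference_dataset = ["What is machine learning?"]
finetuning_dataset = ["What is machine learning? Machine learning is ..."]

with open("inference_dataset.json", "w") as f:
    json.dump(inference_dataset, f)
with open("finetuning_dataset.json", "w") as f:
    json.dump(finetuning_dataset, f, indent=2, separators=(",", ": "))
```
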
"cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configuration fields" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "configs_dict = {\n", + " \"num_gpus\": 1,\n", + " \"memory_per_gpu\": 21000,\n", + " \"zero_copy_memory_per_node\": 40000,\n", + " \"num_cpus\": 4,\n", + " \"legion_utility_processors\": 4,\n", + " \"data_parallelism_degree\": 1,\n", + " \"tensor_parallelism_degree\": 1,\n", + " \"pipeline_parallelism_degree\": 1,\n", + " \"offload\": False,\n", + " \"offload_reserve_space_size\": 8 * 1024, # 8GB\n", + " \"use_4bit_quantization\": False,\n", + " \"use_8bit_quantization\": False,\n", + " \"enable_peft\": True,\n", + " \"peft_activation_reserve_space_size\": 1024, # 1GB\n", + " \"peft_weight_reserve_space_size\": 1024, # 1GB\n", + " \"profiling\": False,\n", + " \"inference_debugging\": False,\n", + " \"fusion\": False,\n", + " \"max_requests_per_batch\": 1,\n", + " \"max_sequence_length\": 128,\n", + " \"max_tokens_per_batch\": 128,\n", + " \"max_training_steps\": 100,\n", + " \"seed\": 42,\n", + "}\n", + "model_configs = {\n", + " \"base_model\": \"meta-llama/Meta-Llama-3-8B\",\n", + " \"inference_peft_model_id\": \"goliaro/llama-3-8b-lora\",\n", + " \"finetuning_peft_model_id\": \"goliaro/llama-3-8b-lora\",\n", + " \"cache_path\": os.environ.get(\"FF_CACHE_PATH\", \"\"),\n", + " \"refresh_cache\": False,\n", + " \"full_precision\": False,\n", + " # relative paths\n", + " \"inference_dataset\": \"inference_dataset.json\",\n", + " \"finetuning_dataset\": \"/usr/FlexFlow/inference/prompt/peft_dataset.json\",\n", + " \"output_file\": \"peft_demo.txt\",\n", + "}\n", + "generation_configs = {\n", + " \"do_sample\": False,\n", + " \"temperature\": 0.9,\n", + " \"topp\": 0.8,\n", + " \"topk\": 1,\n", + "}\n", + "finetuning_configs = {\n", + " \"learning_rate\": 0.001,\n", + " \"momentum\": 0.0,\n", + " \"weight_decay\": 0.0,\n", + " \"nesterov\": False,\n", + "}\n", + "# Merge dictionaries\n", + "configs_dict.update(model_configs)\n", + "configs_dict.update(generation_configs)\n", + "configs_dict.update(finetuning_configs)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "random.seed(configs_dict[\"seed\"])\n", + "\n", + "configs = SimpleNamespace(**configs_dict)\n", + "\n", + "create_datasets(inference_file_path=configs_dict[\"inference_dataset\"], \n", + " finetuning_file_path=configs_dict[\"finetuning_dataset\"])\n", + "\n", + "# Clear output file\n", + "with open(configs.output_file, 'w') as file:\n", + " file.write('')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download base and peft inference models" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. 
If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating directory /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b (if it doesn't exist)...\n", + "Saving meta-llama/Meta-Llama-3-8B configs to file /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b/config.json...\n", + "Saving goliaro/llama-3-8b-lora configs to file /root/.cache/flexflow/configs/goliaro/llama-3-8b-lora/config.json...\n", + "Loading tokenizer...\n", + "Creating directory /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b (if it doesn't exist)...\n", + "Saving meta-llama/Meta-Llama-3-8B configs to file /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b/config.json...\n", + "Saving goliaro/llama-3-8b-lora configs to file /root/.cache/flexflow/configs/goliaro/llama-3-8b-lora/config.json...\n", + "Loading tokenizer...\n" + ] + }, + { + "data": { + "text/plain": [ + "CompletedProcess(args=['python', '../../utils/download_peft_model.py', 'goliaro/llama-3-8b-lora', '--base_model_name', 'meta-llama/Meta-Llama-3-8B'], returncode=0)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "args = [configs.inference_peft_model_id, '--base_model_name', configs.base_model]\n", + "subprocess.run(['python', '../../utils/download_peft_model.py'] + args)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize FlexFlow runtime and LLM object" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0 - 7f4d49d21280] 0.672934 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "[0 - 7f4d49d21280] 0.672995 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "[0 - 7f4d49d21280] 0.673107 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "[0 - 7f4d49d21280] 0.673118 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "[0 - 7f4d49d21280] 0.673124 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "/opt/conda/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. 
If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n", + "workSpaceSize (128 MB)\n", + "Creating directory /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b (if it doesn't exist)...\n", + "Saving meta-llama/Meta-Llama-3-8B configs to file /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b/config.json...\n", + "Saving goliaro/llama-3-8b-lora configs to file /root/.cache/flexflow/configs/goliaro/llama-3-8b-lora/config.json...\n", + "Saving goliaro/llama-3-8b-lora configs to file /root/.cache/flexflow/configs/goliaro/llama-3-8b-lora/config.json...\n", + "Loading tokenizer...\n", + "Adding layer layers.0.mlp.down_proj.lora\n", + "Adding layer layers.1.mlp.down_proj.lora\n", + "Adding layer layers.2.mlp.down_proj.lora\n", + "Adding layer layers.3.mlp.down_proj.lora\n", + "Adding layer layers.4.mlp.down_proj.lora\n", + "Adding layer layers.5.mlp.down_proj.lora\n", + "Adding layer layers.6.mlp.down_proj.lora\n", + "Adding layer layers.7.mlp.down_proj.lora\n", + "Adding layer layers.8.mlp.down_proj.lora\n", + "Adding layer layers.9.mlp.down_proj.lora\n", + "Adding layer layers.10.mlp.down_proj.lora\n", + "Adding layer layers.11.mlp.down_proj.lora\n", + "Adding layer layers.12.mlp.down_proj.lora\n", + "Adding layer layers.13.mlp.down_proj.lora\n", + "Adding layer layers.14.mlp.down_proj.lora\n", + "Adding layer layers.15.mlp.down_proj.lora\n", + "Adding layer layers.16.mlp.down_proj.lora\n", + "Adding layer layers.17.mlp.down_proj.lora\n", + "Adding layer layers.18.mlp.down_proj.lora\n", + "Adding layer layers.19.mlp.down_proj.lora\n", + "Adding layer layers.20.mlp.down_proj.lora\n", + "Adding layer layers.21.mlp.down_proj.lora\n", + "Adding layer layers.22.mlp.down_proj.lora\n", + "Adding layer layers.23.mlp.down_proj.lora\n", + "Adding layer layers.24.mlp.down_proj.lora\n", + "Adding layer layers.25.mlp.down_proj.lora\n", + "Adding layer layers.26.mlp.down_proj.lora\n", + "Adding layer layers.27.mlp.down_proj.lora\n", + "Adding layer layers.28.mlp.down_proj.lora\n", + "Adding layer layers.29.mlp.down_proj.lora\n", + "Adding layer layers.30.mlp.down_proj.lora\n", + "Adding layer layers.31.mlp.down_proj.lora\n" + ] + } + ], + "source": [ + "# Initialize the FlexFlow runtime. 
ff.init() takes a dictionary or the path to a JSON file with the configs\n", + "ff.init(configs_dict)\n", + "\n", + "# Create the FlexFlow LLM\n", + "ff_data_type = (\n", + " ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF\n", + ")\n", + "llm = ff.LLM(\n", + " configs.base_model,\n", + " data_type=ff_data_type,\n", + " cache_path=configs.cache_path,\n", + " refresh_cache=configs.refresh_cache,\n", + " output_file=configs.output_file,\n", + ")\n", + "# Add inference and/or finetuning lora\n", + "lora_inference_config = None\n", + "lora_finetuning_config = None\n", + "if len(configs.inference_dataset) > 0:\n", + " lora_inference_config = ff.LoraLinearConfig(\n", + " llm.cache_path, \n", + " configs.inference_peft_model_id,\n", + " base_model_name_or_path=configs.base_model\n", + " )\n", + " llm.add_peft(lora_inference_config)\n", + "if len(configs.finetuning_dataset) > 0:\n", + " lora_finetuning_config = ff.LoraLinearConfig(\n", + " llm.cache_path,\n", + " configs.finetuning_peft_model_id,\n", + " trainable=True,\n", + " init_lora_weights=False,\n", + " rank=16,\n", + " lora_alpha=16.0,\n", + " # target_modules = [\"down_proj\"],\n", + " base_model_name_or_path=configs.base_model,\n", + " optimizer_type=ff.OptimizerType.OPTIMIZER_TYPE_SGD,\n", + " optimizer_kwargs={\n", + " \"learning_rate\": configs.learning_rate,\n", + " \"momentum\": configs.momentum,\n", + " \"weight_decay\": configs.weight_decay,\n", + " \"nesterov\": configs.nesterov,\n", + " },\n", + " )\n", + " llm.add_peft(lora_finetuning_config)\n", + "\n", + "# Compile the LLM for inference and load the weights into memory\n", + "generation_config = ff.GenerationConfig(\n", + " do_sample=configs.do_sample,\n", + " temperature=configs.temperature,\n", + " topp=configs.topp,\n", + " topk=configs.topk\n", + ")\n", + "enable_peft_finetuning = len(configs.finetuning_dataset) > 0\n", + "llm.compile(\n", + " generation_config,\n", + " enable_peft_finetuning=enable_peft_finetuning,\n", + " max_requests_per_batch=configs.max_requests_per_batch+int(enable_peft_finetuning),\n", + " max_seq_length=configs.max_sequence_length,\n", + " max_tokens_per_batch=configs.max_tokens_per_batch,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Start the LLM Co-serving system" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Background server started.\n", + "2024-07-22 06:45:43 - ###PEFT DEBUGGING### Starting background serving task.\n", + "2024-07-22 06:45:43 - ###PEFT DEBUGGING### Updated models' configuration.\n", + "###PEFT DEBUGGING### LLM Model object exists.\n", + "###PEFT DEBUGGING### Model object exists.\n", + "###PEFT DEBUGGING### Model object still exists.\n", + "###PEFT DEBUGGING### Entering compile_inference.\n", + "###PEFT DEBUGGING### Configuration check passed: At least four CPU cores per node.\n" + ] + } + ], + "source": [ + "llm.start_server()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Generate inference" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "###PEFT DEBUGGING### Launching graph optimization task.\n", + "[]\n", + "num_nodes = 1 num_gpus_per_node = 1\n", + "[0]10445\n", + "[1]649\n", + "[2]6730\n", + "[3]2053\n", + "[4]18167\n", + "[5]369\n", + "[6]1317\n", + "[7]2085\n", + "[8]3090\n", + "[9]30\n", + "No small 
speculative model registered, using incremental decoding.\n", + "[0 - 7f4d49d21280] 1.600215 {3}{RequestManager}: [1000000]New request tokens: 128000 10445 649 6730 2053 18167 369 1317 2085 3090 30\n", + "optimal_views.size = 262\n", + "views.size() = 262\n", + "###PEFT DEBUGGING### Operators reconstructed from optimized graph.\n", + "###PEFT DEBUGGING### Starting inplace optimizations.\n", + "###PEFT DEBUGGING### Mapping output tensors.\n", + "ndim(1) dims[1 0 0 0]\n", + "###PEFT DEBUGGING### Setting up NCCL communications.\n", + "###PEFT DEBUGGING### compile_inference completed successfully.\n", + "Loading weight file embed_tokens.weight\n", + "Loading weight file layers.0.input_layernorm.weight\n", + "Loading weight file layers.0.self_attn.q_proj.weight\n", + "Loading weight file layers.0.self_attn.k_proj.weight\n", + "Loading weight file layers.0.self_attn.v_proj.weight\n", + "Loading weight file layers.0.self_attn.o_proj.weight\n", + "Loading weight file layers.0.post_attention_layernorm.weight\n", + "Loading weight file layers.0.mlp.gate_proj.weight\n", + "Loading weight file layers.0.mlp.up_proj.weight\n", + "Loading weight file layers.0.mlp.down_proj.weight\n", + "Loading weight file layers.1.input_layernorm.weight\n", + "Loading weight file layers.1.self_attn.q_proj.weight\n", + "Loading weight file layers.1.self_attn.k_proj.weight\n", + "Loading weight file layers.1.self_attn.v_proj.weight\n", + "Loading weight file layers.1.self_attn.o_proj.weight\n", + "Loading weight file layers.1.post_attention_layernorm.weight\n", + "Loading weight file layers.1.mlp.gate_proj.weight\n", + "Loading weight file layers.1.mlp.up_proj.weight\n", + "Loading weight file layers.1.mlp.down_proj.weight\n", + "Loading weight file layers.2.input_layernorm.weight\n", + "Loading weight file layers.2.self_attn.q_proj.weight\n", + "Loading weight file layers.2.self_attn.k_proj.weight\n", + "Loading weight file layers.2.self_attn.v_proj.weight\n", + "Loading weight file layers.2.self_attn.o_proj.weight\n", + "Loading weight file layers.2.post_attention_layernorm.weight\n", + "Loading weight file layers.2.mlp.gate_proj.weight\n", + "Loading weight file layers.2.mlp.up_proj.weight\n", + "Loading weight file layers.2.mlp.down_proj.weight\n", + "Loading weight file layers.3.input_layernorm.weight\n", + "Loading weight file layers.3.self_attn.q_proj.weight\n", + "Loading weight file layers.3.self_attn.k_proj.weight\n", + "Loading weight file layers.3.self_attn.v_proj.weight\n", + "Loading weight file layers.3.self_attn.o_proj.weight\n", + "Loading weight file layers.3.post_attention_layernorm.weight\n", + "Loading weight file layers.3.mlp.gate_proj.weight\n", + "Loading weight file layers.3.mlp.up_proj.weight\n", + "Loading weight file layers.3.mlp.down_proj.weight\n", + "Loading weight file layers.4.input_layernorm.weight\n", + "Loading weight file layers.4.self_attn.q_proj.weight\n", + "Loading weight file layers.4.self_attn.k_proj.weight\n", + "Loading weight file layers.4.self_attn.v_proj.weight\n", + "Loading weight file layers.4.self_attn.o_proj.weight\n", + "Loading weight file layers.4.post_attention_layernorm.weight\n", + "Loading weight file layers.4.mlp.gate_proj.weight\n", + "Loading weight file layers.4.mlp.up_proj.weight\n", + "Loading weight file layers.4.mlp.down_proj.weight\n", + "Loading weight file layers.5.input_layernorm.weight\n", + "Loading weight file layers.5.self_attn.q_proj.weight\n", + "Loading weight file layers.5.self_attn.k_proj.weight\n", + "Loading weight file 
layers.5.self_attn.v_proj.weight\n", + "Loading weight file layers.5.self_attn.o_proj.weight\n", + "Loading weight file layers.5.post_attention_layernorm.weight\n", + "Loading weight file layers.5.mlp.gate_proj.weight\n", + "Loading weight file layers.5.mlp.up_proj.weight\n", + "Loading weight file layers.5.mlp.down_proj.weight\n", + "Loading weight file layers.6.input_layernorm.weight\n", + "Loading weight file layers.6.self_attn.q_proj.weight\n", + "Loading weight file layers.6.self_attn.k_proj.weight\n", + "Loading weight file layers.6.self_attn.v_proj.weight\n", + "Loading weight file layers.6.self_attn.o_proj.weight\n", + "Loading weight file layers.6.post_attention_layernorm.weight\n", + "Loading weight file layers.6.mlp.gate_proj.weight\n", + "Loading weight file layers.6.mlp.up_proj.weight\n", + "Loading weight file layers.6.mlp.down_proj.weight\n", + "Loading weight file layers.7.input_layernorm.weight\n", + "Loading weight file layers.7.self_attn.q_proj.weight\n", + "Loading weight file layers.7.self_attn.k_proj.weight\n", + "Loading weight file layers.7.self_attn.v_proj.weight\n", + "Loading weight file layers.7.self_attn.o_proj.weight\n", + "Loading weight file layers.7.post_attention_layernorm.weight\n", + "Loading weight file layers.7.mlp.gate_proj.weight\n", + "Loading weight file layers.7.mlp.up_proj.weight\n", + "Loading weight file layers.7.mlp.down_proj.weight\n", + "Loading weight file layers.8.input_layernorm.weight\n", + "Loading weight file layers.8.self_attn.q_proj.weight\n", + "Loading weight file layers.8.self_attn.k_proj.weight\n", + "Loading weight file layers.8.self_attn.v_proj.weight\n", + "Loading weight file layers.8.self_attn.o_proj.weight\n", + "Loading weight file layers.8.post_attention_layernorm.weight\n", + "Loading weight file layers.8.mlp.gate_proj.weight\n", + "Loading weight file layers.8.mlp.up_proj.weight\n", + "Loading weight file layers.8.mlp.down_proj.weight\n", + "Loading weight file layers.9.input_layernorm.weight\n", + "Loading weight file layers.9.self_attn.q_proj.weight\n", + "Loading weight file layers.9.self_attn.k_proj.weight\n", + "Loading weight file layers.9.self_attn.v_proj.weight\n", + "Loading weight file layers.9.self_attn.o_proj.weight\n", + "Loading weight file layers.9.post_attention_layernorm.weight\n", + "Loading weight file layers.9.mlp.gate_proj.weight\n", + "Loading weight file layers.9.mlp.up_proj.weight\n", + "Loading weight file layers.9.mlp.down_proj.weight\n", + "Loading weight file layers.10.input_layernorm.weight\n", + "Loading weight file layers.10.self_attn.q_proj.weight\n", + "Loading weight file layers.10.self_attn.k_proj.weight\n", + "Loading weight file layers.10.self_attn.v_proj.weight\n", + "Loading weight file layers.10.self_attn.o_proj.weight\n", + "Loading weight file layers.10.post_attention_layernorm.weight\n", + "Loading weight file layers.10.mlp.gate_proj.weight\n", + "Loading weight file layers.10.mlp.up_proj.weight\n", + "Loading weight file layers.10.mlp.down_proj.weight\n", + "Loading weight file layers.11.input_layernorm.weight\n", + "Loading weight file layers.11.self_attn.q_proj.weight\n", + "Loading weight file layers.11.self_attn.k_proj.weight\n", + "Loading weight file layers.11.self_attn.v_proj.weight\n", + "Loading weight file layers.11.self_attn.o_proj.weight\n", + "Loading weight file layers.11.post_attention_layernorm.weight\n", + "Loading weight file layers.11.mlp.gate_proj.weight\n", + "Loading weight file layers.11.mlp.up_proj.weight\n", + "Loading weight file 
layers.11.mlp.down_proj.weight\n", + "Loading weight file layers.12.input_layernorm.weight\n", + "Loading weight file layers.12.self_attn.q_proj.weight\n", + "Loading weight file layers.12.self_attn.k_proj.weight\n", + "Loading weight file layers.12.self_attn.v_proj.weight\n", + "Loading weight file layers.12.self_attn.o_proj.weight\n", + "Loading weight file layers.12.post_attention_layernorm.weight\n", + "Loading weight file layers.12.mlp.gate_proj.weight\n", + "Loading weight file layers.12.mlp.up_proj.weight\n", + "Loading weight file layers.12.mlp.down_proj.weight\n", + "Loading weight file layers.13.input_layernorm.weight\n", + "Loading weight file layers.13.self_attn.q_proj.weight\n", + "Loading weight file layers.13.self_attn.k_proj.weight\n", + "Loading weight file layers.13.self_attn.v_proj.weight\n", + "Loading weight file layers.13.self_attn.o_proj.weight\n", + "Loading weight file layers.13.post_attention_layernorm.weight\n", + "Loading weight file layers.13.mlp.gate_proj.weight\n", + "Loading weight file layers.13.mlp.up_proj.weight\n", + "Loading weight file layers.13.mlp.down_proj.weight\n", + "Loading weight file layers.14.input_layernorm.weight\n", + "Loading weight file layers.14.self_attn.q_proj.weight\n", + "Loading weight file layers.14.self_attn.k_proj.weight\n", + "Loading weight file layers.14.self_attn.v_proj.weight\n", + "Loading weight file layers.14.self_attn.o_proj.weight\n", + "Loading weight file layers.14.post_attention_layernorm.weight\n", + "Loading weight file layers.14.mlp.gate_proj.weight\n", + "Loading weight file layers.14.mlp.up_proj.weight\n", + "Loading weight file layers.14.mlp.down_proj.weight\n", + "Loading weight file layers.15.input_layernorm.weight\n", + "Loading weight file layers.15.self_attn.q_proj.weight\n", + "Loading weight file layers.15.self_attn.k_proj.weight\n", + "Loading weight file layers.15.self_attn.v_proj.weight\n", + "Loading weight file layers.15.self_attn.o_proj.weight\n", + "Loading weight file layers.15.post_attention_layernorm.weight\n", + "Loading weight file layers.15.mlp.gate_proj.weight\n", + "Loading weight file layers.15.mlp.up_proj.weight\n", + "Loading weight file layers.15.mlp.down_proj.weight\n", + "Loading weight file layers.16.input_layernorm.weight\n", + "Loading weight file layers.16.self_attn.q_proj.weight\n", + "Loading weight file layers.16.self_attn.k_proj.weight\n", + "Loading weight file layers.16.self_attn.v_proj.weight\n", + "Loading weight file layers.16.self_attn.o_proj.weight\n", + "Loading weight file layers.16.post_attention_layernorm.weight\n", + "Loading weight file layers.16.mlp.gate_proj.weight\n", + "Loading weight file layers.16.mlp.up_proj.weight\n", + "Loading weight file layers.16.mlp.down_proj.weight\n", + "Loading weight file layers.17.input_layernorm.weight\n", + "Loading weight file layers.17.self_attn.q_proj.weight\n", + "Loading weight file layers.17.self_attn.k_proj.weight\n", + "Loading weight file layers.17.self_attn.v_proj.weight\n", + "Loading weight file layers.17.self_attn.o_proj.weight\n", + "Loading weight file layers.17.post_attention_layernorm.weight\n", + "Loading weight file layers.17.mlp.gate_proj.weight\n", + "Loading weight file layers.17.mlp.up_proj.weight\n", + "Loading weight file layers.17.mlp.down_proj.weight\n", + "Loading weight file layers.18.input_layernorm.weight\n", + "Loading weight file layers.18.self_attn.q_proj.weight\n", + "Loading weight file layers.18.self_attn.k_proj.weight\n", + "Loading weight file layers.18.self_attn.v_proj.weight\n", + 
"Loading weight file layers.18.self_attn.o_proj.weight\n", + "Loading weight file layers.18.post_attention_layernorm.weight\n", + "Loading weight file layers.18.mlp.gate_proj.weight\n", + "Loading weight file layers.18.mlp.up_proj.weight\n", + "Loading weight file layers.18.mlp.down_proj.weight\n", + "Loading weight file layers.19.input_layernorm.weight\n", + "Loading weight file layers.19.self_attn.q_proj.weight\n", + "Loading weight file layers.19.self_attn.k_proj.weight\n", + "Loading weight file layers.19.self_attn.v_proj.weight\n", + "Loading weight file layers.19.self_attn.o_proj.weight\n", + "Loading weight file layers.19.post_attention_layernorm.weight\n", + "Loading weight file layers.19.mlp.gate_proj.weight\n", + "Loading weight file layers.19.mlp.up_proj.weight\n", + "Loading weight file layers.19.mlp.down_proj.weight\n", + "Loading weight file layers.20.input_layernorm.weight\n", + "Loading weight file layers.20.self_attn.q_proj.weight\n", + "Loading weight file layers.20.self_attn.k_proj.weight\n", + "Loading weight file layers.20.self_attn.v_proj.weight\n", + "Loading weight file layers.20.self_attn.o_proj.weight\n", + "Loading weight file layers.20.post_attention_layernorm.weight\n", + "Loading weight file layers.20.mlp.gate_proj.weight\n", + "Loading weight file layers.20.mlp.up_proj.weight\n", + "Loading weight file layers.20.mlp.down_proj.weight\n", + "Loading weight file layers.21.input_layernorm.weight\n", + "Loading weight file layers.21.self_attn.q_proj.weight\n", + "Loading weight file layers.21.self_attn.k_proj.weight\n", + "Loading weight file layers.21.self_attn.v_proj.weight\n", + "Loading weight file layers.21.self_attn.o_proj.weight\n", + "Loading weight file layers.21.post_attention_layernorm.weight\n", + "Loading weight file layers.21.mlp.gate_proj.weight\n", + "Loading weight file layers.21.mlp.up_proj.weight\n", + "Loading weight file layers.21.mlp.down_proj.weight\n", + "Loading weight file layers.22.input_layernorm.weight\n", + "Loading weight file layers.22.self_attn.q_proj.weight\n", + "Loading weight file layers.22.self_attn.k_proj.weight\n", + "Loading weight file layers.22.self_attn.v_proj.weight\n", + "Loading weight file layers.22.self_attn.o_proj.weight\n", + "Loading weight file layers.22.post_attention_layernorm.weight\n", + "Loading weight file layers.22.mlp.gate_proj.weight\n", + "Loading weight file layers.22.mlp.up_proj.weight\n", + "Loading weight file layers.22.mlp.down_proj.weight\n", + "Loading weight file layers.23.input_layernorm.weight\n", + "Loading weight file layers.23.self_attn.q_proj.weight\n", + "Loading weight file layers.23.self_attn.k_proj.weight\n", + "Loading weight file layers.23.self_attn.v_proj.weight\n", + "Loading weight file layers.23.self_attn.o_proj.weight\n", + "Loading weight file layers.23.post_attention_layernorm.weight\n", + "Loading weight file layers.23.mlp.gate_proj.weight\n", + "Loading weight file layers.23.mlp.up_proj.weight\n", + "Loading weight file layers.23.mlp.down_proj.weight\n", + "Loading weight file layers.24.input_layernorm.weight\n", + "Loading weight file layers.24.self_attn.q_proj.weight\n", + "Loading weight file layers.24.self_attn.k_proj.weight\n", + "Loading weight file layers.24.self_attn.v_proj.weight\n", + "Loading weight file layers.24.self_attn.o_proj.weight\n", + "Loading weight file layers.24.post_attention_layernorm.weight\n", + "Loading weight file layers.24.mlp.gate_proj.weight\n", + "Loading weight file layers.24.mlp.up_proj.weight\n", + "Loading weight file 
layers.24.mlp.down_proj.weight\n", + "Loading weight file layers.25.input_layernorm.weight\n", + "Loading weight file layers.25.self_attn.q_proj.weight\n", + "Loading weight file layers.25.self_attn.k_proj.weight\n", + "Loading weight file layers.25.self_attn.v_proj.weight\n", + "Loading weight file layers.25.self_attn.o_proj.weight\n", + "Loading weight file layers.25.post_attention_layernorm.weight\n", + "Loading weight file layers.25.mlp.gate_proj.weight\n", + "Loading weight file layers.25.mlp.up_proj.weight\n", + "Loading weight file layers.25.mlp.down_proj.weight\n", + "Loading weight file layers.26.input_layernorm.weight\n", + "Loading weight file layers.26.self_attn.q_proj.weight\n", + "Loading weight file layers.26.self_attn.k_proj.weight\n", + "Loading weight file layers.26.self_attn.v_proj.weight\n", + "Loading weight file layers.26.self_attn.o_proj.weight\n", + "Loading weight file layers.26.post_attention_layernorm.weight\n", + "Loading weight file layers.26.mlp.gate_proj.weight\n", + "Loading weight file layers.26.mlp.up_proj.weight\n", + "Loading weight file layers.26.mlp.down_proj.weight\n", + "Loading weight file layers.27.input_layernorm.weight\n", + "Loading weight file layers.27.self_attn.q_proj.weight\n", + "Loading weight file layers.27.self_attn.k_proj.weight\n", + "Loading weight file layers.27.self_attn.v_proj.weight\n", + "Loading weight file layers.27.self_attn.o_proj.weight\n", + "Loading weight file layers.27.post_attention_layernorm.weight\n", + "Loading weight file layers.27.mlp.gate_proj.weight\n", + "Loading weight file layers.27.mlp.up_proj.weight\n", + "Loading weight file layers.27.mlp.down_proj.weight\n", + "Loading weight file layers.28.input_layernorm.weight\n", + "Loading weight file layers.28.self_attn.q_proj.weight\n", + "Loading weight file layers.28.self_attn.k_proj.weight\n", + "Loading weight file layers.28.self_attn.v_proj.weight\n", + "Loading weight file layers.28.self_attn.o_proj.weight\n", + "Loading weight file layers.28.post_attention_layernorm.weight\n", + "Loading weight file layers.28.mlp.gate_proj.weight\n", + "Loading weight file layers.28.mlp.up_proj.weight\n", + "Loading weight file layers.28.mlp.down_proj.weight\n", + "Loading weight file layers.29.input_layernorm.weight\n", + "Loading weight file layers.29.self_attn.q_proj.weight\n", + "Loading weight file layers.29.self_attn.k_proj.weight\n", + "Loading weight file layers.29.self_attn.v_proj.weight\n", + "Loading weight file layers.29.self_attn.o_proj.weight\n", + "Loading weight file layers.29.post_attention_layernorm.weight\n", + "Loading weight file layers.29.mlp.gate_proj.weight\n", + "Loading weight file layers.29.mlp.up_proj.weight\n", + "Loading weight file layers.29.mlp.down_proj.weight\n", + "Loading weight file layers.30.input_layernorm.weight\n", + "Loading weight file layers.30.self_attn.q_proj.weight\n", + "Loading weight file layers.30.self_attn.k_proj.weight\n", + "Loading weight file layers.30.self_attn.v_proj.weight\n", + "Loading weight file layers.30.self_attn.o_proj.weight\n", + "Loading weight file layers.30.post_attention_layernorm.weight\n", + "Loading weight file layers.30.mlp.gate_proj.weight\n", + "Loading weight file layers.30.mlp.up_proj.weight\n", + "Loading weight file layers.30.mlp.down_proj.weight\n", + "Loading weight file layers.31.input_layernorm.weight\n", + "Loading weight file layers.31.self_attn.q_proj.weight\n", + "Loading weight file layers.31.self_attn.k_proj.weight\n", + "Loading weight file layers.31.self_attn.v_proj.weight\n", + 
"Loading weight file layers.31.self_attn.o_proj.weight\n", + "Loading weight file layers.31.post_attention_layernorm.weight\n", + "Loading weight file layers.31.mlp.gate_proj.weight\n", + "Loading weight file layers.31.mlp.up_proj.weight\n", + "Loading weight file layers.31.mlp.down_proj.weight\n", + "Loading weight file norm.weight\n", + "Loading weight file lm_head.weight\n", + "Loading LORA weight layers.0.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.0.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.0.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.0.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.1.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.1.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.1.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.1.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.2.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.2.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.2.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.2.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.3.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.3.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.3.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.3.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.4.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.4.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.4.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.4.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.5.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.5.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.5.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.5.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.6.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.6.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, 
shard_id: 0\n", + "Loading LORA weight layers.6.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.6.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.7.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.7.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.7.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.7.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.8.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.8.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.8.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.8.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.9.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.9.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.9.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.9.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.10.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.10.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.10.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.10.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.11.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.11.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.11.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.11.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.12.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.12.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.12.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.12.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.13.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.13.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.13.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, 
num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.13.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.14.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.14.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.14.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.14.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.15.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.15.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.15.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.15.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.16.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.16.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.16.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.16.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.17.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.17.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.17.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.17.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.18.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.18.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.18.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.18.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.19.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.19.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.19.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.19.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.20.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.20.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.20.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.20.mlp.down_proj.lora_B.weight, 
num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.21.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.21.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.21.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.21.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.22.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.22.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.22.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.22.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.23.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.23.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.23.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.23.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.24.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.24.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.24.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.24.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.25.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.25.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.25.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.25.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.26.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.26.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.26.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.26.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.27.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.27.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.27.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.27.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight 
layers.28.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.28.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.28.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.28.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.29.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.29.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.29.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.29.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.30.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.30.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.30.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.30.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.31.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.31.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.31.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.31.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "[0 - 7f4ce019c740] 24.015346 {3}{RequestManager}: Output token is: 3639\n", + "[0 - 7f4ce0178740] 24.062661 {3}{RequestManager}: Output token is: 374\n", + "[0 - 7f4ce0190740] 24.128376 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0184740] 24.199797 {3}{RequestManager}: Output token is: 2944\n", + "[0 - 7f4ce0178740] 24.255941 {3}{RequestManager}: Output token is: 4920\n", + "[0 - 7f4ce0178740] 24.306545 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0178740] 24.357210 {3}{RequestManager}: Output token is: 2144\n", + "[0 - 7f4ce0190740] 24.407958 {3}{RequestManager}: Output token is: 430\n", + "[0 - 7f4ce0178740] 24.459366 {3}{RequestManager}: Output token is: 6730\n", + "[0 - 7f4ce0178740] 24.510618 {3}{RequestManager}: Output token is: 2053\n", + "[0 - 7f4ce0178740] 24.560416 {3}{RequestManager}: Output token is: 649\n", + "[0 - 7f4ce0178740] 24.611335 {3}{RequestManager}: Output token is: 18167\n", + "[0 - 7f4ce0178740] 24.663808 {3}{RequestManager}: Output token is: 369\n", + "[0 - 7f4ce0178740] 24.710965 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7f4ce0178740] 24.756020 {3}{RequestManager}: Output token is: 2085\n", + "[0 - 7f4ce0178740] 24.805719 {3}{RequestManager}: Output token is: 3090\n", + "[0 - 7f4ce0178740] 24.858560 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7f4ce0184740] 24.910607 {3}{RequestManager}: Output token is: 3639\n", + "[0 - 7f4ce0178740] 24.958879 {3}{RequestManager}: Output token is: 374\n", + "[0 - 7f4ce0184740] 25.002851 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0178740] 
25.050780 {3}{RequestManager}: Output token is: 2944\n", + "[0 - 7f4ce0178740] 25.104554 {3}{RequestManager}: Output token is: 4920\n", + "[0 - 7f4ce0184740] 25.159509 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0178740] 25.211003 {3}{RequestManager}: Output token is: 2144\n", + "[0 - 7f4ce0184740] 25.261411 {3}{RequestManager}: Output token is: 430\n", + "[0 - 7f4ce0190740] 25.312357 {3}{RequestManager}: Output token is: 6730\n", + "[0 - 7f4ce0184740] 25.362253 {3}{RequestManager}: Output token is: 2053\n", + "[0 - 7f4ce0184740] 25.412284 {3}{RequestManager}: Output token is: 649\n", + "[0 - 7f4ce0184740] 25.461502 {3}{RequestManager}: Output token is: 18167\n", + "[0 - 7f4ce0184740] 25.513610 {3}{RequestManager}: Output token is: 369\n", + "[0 - 7f4ce0184740] 25.564433 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7f4ce0184740] 25.613662 {3}{RequestManager}: Output token is: 2085\n", + "[0 - 7f4ce0184740] 25.663786 {3}{RequestManager}: Output token is: 3090\n", + "[0 - 7f4ce0184740] 25.712708 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7f4ce0184740] 25.762206 {3}{RequestManager}: Output token is: 3639\n", + "[0 - 7f4ce0184740] 25.812755 {3}{RequestManager}: Output token is: 374\n", + "[0 - 7f4ce0184740] 25.863367 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0184740] 25.913378 {3}{RequestManager}: Output token is: 2944\n", + "[0 - 7f4ce0184740] 25.965063 {3}{RequestManager}: Output token is: 4920\n", + "[0 - 7f4ce0178740] 26.015739 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0178740] 26.065768 {3}{RequestManager}: Output token is: 2144\n", + "[0 - 7f4ce0178740] 26.115556 {3}{RequestManager}: Output token is: 430\n", + "[0 - 7f4ce0184740] 26.166644 {3}{RequestManager}: Output token is: 6730\n", + "[0 - 7f4ce0184740] 26.218528 {3}{RequestManager}: Output token is: 2053\n", + "[0 - 7f4ce0178740] 26.269681 {3}{RequestManager}: Output token is: 649\n", + "[0 - 7f4ce0178740] 26.320250 {3}{RequestManager}: Output token is: 18167\n", + "[0 - 7f4ce0178740] 26.371698 {3}{RequestManager}: Output token is: 369\n", + "[0 - 7f4ce0184740] 26.422587 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7f4ce0178740] 26.474391 {3}{RequestManager}: Output token is: 2085\n", + "[0 - 7f4ce0178740] 26.524817 {3}{RequestManager}: Output token is: 3090\n", + "[0 - 7f4ce0190740] 26.575224 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7f4ce0178740] 26.627207 {3}{RequestManager}: Output token is: 3639\n", + "[0 - 7f4ce0190740] 26.679366 {3}{RequestManager}: Output token is: 374\n", + "[0 - 7f4ce0178740] 26.729921 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0178740] 26.779766 {3}{RequestManager}: Output token is: 2944\n", + "[0 - 7f4ce0178740] 26.832104 {3}{RequestManager}: Output token is: 4920\n", + "[0 - 7f4ce0184740] 26.884087 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0178740] 26.935580 {3}{RequestManager}: Output token is: 2144\n", + "[0 - 7f4ce0184740] 26.992909 {3}{RequestManager}: Output token is: 430\n", + "[0 - 7f4ce0184740] 27.043722 {3}{RequestManager}: Output token is: 6730\n", + "[0 - 7f4ce0184740] 27.093960 {3}{RequestManager}: Output token is: 2053\n", + "[0 - 7f4ce0178740] 27.144937 {3}{RequestManager}: Output token is: 649\n", + "[0 - 7f4ce0190740] 27.196991 {3}{RequestManager}: Output token is: 18167\n", + "[0 - 7f4ce0178740] 27.248143 {3}{RequestManager}: Output token is: 369\n", + "[0 - 7f4ce0190740] 27.299549 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7f4ce0190740] 27.351395 
{3}{RequestManager}: Output token is: 2085\n", + "[0 - 7f4ce0178740] 27.402975 {3}{RequestManager}: Output token is: 3090\n", + "[0 - 7f4ce0190740] 27.453662 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7f4ce0178740] 27.504152 {3}{RequestManager}: Output token is: 3639\n", + "[0 - 7f4ce0178740] 27.554072 {3}{RequestManager}: Output token is: 374\n", + "[0 - 7f4ce0184740] 27.605613 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0178740] 27.656807 {3}{RequestManager}: Output token is: 2944\n", + "[0 - 7f4ce0190740] 27.707595 {3}{RequestManager}: Output token is: 4920\n", + "[0 - 7f4ce0190740] 27.757815 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0190740] 27.809557 {3}{RequestManager}: Output token is: 2144\n", + "[0 - 7f4ce0184740] 27.862148 {3}{RequestManager}: Output token is: 430\n", + "[0 - 7f4ce0190740] 27.914188 {3}{RequestManager}: Output token is: 6730\n", + "[0 - 7f4ce0178740] 27.965942 {3}{RequestManager}: Output token is: 2053\n", + "[0 - 7f4ce0184740] 28.017837 {3}{RequestManager}: Output token is: 649\n", + "[0 - 7f4ce0184740] 28.069997 {3}{RequestManager}: Output token is: 18167\n", + "[0 - 7f4ce0184740] 28.122560 {3}{RequestManager}: Output token is: 369\n", + "[0 - 7f4ce0190740] 28.172513 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7f4ce0190740] 28.224002 {3}{RequestManager}: Output token is: 2085\n", + "[0 - 7f4ce0184740] 28.276536 {3}{RequestManager}: Output token is: 3090\n", + "[0 - 7f4ce0184740] 28.327091 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7f4ce0184740] 28.377124 {3}{RequestManager}: Output token is: 3639\n", + "[0 - 7f4ce0190740] 28.427226 {3}{RequestManager}: Output token is: 374\n", + "[0 - 7f4ce0190740] 28.477499 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0184740] 28.528489 {3}{RequestManager}: Output token is: 2944\n", + "[0 - 7f4ce0178740] 28.580135 {3}{RequestManager}: Output token is: 4920\n", + "[0 - 7f4ce0190740] 28.631761 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0190740] 28.683392 {3}{RequestManager}: Output token is: 2144\n", + "[0 - 7f4ce0184740] 28.734001 {3}{RequestManager}: Output token is: 430\n", + "[0 - 7f4ce0190740] 28.783914 {3}{RequestManager}: Output token is: 6730\n", + "[0 - 7f4ce0190740] 28.835832 {3}{RequestManager}: Output token is: 2053\n", + "[0 - 7f4ce0184740] 28.885271 {3}{RequestManager}: Output token is: 649\n", + "[0 - 7f4ce0190740] 28.936179 {3}{RequestManager}: Output token is: 18167\n", + "[0 - 7f4ce0190740] 28.987163 {3}{RequestManager}: Output token is: 369\n", + "[0 - 7f4ce0184740] 29.038264 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7f4ce0184740] 29.084248 {3}{RequestManager}: Output token is: 2085\n", + "[0 - 7f4ce0184740] 29.129864 {3}{RequestManager}: Output token is: 3090\n", + "[0 - 7f4ce0184740] 29.175946 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7f4ce0184740] 29.226707 {3}{RequestManager}: Output token is: 3639\n", + "[0 - 7f4ce0184740] 29.277372 {3}{RequestManager}: Output token is: 374\n", + "[0 - 7f4ce0184740] 29.329588 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0190740] 29.380856 {3}{RequestManager}: Output token is: 2944\n", + "[0 - 7f4ce0190740] 29.431483 {3}{RequestManager}: Output token is: 4920\n", + "[0 - 7f4ce0190740] 29.483399 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7f4ce0190740] 29.536268 {3}{RequestManager}: Output token is: 2144\n", + "[0 - 7f4ce0190740] 29.588317 {3}{RequestManager}: Output token is: 430\n", + "[0 - 7f4ce0184740] 29.638727 
{3}{RequestManager}: Output token is: 6730\n", + "[0 - 7f4ce0190740] 29.689708 {3}{RequestManager}: Output token is: 2053\n", + "[0 - 7f4ce0190740] 29.740987 {3}{RequestManager}: Output token is: 649\n", + "[0 - 7f4ce0178740] 29.791166 {3}{RequestManager}: Output token is: 18167\n", + "[0 - 7f4ce0190740] 29.841776 {3}{RequestManager}: Output token is: 369\n", + "[0 - 7f4ce0184740] 29.893514 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7f4ce0178740] 29.945509 {3}{RequestManager}: Output token is: 2085\n", + "[0 - 7f4ce0178740] 29.945878 {3}{RequestManager}: [Done] guid(1000000) final_length(128)\n", + "[0 - 7f4ce0178740] 29.945889 {3}{RequestManager}: Final output: <|begin_of_text|>Why can camels survive for long without water? What is the reason behind the fact that camels can survive for long without water? What is the reason behind the fact that camels can survive for long without water? What is the reason behind the fact that camels can survive for long without water? What is the reason behind the fact that camels can survive for long without water? What is the reason behind the fact that camels can survive for long without water? What is the reason behind the fact that camels can survive for long without water? What is the reason behind the fact that camels can survive for long without\n", + "[0 - 7f4ce0178740] 29.945900 {3}{RequestManager}: [Profile] guid(1000000) llm_decoding_steps(117) start(23696232.0) finish(29945893.0) latency(6249661.0) ttft(22415078.0)\n" + ] + } + ], + "source": [ + "prompts = [s for s in json.load(open(configs.inference_dataset))]\n", + "inference_requests = [\n", + " ff.Request(\n", + " ff.RequestType.REQ_INFERENCE,\n", + " prompt=prompt,\n", + " max_sequence_length=configs.max_sequence_length,\n", + " peft_model_id=llm.get_ff_peft_id(lora_inference_config),\n", + " )\n", + " for prompt in prompts\n", + "]\n", + "inf_req_res_1 = llm.generate(inference_requests)\n", + "with open(\"before_finetuning.txt\", \"w\") as file:\n", + " file.write(str(inf_req_res_1[0].output_text))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Perform Finetuning on dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[]\n", + "No small speculative model registered, using incremental decoding.\n", + "[0 - 7f4d49d21280] 29.957050 {3}{RequestManager}: [0] input: 128000 10445 649 6730 2053 18167 369 1317 2085 3090 30 8215 2053 1005 279 8834 304 872 305 12055 311 2567 1124 10409 449 4907 323 88000 369 1317 18852 315 892 13\n", + "[0 - 7f4d49d21280] 29.957061 {3}{RequestManager}: [0] output:\n", + "Loss: 2.6536\n", + "Loss: 2.5942\n", + "Loss: 2.5360\n", + "Loss: 2.5083\n", + "Loss: 2.4783\n", + "Loss: 2.4570\n", + "Loss: 2.4420\n", + "Loss: 2.4194\n", + "Loss: 2.4050\n", + "Loss: 2.3949\n", + "Loss: 2.3841\n", + "Loss: 2.3764\n", + "Loss: 2.3676\n", + "Loss: 2.3535\n", + "Loss: 2.3396\n", + "Loss: 2.3299\n", + "Loss: 2.3287\n", + "Loss: 2.3215\n", + "Loss: 2.3058\n", + "Loss: 2.2978\n", + "Loss: 2.2885\n", + "Loss: 2.2852\n", + "Loss: 2.2660\n", + "Loss: 2.2619\n", + "Loss: 2.2594\n", + "Loss: 2.2479\n", + "Loss: 2.2379\n", + "Loss: 2.2243\n", + "Loss: 2.2245\n", + "Loss: 2.2057\n", + "Loss: 2.2035\n", + "Loss: 2.1891\n", + "Loss: 2.1817\n", + "Loss: 2.1703\n", + "Loss: 2.1592\n", + "Loss: 2.1548\n", + "Loss: 2.1383\n", + "Loss: 2.1321\n", + "Loss: 2.1179\n", + "Loss: 2.1138\n", + "Loss: 2.1062\n", + "Loss: 2.0934\n", + "Loss: 
2.0856\n", + "Loss: 2.0758\n", + "Loss: 2.0656\n", + "Loss: 2.0532\n", + "Loss: 2.0497\n", + "Loss: 2.0410\n", + "Loss: 2.0258\n", + "Loss: 2.0161\n", + "Loss: 2.0047\n", + "Loss: 1.9940\n", + "Loss: 1.9820\n", + "Loss: 1.9737\n", + "Loss: 1.9614\n", + "Loss: 1.9486\n", + "Loss: 1.9378\n", + "Loss: 1.9281\n", + "Loss: 1.9174\n", + "Loss: 1.9047\n", + "Loss: 1.8922\n", + "Loss: 1.8798\n", + "Loss: 1.8674\n", + "Loss: 1.8574\n", + "Loss: 1.8485\n", + "Loss: 1.8301\n", + "Loss: 1.8213\n", + "Loss: 1.8091\n", + "Loss: 1.8007\n", + "Loss: 1.7850\n", + "Loss: 1.7784\n", + "Loss: 1.7606\n", + "Loss: 1.7496\n", + "Loss: 1.7320\n", + "Loss: 1.7216\n", + "Loss: 1.7067\n", + "Loss: 1.6954\n", + "Loss: 1.6781\n", + "Loss: 1.6667\n", + "Loss: 1.6551\n", + "Loss: 1.6425\n", + "Loss: 1.6272\n", + "Loss: 1.6096\n", + "Loss: 1.6030\n", + "Loss: 1.5824\n", + "Loss: 1.5724\n", + "Loss: 1.5558\n", + "Loss: 1.5399\n", + "Loss: 1.5266\n", + "Loss: 1.5109\n", + "Loss: 1.4952\n", + "Loss: 1.4829\n", + "Loss: 1.4648\n", + "Loss: 1.4496\n", + "Loss: 1.4360\n", + "Loss: 1.4154\n", + "Loss: 1.4010\n", + "Loss: 1.3958\n", + "Loss: 1.3719\n", + "Loss: 1.3562\n", + "[0 - 7f4ce0190740] 38.933268 {3}{RequestManager}: [Finetuning] guid(1000001) completed_training_steps(100) processed_finetuning_tokens(3400) latency(38933176.0)\n" + ] + } + ], + "source": [ + "finetuning_request = ff.Request(\n", + " ff.RequestType.REQ_FINETUNING,\n", + " max_sequence_length=configs.max_sequence_length,\n", + " peft_model_id=llm.get_ff_peft_id(lora_finetuning_config),\n", + " dataset_filepath=os.path.join(os.getcwd(), configs.finetuning_dataset),\n", + " max_training_steps=configs.max_training_steps,\n", + ")\n", + "ft_res = llm.generate([finetuning_request])" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAA04AAAIjCAYAAAA0vUuxAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/TGe4hAAAACXBIWXMAAA9hAAAPYQGoP6dpAABm/UlEQVR4nO3de1yUdfr/8fcICKKioqgIJKaVHe1gBw94KA9ZmYpKiqVW+3VLLcnd2tq21O1gWdtWW1m2pZ3QjDTL7UQlHlK3rNztaG1KKmIeERVFGu7fH/dvBoaZYQ4MzAzzej4ePMa5577v+TB+UC8/13V9LIZhGAIAAAAAuNUk2AMAAAAAgFBH4AQAAAAAHhA4AQAAAIAHBE4AAAAA4AGBEwAAAAB4QOAEAAAAAB4QOAEAAACABwROAAAAAOABgRMAAAAAeEDgBACNUEFBgSwWiwoKCoI9lIi3aNEiWSwWbdq0KdhD8cpPP/2kIUOGqFWrVrJYLHrrrbeCPSS/FBYWymKx6NFHHw32UAA0EgROABq1cPhH6znnnKOTTjpJhmG4PadPnz7q0KGDfvvttwYcWfiYPXu2LBaLOnTooLKyMqfX09PTddVVVwVhZOFn0qRJ+vrrr/XAAw/olVdeUc+ePV2eZwtM3H099NBDDTxyAKhf0cEeAABEugkTJujOO+/U2rVr1a9fP6fXCwsLtWHDBk2fPl3R0fyxXZs9e/Zo/vz5+sMf/hDsoYSlY8eOacOGDbr77rs1ffp0r64ZP368rrjiCqfj5513XqCHBwBBxd/AABBk2dnZuuuuu5Sbm+sycFq8eLEMw9CECROCMLrwcu655+qRRx7R1KlT1axZs2APp0EdPXpUzZs3r9M99u7dK0lq3bq119ecf/75uvbaa+v0vgAQDkjVAwBJX331lYYNG6aEhAS1aNFCl112mTZu3OhwTkVFhebMmaNTTjlFcXFxatu2rfr27av8/Hz7Obt379b111+v1NRUxcbGKjk5WSNGjFBhYaHb905LS1O/fv2Ul5eniooKp9dzc3PVtWtXXXzxxfrll180depUnXbaaWrWrJnatm2rsWPH1np/m/T0dE2ePNnp+IABAzRgwACHY+Xl5Zo1a5a6deum2NhYpaWl6Y477lB5eXmt7zF9+nS1aNHCZbrc+PHj1bFjR1mtVknSpk2bNHToULVr107NmjVTly5ddMMNN3j8Pmpz77336tdff9X8+fNrPc9dDZgt/WzRokX2Y5MnT1aLFi20fft2XXXVVWrRooVSUlL09NNPS5K+/vprXXrppWrevLk6d+6s3Nxcl+9ZVlam3//+92rbtq0SEhI0ceJEHTx40Om89957TxkZGWrevLlatmypK6+8Ut9++63DObYx/fzzz7riiivUsmVLj4G1pzk+e/Zsde7cWZJ0++23y2KxKD09vdZ7esuWKvnhhx/q3HPPVVxcnM444wwtW7bM6dytW7dq7NixSkxMVHx8vC655BL961//cjrv+PHjmj17tk499VTFxcUpOTlZmZmZ+vnnn53OXbBggbp27arY2FhdeOGF+vzzzx1e9+fnFkDkYcUJQMT79ttvlZGRoYSEBN1xxx2KiYnRc889pwEDBmj16tW6+OKLJZn/sJw7d65+97vf6aKLLlJpaak2bdqkL7/8UoMHD5YkjR49Wt9++61uueUWpaena8+ePcrPz9f27dtr/UfohAkTNGXKFH3wwQcOtThff/21vvnmG917772SpM8//1zr16/XuHHjlJqaqsLCQs2fP18DBgzQd999p/j4+Dp/HpWVlbr66qu1bt06TZkyRaeffrq+/vpr/f3vf9ePP/5Ya7OAa665Rk8//bT+9a9/aezYsfbjZWVleueddzR58mRFRUVpz549GjJkiJKSknTnnXeqdevWKiwsdPkPaV9kZGTo0ksv1bx583TzzTcHbNXJarVq2LBh6tevn+bNm6fXXntN06dPV/PmzXX33XdrwoQJyszM1LPPPquJEyeqV69e6tKli8M9pk+frtatW2v27NnasmWL5s+fr19++cUexEnSK6+8okmTJmno0KF6+OGHVVZWpvnz56tv37766quvHObQb7/9pqFDh6pv37569NFHa/2992aOZ2ZmqnXr1rrtttvs6XctWrTw+NmUlZVp3759Tsdbt27tkFr6008/6ZprrtFNN92kSZMmaeHChRo7dqzef/99+8/Pr7/+qt69e6usrEy33nqr2rZtq5deeklXX3218vLyNGrUKPvvx1VXXaWPP/5Y48aN04wZM3T48GHl5+frm2++UdeuXe3vm5ubq8OHD+v3v/+9LBaL5s2bp8zMTG3dulUxMTGS/P+5BRBhDABoxBYuXGhIMj7//HO354wcOdJo2rSp8fPPP9uP7dq1y2jZsqXRr18/+7EePXoYV155pdv7HDx40JBkPPLIIz6P88CBA0ZsbKwxfvx4h+N33nmnIcnYsmWLYRiGUVZW5nTthg0bDEnGyy+/bD+2atUqQ5KxatUq+7HOnTsbkyZNcrq+f//+Rv/+/e3PX3nlFaNJkybG2rVrHc579tlnDUnGp59+6vb7qKysNFJSUozRo0c7HF+6dKkhyVizZo1hGIaxfPlyj78vvpg1a5Yhydi7d6+xevVqQ5Lx2GOP2V/v3Lmzw++dq8/HMAxj27ZthiRj4cKF9mOTJk0yJBkPPvig/djBgweNZs2aGRaLxViyZIn9+A8//GBIMmbNmmU/ZpuDF1xwgXHixAn78Xnz5hmSjBUrVhiGYRiHDx82Wrdubfzf//2fw5h2795ttGrVyuG4bUx33nmnV5+Pt3Pc9v17M4dt57r72rBhg/3czp07G5KMN998037s0KFDRnJysnHeeefZj+Xk5BiSHObe4cOHjS5duhjp6emG1Wo1DMMwXnzxRaffY5vKykqH8bVt29Y4cOCA/fUVK1YYkox33nnHMIy6/dwCiCyk6gGIaFarVR9++KFGjhypk08+2X48OTlZ2dnZWrdunUpLSyWZ/4P+7bff6qeffnJ5r2bNmqlp06YqKChwmYJVmzZt2uiKK67Q22+/raNHj0qSDMPQkiVL1LNnT5166qn297CpqKjQ/v371a1bN7Vu3VpffvmlT+/pzhtvvKHTTz9d3bt31759++xfl156qSRp1apVbq+1WCwaO3as3n33XR05csR+/PXXX1dKSor69u0rqaqGZuXKlS7TE+uiX79+GjhwoObNm6djx44F7L6/+93v7L9u3bq1TjvtNDVv3lxZWVn246eddppat26trVu3Ol0/ZcoU+wqHJN18882Kjo7Wu+++K0nKz89XSUmJxo8f7/C5R0VF6eKLL3b5ud98880ex+3LHPfHlClTlJ+f7/R1xhlnOJzXqVMn+4qRJHu64ldffaXdu3dLkt59911ddNFF9nkiSS1atNCUKVNUWFio7777TpL05ptvql27drrlllucxmNbvbO55ppr1KZNG/vzjIwMSbL/HtXl5xZAZCFwAhDR9u7dq7KyMp122mlOr51++umqrKzUjh07JEl//etfVVJSol
NPPVVnn322br/9dv33v/+1nx8bG6uHH35Y7733njp06GBP67L9o9CTCRMm6OjRo1qxYoUkaf369SosLHSoXTl27JjuvfdepaWlKTY2Vu3atVNSUpJKSkp06NChunwUdj/99JO+/fZbJSUlOXzZgrc9e/bUev0111yjY8eO6e2335YkHTlyRO+++67Gjh1r/0dt//79NXr0aM2ZM0ft2rXTiBEjtHDhQo81VN6aPXu2du/erWeffTYg94uLi1NSUpLDsVatWik1NdXpH+qtWrVy+Q/wU045xeF5ixYtlJycbK+jsQXkl156qdNn/+GHHzp97tHR0UpNTfU4dl/muD9OOeUUDRo0yOkrISHB4bxu3bo5fVa2OWX7DH755Re347S9Lkk///yzTjvtNK+6TJ500kkOz21BlO33qK4/twAiB4ETAHipX79++vnnn/Xiiy/qrLPO0j//+U+df/75+uc//2k/JycnRz/++KPmzp2ruLg43XPPPTr99NP11Vdfebz/VVddpVatWtmbC+Tm5ioqKkrjxo2zn3PLLbfogQceUFZWlpYuXaoPP/xQ+fn5atu2rSorK2u9f81/tNrYmjXYVFZW6uyzz3a5ipCfn6+pU6fW+j6XXHKJ0tPTtXTpUknSO++8o2PHjumaa65xGEteXp69zXpRUZFuuOEGXXDBBQ4rVf7q16+fBgwY4HbVydvPwiYqKsqn40Yte3K5Y/v9e+WVV1x+7raA2iY2NlZNmvDXuCfe/B7V5ecWQOSgOQSAiJaUlKT4+Hht2bLF6bUffvhBTZo0UVpamv1YYmKirr/+el1//fU6cuSI+vXrp9mzZzukcXXt2lV/+MMf9Ic//EE//fSTzj33XP3tb3/Tq6++WutYYmNjNWbMGL388sv69ddf9cYbb+jSSy9Vx44d7efk5eVp0qRJ+tvf/mY/dvz4cZWUlHj8Xtu0aePyvF9++cUhhatr1676z3/+o8suu8xtgOFJVlaWnnjiCZWWlur1119Xenq6LrnkEqfzLrnkEl1yySV64IEHlJubqwkTJmjJkiUOn6e/Zs+erQEDBui5555zes226lDz87CtaNSHn376SQMHDrQ/P3LkiIqLi+17INkaGrRv316DBg0K2Pv6Osfry//+9z8ZhuEwp3788UdJsjdg6Ny5s9tx2l6XzM/q3//+tyoqKhzSH+vC359bAJGD/6oCENGioqI0ZMgQrVixwqH18K+//qrc3Fz17dvXnnK0f/9+h2tbtGihbt262dPLysrKdPz4cYdzunbtqpYtW3qdgjZhwgRVVFTo97//vfbu3evUYjoqKsppNeMf//iH25WSmmPZuHGjTpw4YT+2cuVKpzStrKwsFRUV6fnnn3e6x7Fjx+w1WLW55pprVF5erpdeeknvv/++Qx2QZKZJ1fw+zj33XEly+Kx+/vlnl+2lvdG/f38NGDBADz/8sNPvS+fOnRUVFaU1a9Y4HH/mmWf8ei9vLFiwwKGea/78+frtt980bNgwSdLQoUOVkJCgBx980GXdl22PJV/5Msfr065du7R8+XL789LSUr388ss699xz7f85cMUVV+izzz7Thg0b7OcdPXpUCxYsUHp6ur1uavTo0dq3b5+eeuopp/fxdbUvED+3ACIDK04AIsKLL76o999/3+n4jBkzdP/99ys/P199+/bV1KlTFR0dreeee07l5eWaN2+e/dwzzjhDAwYM0AUXXKDExERt2rRJeXl5mj59uiTzf88vu+wyZWVl6YwzzlB0dLSWL1+uX3/91SHdrjb9+/dXamqqVqxYoWbNmikzM9Ph9auuukqvvPKKWrVqpTPOOEMbNmzQRx99pLZt23q89+9+9zvl5eXp8ssvV1ZWln7++We9+uqrDq2bJem6667T0qVLddNNN2nVqlXq06ePrFarfvjhBy1dulQffPCBevbsWet7nX/++erWrZvuvvtulZeXO6TpSdJLL72kZ555RqNGjVLXrl11+PBhPf/880pISLCvwEjSZZddJkl+76cza9Ysh1Uem1atWmns2LH6xz/+IYvFoq5du2rlypUe67fq4sSJE/b5sWXLFj3zzDPq27evrr76aklms4T58+fruuuu0/nnn69x48YpKSlJ27dv17/+9S/16dPHZaDgDW/nuD++/PJLl6syXbt2Va9evezPTz31VN144436/PPP1aFDB7344ov69ddftXDhQvs5d955pxYvXqxhw4bp1ltvVWJiol566SVt27ZNb775pj01ceLEiXr55Zc1c+ZMffbZZ8rIyNDRo0f10UcfaerUqRoxYoTX4w/Ezy2ACBHEjn4AUO9sraDdfe3YscMwDMP48ssvjaFDhxotWrQw4uPjjYEDBxrr1693uNf9999vXHTRRUbr1q2NZs2aGd27dzceeOABe4vpffv2GdOmTTO6d+9uNG/e3GjVqpVx8cUXG0uXLvVpzLfffrshycjKynJ67eDBg8b1119vtGvXzmjRooUxdOhQ44cffnBqNe6u3fbf/vY3IyUlxYiNjTX69OljbNq0yakduWEYxokTJ4yHH37YOPPMM43Y2FijTZs2xgUXXGDMmTPHOHTokFffx913321IMrp16+b02pdffmmMHz/eOOmkk4zY2Fijffv2xlVXXWVs2rTJ4bzOnTsbnTt39vhe1duR19S/f39DklMr+b179xqjR4824uPjjTZt2hi///3vjW+++cZlO/LmzZu7vO+ZZ57pdLxm63PbHFy9erUxZcoUo02bNkaLFi2MCRMmGPv373e6ftWqVcbQoUONVq1aGXFxcUbXrl2NyZMnO3w27sZUG2/meCDbkVefj7bP5IMPPjDOOeccIzY21ujevbvxxhtvON33559/NsaMGWO0bt3aiIuLMy666CJj5cqVTueVlZUZd999t9GlSxcjJibG6NixozFmzBh7y/XavhdVaxkfqJ9bAI2fxTD8qGAFAADwUnp6us466yytXLky2EMBAL9R4wQAAAAAHhA4AQAAAIAHBE4AAAAA4AE1TgAAAADgAStOAAAAAOABgRMAAAAAeBBxG+BWVlZq165datmypSwWS7CHAwAAACBIDMPQ4cOH1alTJ/sm2+5EXOC0a9cupaWlBXsYAAAAAELEjh07lJqaWus5ERc4tWzZUpL54SQkJAR5NFJFRYU+/PBDDRkyRDExMcEeDsIE8wb+YN7AX8wd+IN5A3809LwpLS1VWlqaPUaoTcQFTrb0vISEhJAJnOLj45WQkMAfKvAa8wb+YN7AX8wd+IN5A38Ea954U8JDcwgAAAAA8IDACQAAAAA8IHACAAAAAA8InAAAAADAAwInAAAAAPCAwAkAAAAAPCBwAgAAAAAPCJwAAAAAwAMCJwAAAADwgMAJAAAAADwgcAIAAAAADwicAAAAAMADAicAAAAA8IDAKYisVmn1aovWrEnR6tUWWa3BHhEAAAAAVwicgmTZMik9XRo8OFqPPdZTgwdHKz3dPA4AAAAgtBA4BcGyZdKYMdLOnY7Hi4rM4wRPAAAAQGghcGpgVqs0Y4ZkGM6v2Y7l5Ii0P
QAAACCEEDg1sLVrnVeaqjMMaccO8zwAAAAAoYHAqYEVFwf2PAAAAAD1j8CpgSUnB/Y8AAAAAPWPwKmBZWRIqamSxeL6dYtFSkszzwMAAAAQGgicGlhUlPTEE+avawZPtuePP26eBwAAACA0EDgFQWamlJcnpaQ4Hm/Z0jyemRmccQEAAABwjcApSDIzpcJCKT//N11xxVZJUuvW0siRwRwVAAAAAFcInIIoKkrq39/QpEnfKiHB0Pbt0po1wR4VAAAAgJoInEJAbGylxowxd7996aUgDwYAAACAEwKnEHHddZWSzBqno0eDPBgAAAAADgicQkTv3oZOPlk6ckRavjzYowEAAABQHYFTiLBYpIkTzV+TrgcAAACEFgKnEGILnD7+WNqxI7hjAQAAAFCFwCmEdOki9esnGYb02mvBHg0AAAAAm6AGTnPnztWFF16oli1bqn379ho5cqS2bNni8bqSkhJNmzZNycnJio2N1amnnqp33323AUZc/6qn6xlGcMcCAAAAwBTUwGn16tWaNm2aNm7cqPz8fFVUVGjIkCE6WktbuRMnTmjw4MEqLCxUXl6etmzZoueff14pKSkNOPL6M3as1KyZ9MMP0rPPSosXSwUFktUa7JEBAAAAkSs6mG/+/vvvOzxftGiR2rdvry+++EL9+vVzec2LL76oAwcOaP369YqJiZEkpaen1/dQG0xCgtSzp7R2rTR1atXx1FTpiSekzMzgjQ0AAACIVEENnGo6dOiQJCkxMdHtOW+//bZ69eqladOmacWKFUpKSlJ2drb+9Kc/KSoqyun88vJylZeX25+XlpZKkioqKlRRURHg78B3tjHYHpcvt2jt2ihJFofziooMjRkjLVli1ahR5PBFuprzBvAG8wb+Yu7AH8wb+KOh540v72MxjNCopKmsrNTVV1+tkpISrVu3zu153bt3V2FhoSZMmKCpU6fqf//7n6ZOnapbb71Vs2bNcjp/9uzZmjNnjtPx3NxcxcfHB/R7qCurVZoyZYj2749TzcDJZKhdu2N67rl8uYgRAQAAAPigrKxM2dnZOnTokBISEmo9N2QCp5tvvlnvvfee1q1bp9TUVLfnnXrqqTp+/Li2bdtmX2F67LHH9Mgjj6i4uNjpfFcrTmlpadq3b5/HD6chVFRUKD8/X4MHD9b69U01eLDnRcD8/N/Uv39I/LYhSKrPG1vKKuAJ8wb+Yu7AH8wb+KOh501paanatWvnVeAUEql606dP18qVK7VmzZpagyZJSk5OVkxMjENa3umnn67du3frxIkTatq0qcP5sbGxio2NdbpPTExMSP0Qx8TEaO9e73479u6NVggNHUEUavMY4YF5A38xd+AP5g380VDzxpf3CGpXPcMwNH36dC1fvlyffPKJunTp4vGaPn366H//+58qKyvtx3788UclJyc7BU3hJjk5sOcBAAAACIygBk7Tpk3Tq6++qtzcXLVs2VK7d+/W7t27dezYMfs5EydO1F133WV/fvPNN+vAgQOaMWOGfvzxR/3rX//Sgw8+qGnTpgXjWwiojAyze57FVXmTzONpaeZ5AAAAABpOUAOn+fPn69ChQxowYICSk5PtX6+//rr9nO3btzvULqWlpemDDz7Q559/rnPOOUe33nqrZsyYoTvvvDMY30JARUWZLccl98HT44+LxhAAAABAAwtqjZM3fSkKCgqcjvXq1UsbN26shxEFX2amlJcnzZgh7dxZdbxFC+mll9jHCQAAAAiGoK44wbXMTKmwUFq1SrItpEVHS1dcEdRhAQAAABGLwClERUVJAwZI999v1j2VlEhvvx3sUQEAAACRicApxEVFSRMnmr9etCioQwEAAAAiFoFTGJg82Xz84AOpqCioQwEAAAAiEoFTGDjlFKlvX6myUnrllWCPBgAAAIg8BE5h4vrrzceFCyUvmhECAAAACCACpzAxdqwUHy/9+KPUSDuxAwAAACGLwClMtGwpjRlj/nrhwuCOBQAAAIg0BE5hxJaut2SJVFYW3LEAAAAAkSQ62AOA9/r1k7p0kbZtkx54QDrrLCk5WcrIMNuWAwAAAKgfBE5hpEkT6aKLzMDpwQerjqemSk88IWVmBm9sAAAAQGNGql4YWbZMWrrU+XhRkVn/tGxZw48JAAAAiAQETmHCapVmzHDditx2LCfHPA8AAABAYBE4hYm1a6WdO92/bhjSjh3meQAAAAACi8ApTBQXB/Y8AAAAAN4jcAoTycmBPQ8AAACA9wicwkRGhtk9z2Jxf05amnkeAAAAgMAicAoTUVFmy3HJffA0Ywb7OQEAAAD1gcApjGRmSnl5UkqK4/G4OPPxqaekPXukggJp8WLzkS57AAAAQN2xAW6YycyURowwu+cVF5s1TWeeKV1yibR1q3TSSVJ5edX5bI4LAAAA1B0rTmEoKkoaMEAaP958TEqSpk0zX6seNElsjgsAAAAEAoFTI2C1Sn//u+vX2BwXAAAAqDsCp0aAzXEBAACA+kXg1AiwOS4AAABQvwicGgE2xwUAAADqF4FTI8DmuAAAAED9InBqBLzZHPeee9gcFwAAAPAXgVMj4W5z3JgY8/HVV6UTJ9gcFwAAAPAHG+A2Iq42x+3YUbrwQmnNGnO/p9LSqvPZHBcAAADwDitOjUzNzXG7d5duuMF8rXrQJLE5LgAAAOAtAqdGzmp1HxixOS4AAADgHQKnRo7NcQEAAIC6I3Bq5NgcFwAAAKg7AqdGjs1xAQAAgLqjq14jZ9sct6ioqqapptRUqXdvs0W5rRtfRgb7PgEAAAA2BE6NnG1z3DFjzM1xXQVPMTHSySebwZUNrcoBAACAKqTqRQB3m+MmJUnR0dK2bY5Bk0SrcgAAAKA6AqcIkZkpFRZKq1ZJubnm486dUps2rs+nVTkAAABQhVS9CGLbHNemoEDau9f9+dVblVe/DgAAAIg0BE4RzNsW5EVFNI4AAABAZCNwimDetiC/7TbHlSkaRwAAACDSUOMUwWytyi2W2s+rmc5H4wgAAABEGgKnCGZrVS55Dp6qo3EEAAAAIg2BU4SrrVV5bao3jgAAAAAaO2qcoMxMacQIMwiyNYAoKpKuvdbztd42mAAAAADCGYETJLluVe4NbxtMAAAAAOGMVD245E3jCFtrcgAAAKCxI3CCS940jvjtN7POqaBAWrzYfKRZBAAAABojUvXglq1xxIwZ0s6dVcc7dZIqK6Xdu6Vu3RyDJfZ4AgAAQGPEihNqlZkpFRZKq1ZJubnm4/bt0pw55us1V5jY4wkAAACNEStO8Khm4wirVbrvPtfnGoaZ2peTY3bqi4pqiBECAAAA9YsVJ/hs7VrH1L2aqu/xZLVSAwUAAIDwx4oTfObt3k0rVkjXXecYZFEDBQAAgHDEihN85u3eTY8/7rwyRQ0UAAAAwlFQA6e5c+fqwgsvVMuWLdW+fXuNHDlSW7Zs8fr6JUuWyGKxaOTIkfU3SDjxZo8ndwzDfMzJkU6cII0PAAAA4SGogdPq1as1bdo0bdy4Ufn5+aqoqNCQIUN09OhRj9cWFhbqj3/8ozLYgbXB1bbH
kzfBlK0GKjVVGjhQys42H9PTWYkCAABAaApq4PT+++9r8uTJOvPMM9WjRw8tWrRI27dv1xdffFHrdVarVRMmTNCcOXN08sknN9BoUZ1tj6eUFMfjqanmapI39u51fE4aHwAAAEJVSDWHOHTokCQpMTGx1vP++te/qn379rrxxhu1du3aWs8tLy9XeXm5/XlpaakkqaKiQhUVFXUccd3ZxhAKY/HV8OHSFVdI69ZZVFxs1j717Wto3TqLHn/c96lltjI3NGOGdMUVv9HKvBbhPG8QPMwb+Iu5A38wb+CPhp43vryPxTBsVSfBVVlZqauvvlolJSVat26d2/PWrVuncePGafPmzWrXrp0mT56skpISvfXWWy7Pnz17tubYdmutJjc3V/Hx8YEaPqqxWqUpU4Zo//44SX4UQkmaM2edmjSRDh6MU5s2x3XGGfsJpAAAABBQZWVlys7O1qFDh5SQkFDruSETON1888167733tG7dOqWmpro85/DhwzrnnHP0zDPPaNiwYZLkMXByteKUlpamffv2efxwGkJFRYXy8/M1ePBgxcTEBHs4AbN8uUXjxpmRjmFUD54MeRNMJSYaOnCg6ryUFEOPPWbVqFEhMV2DrrHOG9Qv5g38xdyBP5g38EdDz5vS0lK1a9fOq8ApJFL1pk+frpUrV2rNmjVugyZJ+vnnn1VYWKjhw4fbj1VWVkqSoqOjtWXLFnXt2tXhmtjYWMXGxjrdKyYmJqR+iENtPHWVlSVFR0szZji2JE9KsjjVNrlSPWiSpF27LBo3Llp5edKIEebmurb0wIwMRexqVGObN2gYzBv4i7kDfzBv4I+Gmje+vEdQAyfDMHTLLbdo+fLlKigoUJcuXWo9v3v37vr6668djv3lL3/R4cOH9cQTTygtLa0+hwsfZWY6Bzm9e0tdu5qNIHxZ6zTrn6QpU5yDMTbVBQAAQH0LauA0bdo05ebmasWKFWrZsqV2794tSWrVqpWaNWsmSZo4caJSUlI0d+5cxcXF6ayzznK4R+vWrSXJ6ThCQ1SUNGCA47EnnjC751ksvgdP+/c7H7d148vLI3gCAABA/QhqO/L58+fr0KFDGjBggJKTk+1fr7/+uv2c7du3q7i4OIijRKC5a2XuoZmiW2yqCwAAgPoW9FQ9TwoKCmp9fdGiRYEZDBqUqzQ+q1UaNMi/+1XfVLd6DRVpfAAAAAiEkGgOgchUM43PajUDHV/rn6pzt6kuaXwAAACoi6Cm6gHVRUWZq0OSWf8UCNXT+EjbAwAAgL8InBBS3NU/paZKbdv6F1DZ0vjWrg3MGAEAABB5SNVDyHFV/5SRIa1Y4V83PpuiIrNhBPs/AQAAwFcETghJrtqY21ajnDfVda5tcuW222gcAQAAAP+QqoewkpkpFRZKq1ZJubnm486dZhDkKY3PXeOIZcvqbbgAAABoJAicEHZsq1Hjx5uPTZv611SCxhEAAADwFoETGgV3TSWSkmq/ztY4oqCAjXMBAADgHjVOaDRcNZUoKpKuvdbztVlZ0oEDVc+pfwIAAEB1BE5oVGo2lSgo8O666kGTxMa5AAAAcESqHhq1jAzvGkfUVL3+6cQJ0vgAAAAiHStOaNSiosyUO3/2f7LVP6Wmum9jbrU67zfF3lAAAACNDytOaPTcNY5ITPTuendtzO+4Q0pPlwYOlLKzzcf0dNqbAwAANEasOCEiuGocYbVKgwb5fi/bqtUjjzi/VrM2ihUpAACAxoHACRGjZuMIq9VMuysq8i2FrzaGYaYE5uRIlZXSbbeZG/Ta0K0PAAAgPJGqh4hlq3+SfG8eURtbbdTYsY5Bk1S1IkU6HwAAQHghcEJE83fjXH9V79ZHdz4AAIDwQeCEiJeZKRUWSqtWSbm55uPOnf61MfeGbUVq7drA3xsAAAD1gxonQM71T5L/bcy9VVRk7gtF4wgAAIDQx4oT4Ia7NL60NOn2282AquaKlC8rVLfdRitzAACAcEHgBNTCVRrftm3SvHmug6rUVGnpUu/S/NztD0XwBAAAEHpI1QM8cJXGJ7neG8qWbhcV5XuaX/VW5lddJa1fTxofAABAqCBwAuqgtqAqL0+aMcOxJXlSkvNKU3W2xhGpqY7nsf8TAABAcJGqB9QTV2l+f/+7d9eSxgcAABBaWHEC6lHNFamCAv/uUzONb80ai9asSVHz5hYNHEgaHwAAQH0jcAIaUEaGmXZXVOR7i3PHNL5oST312GOOaXxWq+uaKwAAANQNqXpAA4qKMoMcyf/Ndd2l8d1xh9nSnBbnAAAAgUfgBDQwd/tDJSX5dz/DML8eecSxEYVEbRQAAECgEDgBQeCqccTOnd7t/+QLWzpgTo6ZxgcAAAD/UOMEBImrVuZPPOH7/k+e2Gqj1q513TodAAAAnrHiBISQQKfxVVdcXPd7AAAARCpWnIAQk5kpjRjh2B2vd2+pa1f/uvHZtG9vtkOn4x4AAIDvCJyAEBToNL64OGnSJDPwsqnexhwAAAC1I1UPCBPu0vjS0qTbbzcDKneNJY4fdwyaJMeOe1aruRq1eLH5WL2RRG2vAQAARApWnIAwYkvjW7XqN7333mYNG3auBg6MVlSUdMkl0owZji3JU1OlkhLpyBHnexmGGWhNmeL6Ott+U+5eY6UKAABEEgInIMxERUn9+xs6erRI/fv3sNcpuaqNslqlQYPc38swpP37nY8XFUmjR7u+xrZSlZdH8AQAACIHgRPQiNSsjVq82L/71FZDZVupyskxAzUaTAAAgEhAjRPQiCUn1899q+8NRQ0UAACIBAROQCOWkWHWJLlrGlFXK1ZI6enSwIFSdrb5mJ5uNpwAAABoTAicgEYsKqqqyUN9BE+PP+7YOEJy7NYHAADQWBA4AY2cuzbmqalS27aBD6hs9VE5OdKJE6TxAQCAxoHmEEAEcNVxLyPDTLVztalu9ee1veaOrQYqNVXau7fqOK3MAQBAuGLFCYgQto5748ebj1FRta9Gvfmm+eXqtZwc796zetAkkcYHAADCFytOQIRztxplazPu6rW1a836Jl/RyhwAAIQrAicATvs/eXrN1q2vqMhz2l5N1VuZu3tPAACAUEPgBMBntm59ruqjvFVUZDaMcLXKZbW6XwEDAAAIBgInAH6x1UfNmOHYkjwpybm2yZXbbnPdOEJyvidNJQAAQLAROAHwm6v6qN69pa5dPafxuWocMXq063NtTSXy8gieAABAcNBVD0Cd1OzW17Spf5vu1hZkVd8bir2gAABAMBA4AQg4d23Ok5L8v2f1phIAAAANjVQ9APXCVRpfUZF07bV1u29xcWDGBwAA4AsCJwD1pmYr84KCut+zfXu68QEAgIZH4ASgwdRl/ydJio6WrrvOcdWJbnwAAKAhUOMEoMHY9n+SnBtHVH/urqnEb785p+rZuvGNHu0YNNleGzNGWrasbuMGAAAIauA0d+5cXXjhhWrZsqXat2+vkSNHasuWLbVe8/zzzysjI0Nt2rRRmzZtNGjQIH322WcNNGIAdeWucURqqvTmm+aXq9dat3Z9P7rxAQCAhhDUwGn16tW
aNm2aNm7cqPz8fFVUVGjIkCE6evSo22sKCgo0fvx4rVq1Shs2bFBaWpqGDBmioqKiBhw5gLrIzJQKC6VVq6TcXPNx2zbzuKvXFi2SSkr8ey+68QEAgEAIao3T+++/7/B80aJFat++vb744gv169fP5TWvvfaaw/N//vOfevPNN/Xxxx9r4sSJ9TZWAIFVs3FEba8tXlz39ysqct9UAgAAwJOQag5x6NAhSVJiYqLX15SVlamiosLtNeXl5SovL7c/Ly0tlSRVVFSooqKiDqMNDNsYQmEsCB+RNm+Skiyq6x9XOTmG9u2rKp5KSTH02GNWjRrlR5eKMBVp8waBw9yBP5g38EdDzxtf3sdiGP70tgq8yspKXX311SopKdG6deu8vm7q1Kn64IMP9O233youLs7p9dmzZ2vOnDlOx3NzcxUfH1+nMQNoGFarNGXKEO3fHyfJVecI2x9jbrpKuHzdPPanP32uiy4q1nfftdXBg3Fq0+a4zjhjP6tRAABEgLKyMmVnZ+vQoUNKSEio9dyQCZxuvvlmvffee1q3bp1SU1O9uuahhx7SvHnzVFBQoHPOOcflOa5WnNLS0rRv3z6PH05DqKioUH5+vgYPHqyYmJhgDwdhIhLnzfLlFo0bZ0YzhlEVAFkshr0JhMXi+JqngMpiMZSYKMXFSUVFjX81KhLnDQKDuQN/MG/gj4aeN6WlpWrXrp1XgVNIpOpNnz5dK1eu1Jo1a7wOmh599FE99NBD+uijj9wGTZIUGxur2NhYp+MxMTEh9UMcauNBeIikeZOVZe7j5LxXk0WPP27+uuZrSUkW7d3r/p6GYdH+/c7Hd+2yaNy4aOXlNc49oCJp3iCwmDvwB/MG/mioeePLewQ1cDIMQ7fccouWL1+ugoICdenSxavr5s2bpwceeEAffPCBevbsWc+jBBAqMjOlESPMDnmumjzUfK2oSLr2Wt/fxzDM1aucHPOepO0BAICgBk7Tpk1Tbm6uVqxYoZYtW2r37t2SpFatWqlZs2aSpIkTJyolJUVz586VJD388MO69957lZubq/T0dPs1LVq0UIsWLYLzjQBoML504yso8P99bG3MCwrM+9KNDwCAyBbUfZzmz5+vQ4cOacCAAUpOTrZ/vf766/Zztm/fruLiYodrTpw4oTFjxjhc8+ijjwbjWwAQwjIyzM1zLe56RnghK0saOFDKzjYf09OlZcsCNkQAABAmgp6q50lBjf8yLiwsrJ/BAGh0oqKkJ56QxoyxNY7w/R4HDjg+Lyoy79dY658AAIBrQV1xAoD6lplpBjkpKY7HU1Oltm19X42yBV85OWabdAAAEBkInAA0epmZUmGhtGqVlJtrPhYWSgsWmK/7Ezzt2GE2orBazTqoxYvNR4IpAAAap5BoRw4A9c1VUwnbalTNNuaJic4peq6sWCFdd13N9uhmemBmphlEuesACAAAwguBE4CI5qrFudUqDRrk+Vrb/lHV2Wqg/vhHcxXKXVAFAADCC4ETgIhXczXKajWDnKIi3xtK2M5/5BHn12gsAQBA+KLGCQBqsHXjk5zrn+rS2pzGEgAAhC8CJwBwobZufDk5/t+3emMJAAAQPgicAMANV934tm0za6Lqqtq+3gAAIAxQ4wQAtXDVjS8jw/8aKJv27c325XTcAwAgPBA4AYCPbDVQY8aYNU++Bk9xcdKkSWbgZUPHPQAAQhupegDgB3c1UGlp0u23mwGVu0YSx487Bk1SVce9ZcvYVBcAgFDEihMA+MnVHlC2lLtLLnHeWDc1VSopkY4ccb6XYZiB1pQprq9jNQoAgOAicAKAOnBVAyX5t7GuYUj79zsfZ/8nAACCj8AJAOpJzaBq8WL/7mNbjcrJMYMxmkgAANDwCJwAoIEkJ/t/rW3/p4ICM3CiGx8AAA2LwAkAGkgg2phnZUkHDlQ9r17/ZLW6rrcCAAB1R1c9AGggtjbmkvuOe55UD5qkqvqnO+6Q0tOlgQOl7GzzMT3d7NIHAADqjsAJABqQuzbmqalS27a+B1SGYX498ohjJz7JscU5AACoGwInAGhgmZlSYaG0apWUm2s+FhZKCxaYr/u7GlWTLR0wJ0c6cUJavdqiNWtStHq1hb2hAADwETVOABAErtqY21ajau7jlJjonKLnLVtTidRUae/eaEk99dhj7A0FAICvWHECgBDiajVq6dK633fvXsfnpPEBAOCbOq04HT9+XHFxcYEaCwBAzqtRVmvdu/HVxN5QAAD4xucVp8rKSt13331KSUlRixYttHXrVknSPffcoxdeeCHgAwSASBeIbnyuVN8bqqDA3KC3oEDUPwEA4ILPgdP999+vRYsWad68eWratKn9+FlnnaV//vOfAR0cAMDkrhtfWpp0++1mQOVvUJWVRRtzAAA88Tlwevnll7VgwQJNmDBBUdVyO3r06KEffvghoIMDAFRxVf+0bZs0b57roCopybv7utsbiuAJAIAqPtc4FRUVqVu3bk7HKysrVVFREZBBAQBcc9WNTzKDqhEjpLVrpeJiKTlZ6t1b6trV99qomvVPkuN9MzKoiQIARB6fA6czzjhDa9euVefOnR2O5+Xl6bzzzgvYwAAAvnEVVD3xhLl6ZLH4Hjzt2CE98ID0/POO7dFpZQ4AiEQ+B0733nuvJk2apKKiIlVWVmrZsmXasmWLXn75Za1cubI+xggA8FNd94aaNcv5mC2VLy+P4AkAEDl8rnEaMWKE3nnnHX300Udq3ry57r33Xn3//fd65513NHjw4PoYIwCgDmy1Ufn5v2nmzE3Kz/+tTntD2VaucnKkEyfoyAcAiAx+7eOUkZGh/Pz8QI8FAFBPoqKk/v0NHT1apP79e6hJk7rtDWVL5UtNddxclzQ+AEBj5fOKEwAg/NW2N5Qvbc2rB00SHfkAAI2Xz4FTkyZNFBUV5fYLABAe3O0NlZoqzZnj3z1J4wMANFY+p+otX77c4XlFRYW++uorvfTSS5rj79+0AICgcNXGPCPDfO355/1L5SONDwDQGPkcOI2wbepRzZgxY3TmmWfq9ddf14033hiQgQEAGoa7vaH8bWVu4y6Nj258AIBwFLAap0suuUQff/xxoG4HAAgyd6l8SUn+3a96Gh9pewCAcONXV72ajh07pieffFIpNf92BQCENVepfL17S1271i2Nr6DAXOmqnh5ImSwAIJT5HDi1adNGlmotlwzD0OHDhxUfH69XX301oIMDAASfq1S+uqbxZWU5bsBbvf7JanWuuSKoAgAEm8+B09///neHwKlJkyZKSkrSxRdfrDZt2gR0cACA0GRL45sxQ9q5s+p4UpJzbZMr1YMmqar+6Y9/NLvwVb8nTSUAAKHA58Bp8uTJ9TAMAEC4CWQan+3cRx5xfo2mEgCAUOBV4PTf//7X6xuec845fg8GABBe6iONrybDMO+VkyNddZW0fj1pfACAhudV4HTuuefKYrHI8PA3oMVikZVWSQAQ0dyl8SUmOqfoeYu9oQAAweZV4LRt27b6HgcAoBFxlcZntUqDBtXtvuwNBQAIFq8Cp86dO9f3OAAAjUzNND
6r1Vwh8qeNuTvV0/hGjCBtDwBQf/zex+m7777T9u3bdeLECYfjV199dZ0HBQBofKKiAl//JFWl8a1d61xvBQBAoPgcOG3dulWjRo3S119/7VD3ZGtRTo0TAMAdd/VPaWnSuHHSo4+az/0JqoqKzI11aRwBAKgPTXy9YMaMGerSpYv27Nmj+Ph4ffvtt1qzZo169uypgoKCehgiAKAxycyUCgulVauk3Fzzcds2ad48M6hKSXE8PynJu/vedps0cKCUnW0+pqdLy5YFevQAgEjl84rThg0b9Mknn6hdu3Zq0qSJmjRpor59+2ru3Lm69dZb9dVXX9XHOAEAjYirNuZS3faGonEEAKA++bziZLVa1bJlS0lSu3bttGvXLklmA4ktW7YEdnQAgIhjC6rGjzcfmzY1a6MkszbKW7YgKydHOnHCTONbvNh8JKscAOArn1eczjrrLP3nP/9Rly5ddPHFF2vevHlq2rSpFixYoJNPPrk+xggAiHDuaqOSkpxXmqpj/ycAQKD4HDj95S9/0dGjRyVJf/3rX3XVVVcpIyNDbdu21euvvx7wAQIAILlO4ysqkq691vO1pPEBAOrK68CpZ8+e+t3vfqfs7GwlJCRIkrp166YffvhBBw4cUJs2beyd9QAAqA81a6P87UnE/k8AAF95XePUo0cP3XHHHUpOTtbEiRMdOuglJiYSNAEAGlxGhpl2589fQbY0voIC6p8AAJ55HTi98MIL2r17t55++mlt375dl112mbp166YHH3xQRUVF9TlGAABcsm2qK/kXPElSVhZtzAEAnvnUVS8+Pl6TJ09WQUGBfvzxR40bN07PPfec0tPTdeWVV2qZj3/TzJ07VxdeeKFatmyp9u3ba+TIkV515nvjjTfUvXt3xcXF6eyzz9a7777r0/sCABoPW+MIf/d/OnDA8bmt/ongCQBQnc/tyG26du2q+++/X4WFhVq8eLE2btyosWPH+nSP1atXa9q0adq4caPy8/NVUVGhIUOG2JtPuLJ+/XqNHz9eN954o7766iuNHDlSI0eO1DfffOPvtwIACHOuNtXdudO/NL7qbcytVvOLVD4AgM9d9aorKCjQwoUL9eabbyo6Olr/93//59P177//vsPzRYsWqX379vriiy/Ur18/l9c88cQTuvzyy3X77bdLku677z7l5+frqaee0rPPPuvfNwIACHuuNtV94glz9chiqX3z3Jps9U8PPCA9/7xjC3RamQNAZPI5cNq5c6cWLVqkRYsWaevWrcrIyNAzzzyjsWPHqlmzZnUazKFDhySZzSbc2bBhg2bOnOlwbOjQoXrrrbdcnl9eXq7y8nL789LSUklSRUWFKioq6jTeQLCNIRTGgvDBvIE/InHeDB8uLVli0cyZUSoqqlp6Skw0dOCA56WoWbNs0VbVuUVFhsaMkZYssWrUKB+isTAWiXMHdce8gT8aet748j4Ww/Du/+CWLl2qF198UR9//LHat2+vSZMm6YYbblC3bt38Hmh1lZWVuvrqq1VSUqJ169a5Pa9p06Z66aWXNH78ePuxZ555RnPmzNGvv/7qdP7s2bM1Z84cp+O5ubmKj48PyNgBAKHNapW++66tDh6MU5s2x1VZKc2a1deLKw1VD5qqH2/X7pieey6fVuYAEMbKysqUnZ2tQ4cO2bdccsfrFadrr71WV155pZYvX64rrrhCTZr4XR7l0rRp0/TNN9/UGjT546677nJYoSotLVVaWpqGDBni8cNpCBUVFcrPz9fgwYMVExMT7OEgTDBv4I9InzfDh1f92mqVFiwwtGuXZBiuAyMzYHK3KmXRvn3xat78SkVFVW3I27ev0SgDqUifO/AP8wb+aOh5Y8tG84bXgdPOnTvVvn17vwbkyfTp07Vy5UqtWbNGqamptZ7bsWNHp5WlX3/9VR07dnR5fmxsrGJjY52Ox8TEhNQPcaiNB+GBeQN/MG+kmBjpySdd1z+Zz73rKJGdHe3Qla+x1z8xd+AP5g380VDzxpf38HrZqD6CJsMwNH36dC1fvlyffPKJunTp4vGaXr166eOPP3Y4lp+fr169egV8fACAxstdG/PUVMlFhrdLtDIHgMhRp656dTVt2jTl5uZqxYoVatmypXbv3i1JatWqlb3RxMSJE5WSkqK5c+dKkmbMmKH+/fvrb3/7m6688kotWbJEmzZt0oIFC4L2fQAAwlNmpjRihLR2bVW6XUaG+drzz5uBkK/d+CwWs5X5VVdJ69c73rcxpvEBQKQIauA0f/58SdKAGv1jFy5cqMmTJ0uStm/f7lBP1bt3b+Xm5uovf/mL/vznP+uUU07RW2+9pbPOOquhhg0AaERctTGX6t7KPDVV2ru36nhjT+MDgMYuqIGTNw39CgoKnI6NHTvW5812AQDwhS2Vb8YMx32cEhOdU/RcqR40SVVpfHl5BE8AEI58bo23Y8cO7az2N8hnn32mnJwcUuUAAI1OZqZUWCitWiXl5pqPS5f6dy/b/xXm5Jhd/QAA4cXnFafs7GxNmTJF1113nXbv3q3BgwfrzDPP1Guvvabdu3fr3nvvrY9xAgAQFDVT+axWM+3O1/onqSqNr6BADm3MqX8CgNDn84rTN998o4suukiSuSnuWWedpfXr1+u1117TokWLAj0+AABCSlSUWaskmfVP/sjKkgYOlLKzzcf0dDrxAUCo8zlwqqiosO+L9NFHH+nqq6+WJHXv3l3FxcWBHR0AACHIXSvzpCTvrqeNOQCEH58DpzPPPFPPPvus1q5dq/z8fF1++eWSpF27dqlt27YBHyAAAKHIVf3Tzp1mGp+vK1HUPwFA6PO5xunhhx/WqFGj9Mgjj2jSpEnq0aOHJOntt9+2p/ABABAJXLUyr2sbc+qfACA0+Rw4DRgwQPv27VNpaanatGljPz5lyhTFx8cHdHAAAISburYxz8pyPI/9nwAgNPicqnfs2DGVl5fbg6ZffvlFjz/+uLZs2aL27dsHfIAAAISburQxp/4JAEKTzytOI0aMUGZmpm666SaVlJTo4osvVkxMjPbt26fHHntMN998c32MEwCAsBKoNuaGYab95eRIV10lrV9PGh8ABIPPK05ffvmlMjIyJEl5eXnq0KGDfvnlF7388st68sknAz5AAAAag7q0MbfVP6Wm0sYcAILF58CprKxMLVu2lCR9+OGHyszMVJMmTXTJJZfol19+CfgAAQBoLNy1MU9M9O76vXsdn5PGBwANx+fAqVu3bnrrrbe0Y8cOffDBBxoyZIgkac+ePUpISAj4AAEAaEzqUv9UE23MAaDh+FzjdO+99yo7O1u33XabLr30UvXq1UuSufp03nnnBXyAAAA0NoGqf5JoYw4ADcXnwGnMmDHq27eviouL7Xs4SdJll12mUaNGBXRwAABEAlv9kz/7P9nQxhwA6pfPqXqS1LFjR5133nnatWuXdv7/TSouuugide/ePaCDAwAgUrirf0pK8u562pgDQP3yOXCqrKzUX//6V7Vq1UqdO3dW586d1bp1a913332qrKysjzECABARXNU/7dxprh7504lPMuufT
pwwU/kWLzYfqYcCAN/5nKp3991364UXXtBDDz2kPn36SJLWrVun2bNn6/jx43rggQcCPkgAACJFzfonyf80vuptzKt35CONDwB85/OK00svvaR//vOfuvnmm3XOOefonHPO0dSpU/X8889r0aJF9TBEAAAiG23MASD4fA6cDhw44LKWqXv37jpQM8EaAAAEBG3MASC4fE7V69Gjh5566ik9+eSTDsefeuophy57AAAgsOqjjfnatWbr8rVraWUOALXxOXCaN2+errzySn300Uf2PZw2bNigHTt26N133w34AAEAgGuBaGO+YoV03XVmEwobaqAAwJnPqXr9+/fXjz/+qFGjRqmkpEQlJSXKzMzUli1blJGRUR9jBAAAbtS1jfnjjzsGTRI1UADgis8rTpLUqVMnp+55O3fu1JQpU7RgwYKADAwAAHgnM1MaMcIx3a53b6lrV//T+CwWswZqxIh6GTIAhB2/NsB1Zf/+/XrhhRcCdTsAAOADW/3T+PHmY9OmZrqd5LwHlDd7QtlqoAoKpNWrLVqzJkWrV1toJgEgYgUscAIAAKHFXRpfaqq5muSNrCxp8OBoPfZYTw0eHK30dFL4AEQmAicAABoxV23Mt23zPgWv5k4j1D8BiFR+1TgBAIDwUbONuWS2HPenlXnN+ifalgOIFF4HTpkeepKWlJTUdSwAAKCB1KWVefX6p6go9n8CEBm8DpxatWrl8fWJEyfWeUAAAKBh2GqgZsxwbEmemOicoudKVpbjeez/BKAx8zpwWrhwYX2OAwAABIGrVuZWqzRokOdr3dU/5eURPAFofKhxAgAgwtWsgbJaqX8CgJoInAAAgAPqnwDAGe3IAQCAE3d7QCUmend9VpY0cKCUnW0+sv8TgHBH4AQAAFyy7QGVn/+bZs7cpPz837R0qXfXsv8TgMaGVD0AAOBWVJTUv7+ho0eL1L9/DzVpUvf6p6uuktavJ40PQHghcAIAAF4LRP1Taqq0d2/VcdqYAwgHpOoBAACf1LX+qXrQJJHGByA8sOIEAAB8Vpf9n2oijQ9AOCBwAgAAfgnU/k8SaXwAQh+pegAAICBs9U+SuYLkD9L4AIQqAicAABAw7uqfkpL8u59t5Sonx1zRAoBgIXACAAABZdv/adUqKTfXfNy500y782clypbGt3ZtwIcKAF6jxgkAAARczfonyf825jbFxQEZGgD4hRUnAADQIOqaxte+vVRQIC1ebD6SugegIbHiBAAAGoyrNua9e0tdu9beja9FC2nyZDPlz4aOewAaEoETAABoUP6k8R05Yn5VZ+u4l5dH8ASg/pGqBwAAgs5dGl9KihQX5/qa6h33TpwgjQ9A/WLFCQAAhARXaXxWqzRokPtr2DgXQEMhcAIAACGjZhrf4sXeXedu41zS+AAECql6AAAgZCUn+3cdG+cCCDRWnAAAQMjKyDDT7mrruOeOLY2voMBcybKl/2VkmM8BwBcETgAAIGRFRdV949ysLOnAgarn1D8B8AepegAAIKTVdePc6kGTVFX/tGxZYMYHIDIQOAEAgJCXmSkVFkqrVkm5uebjzp3m6pHF4tu9atY/Wa20MgfgWVADpzVr1mj48OHq1KmTLBaL3nrrLY/XvPbaa+rRo4fi4+OVnJysG264Qfv376//wQIAgKCyddwbP958bNrUTLmT/AueduyQHnhASk+XBg6UsrPNx/R0VqMAOAtq4HT06FH16NFDTz/9tFfnf/rpp5o4caJuvPFGffvtt3rjjTf02Wef6f/+7//qeaQAACAUuUvjS0z07vpZs8yVq+pI5QPgSlCbQwwbNkzDhg3z+vwNGzYoPT1dt956qySpS5cu+v3vf6+HH364voYIAABCnD8b59bGMMwVrJwc87504AMghVlXvV69eunPf/6z3n33XQ0bNkx79uxRXl6errjiCrfXlJeXq7y83P68tLRUklRRUaGKiop6H7MntjGEwlgQPpg38AfzBv4Kl7nTp0/Vr61WKSUlWrt2SYbhKo/PkOQ+v8+Wyrdq1W/q39+PVn4Im3mD0NLQ88aX97EYhj+NPQPPYrFo+fLlGjlyZK3nvfHGG7rhhht0/Phx/fbbbxo+fLjefPNNxcTEuDx/9uzZmjNnjtPx3NxcxcfHB2LoAAAgBG3YkKyHH77w/z+rHiQZLo65lpOzSW3bHtfBg3Fq0+a4zjhjPytQQCNSVlam7OxsHTp0SAkJCbWeG1aB03fffadBgwbptttu09ChQ1VcXKzbb79dF154oV544QWX17hacUpLS9O+ffs8fjgNoaKiQvn5+Ro8eLDb4A+oiXkDfzBv4K9wnjvLl1s0c2aUioqqgqTUVEM33FCpv/7VcwTUrp2hffuqrk1JMfTYY1aNGhUS/3wKaeE8bxA8DT1vSktL1a5dO68Cp7BK1Zs7d6769Omj22+/XZJ0zjnnqHnz5srIyND999+v5ORkp2tiY2MVGxvrdDwmJiakfohDbTwID8wb+IN5A3+F49zJypJGj3asf8rIsEiK0osvmo0gavsv5OpBkyTt2mXRuHHRystzrqvKyKAeypVwnDcIvoaaN768R1gFTmVlZYqOdhxy1P//EypEFs4AAECIsbUxr+mJJ8zueRZL7cFTdbbGEVOmSDNmOHbkS00175mZGZBhAwgxQW1HfuTIEW3evFmbN2+WJG3btk2bN2/W9u3bJUl33XWXJk6caD9/+PDhWrZsmebPn6+tW7fq008/1a233qqLLrpInTp1Csa3AAAAwpS7VuZJSbVfZxjS/v20MQciTVBXnDZt2qSBAwfan8+cOVOSNGnSJC1atEjFxcX2IEqSJk+erMOHD+upp57SH/7wB7Vu3VqXXnop7cgBAIBfXLUyLyqSrr3W93vRxhxo3IIaOA0YMKDWFLtFixY5Hbvlllt0yy231OOoAABAJKmZyldQ4P+9bG3M1651nR4IIHwFNVUPAAAg1GRkmPVKFs/dyt0qLg7ceACEBgInAACAaqKizCYPkv/BU/v25srV4sXmo9UaqNEBCBYCJwAAgBrcNY5ITZXatq09oGrWTJo0SRo4UMrONh/T02kaAYQ7AicAAAAXMjOlwkJp1SopN9d8LCyUFiwwX3cXPB07ZjaYqI6Oe0D4C6t9nAAAABqSqz2gbKtRrvZxKimRjhxxvk/1jntXXSWtX8/GuUC4IXACAADwkas25larNGiQ+2tsHfdSU6W9e6uOs3EuEB4InAAAAPxQczVq8WLvrqseNElVaXx5eQRPQCijxgkAACAAkpP9u862pWVODt33gFBG4AQAABAAddn/qfrGuQBCE4ETAABAAARi/yc2zgVCF4ETAABAgLjb/ykpybvr2TgXCF00hwAAAAggVx33eveWunY1G0HYappqat5cmjzZucU5HfeA0EDgBAAAEGCu9n964gmze57F4jp4OnrU/KqOjntA6CBVDwAAoAG4S+NLTZXi411fU73j3okTpPEBwcSKEwAAQANh41wgfBE4AQAANCA2zgXCE6l6AAAAQcTGuUB4IHACAAAIIjbOBcIDgRMAAEAQBWLj
3KIiGkcA9Y3ACQAAIMjqunHubbdJAwdK2dnmY3q6tGxZwIcJRDQCJwAAgBCQmSkVFkqrVkm5uebjzp3epfG5axxB8AQEDoETAABAiLB13Bs/3nxs2tS/ND4aRwCBR+AEAAAQwvxN47M1jigooP4JCAT2cQIAAAhxrjbOLSqSrr3W87VZWdKBA1XP2TgX8A+BEwAAQBiouXFuQYF311UPmiQ2zgX8RaoeAABAGPJ3/yfqnwD/EDgBAACEobrs/8TGuYDvCJwAAADClLvGEYmJ3l1fXBz4MQGNFTVOAAAAYcxV4wirVRo0yPO1tnOrX5uRYa5mAXBE4AQAABDmajaOsFrN+qeioqqaJleee0667jpzo10buu4BrpGqBwAA0MjUVv9U/fmSJY5Bk1TVdW/ZsvodIxBuCJwAAAAaIXf1T6mp0tKlUps2rq+r3nXvxAk2zwVsSNUDAABopFzVP2VkmM8PHnR/na3rXmqqtHdv1XHS+BDJCJwAAAAasZr1T5L33fSqB00Sm+cispGqBwAAEGGSk/27js1zEclYcQIAAIgwGRnedd1zxZbGV1BgrmbRxhyRgsAJAAAgwti67o0ZY3bZ8zV4kqSsLOnAgarn1D+hsSNVDwAAIAK567qXlOTd9dWDJok25mj8CJwAAAAiVGamVFgorVol5eaajzt3mqtHNfd/8oT6JzR2pOoBAABEMFdd9/xN46te/2QYFq1Zk6LmzS0aOJD6J4Q/VpwAAADgwF0aX2Kid9dnZUmDB0frscd6avDgaKWnk8KH8EfgBAAAACeu0viWLvXuWuqf0BiRqgcAAACXaqbxWa3+tTE3DDPtLydHGjGCtD2EJ1acAAAA4BVbG3PJv+YRO3ZIa9cGflxAQyBwAgAAgNfqWv9UVGQ2j1i82HykAx/CBal6AAAA8Elmpplyt3atVFwsJSebAdCgQZ6vve02ae/equdsnItwQeAEAAAAn/lb/1Q9aJKqGkfk5RE8IbSRqgcAAIA687f+iY1zES4InAAAABAQ7uqfkpJqv47GEQgHBE4AAAAIGNv+T/n5v2nmzE3Kz/9Nf/+7d9cWF9fr0IA6ocYJAAAAARUVJfXvb+jo0SL1799Dn37q3XXt25ud9mwNJzIy2PMJoYPACQAAAPUqI8Nz44joaOm66xxXnei4h1BCqh4AAADqlTeNI377zTlVz9Zxb9kys3EE+z8hmAicAAAAUO/cNY5ITZVat3Z9jW11asoUKT1dGjhQys42H9PTzYAKaChBDZzWrFmj4cOHq1OnTrJYLHrrrbc8XlNeXq67775bnTt3VmxsrNLT0/Xiiy/W/2ABAABQJ7bGEatWSbm55uOiRVJJiftrDEPav1/audPxePXVKKAhBLXG6ejRo+rRo4duuOEGZXqZvJqVlaVff/1VL7zwgrp166bi4mJVVlbW80gBAAAQCDU3zl282L/7GIaZ9peTI40YQRMJ1L+gBk7Dhg3TsGHDvD7//fff1+rVq7V161YlJiZKktLT0+tpdAAAAKhvycn+X1t9/6fqwRhQH8Kqq97bb7+tnj17at68eXrllVfUvHlzXX311brvvvvUrFkzl9eUl5ervLzc/ry0tFSSVFFRoYqKigYZd21sYwiFsSB8MG/gD+YN/MXcgT+8nTeXXCKlpERr1y7JMNx0jvBgx47fVFHhpl0fwkpD/3njy/uEVeC0detWrVu3TnFxcVq+fLn27dunqVOnav/+/Vq4cKHLa+bOnas5c+Y4Hf/www8VHx9f30P2Wn5+frCHgDDEvIE/mDfwF3MH/vBm3lx7bbIefvhCSYak6sFTzeeubdu2UQ8/LB08GKc2bY7rjDP2k7oX5hrqz5uysjKvz7UYhrtu+g3LYrFo+fLlGjlypNtzhgwZorVr12r37t1q1aqVJGnZsmUaM2aMjh496nLVydWKU1pamvbt26eEhISAfx++qqioUH5+vgYPHqyYmJhgDwdhgnkDfzBv4C/mDvzh67xZvtyimTOjVFRUFSilpBg6dkw6eND9alTTpobatZN27XK87rHHrBo1KiT+mQsfNPSfN6WlpWrXrp0OHTrkMTYIqxWn5ORkpaSk2IMmSTr99NNlGIZ27typU045xema2NhYxcbGOh2PiYkJqT/8Q208CA/MG/iDeQN/MXfgD2/nTVaWNHq0Wa9UXGzWPmVkWLRihdk9z2JxvXnuiRMW7drleGzXLovGjYtWXh6b54arhvrzxpf3CKt9nPr06aNdu3bpyJEj9mM//vijmjRpotTU1CCODAAAAHVl67g3frz5GBVV+/5P7hYIbAFWTg4b5SJwgho4HTlyRJs3b9bmzZslSdu2bdPmzZu1fft2SdJdd92liRMn2s/Pzs5W27Ztdf311+u7777TmjVrdPvtt+uGG25w2xwCAAAA4c3d/k//v+eXS9U77lmtUkGB2fq8oIBgCv4Jaqrepk2bNHDgQPvzmTNnSpImTZqkRYsWqbi42B5ESVKLFi2Un5+vW265RT179lTbtm2VlZWl+++/v8HHDgAAgIbj7/5PK1ZI113nuIFuaqr0xBOk8cE3QQ2cBgwYoNp6UyxatMjpWPfu3enqAwAAEOG83f/p8cedjxUVmXVT1EDBF2FV4wQAAABIUkaGuXJk8WPrJ2qg4A8CJwAAAISdqCgz3U5yDp68Caaq10AB3iBwAgAAQFiqreNeTo539ygqonEEvBNW+zgBAAAA1WVmSiNG1Nz/yXzuqr6ppttuk/burXpO4wi4Q+AEAACAsFaz455UVQNVVOR641yb6kGTROMIuEeqHgAAABqd2mqgakPjCLhD4AQAAIBGyV0NVFJS7dfROAKukKoHAACARstVDVRRkXTttZ6vtTWOqF47FRVV70NGiCJwAgAAQKNWswaqoMC762gcgepI1QMAAEBE8XbzXHeNI5Ytq7+xIXQROAEAACCi0DgC/iBwAgAAQMShcQR8RY0TAAAAIhKNI+ALAicAAABELBpHwFuk6gEAAAD/H40j4A6BEwAAAPD/0TgC7hA4AQAAANXQOAKuUOMEAAAA1EDjCNRE4AQAAAC4QOMIVEeqHgAAAOAFGkdENgInAAAAwAs0johsBE4AAACAl2gcEbmocQIAAAB8QOOIyETgBAAAAPiIxhGRh1Q9AAAAoI5oHNH4ETgBAAAAdUTjiMaPwAkAAAAIgLo2jigoML8WLzYfCaRCCzVOAAAAQIDUpXFEVpZ04EDVc+qfQguBEwAAABBA/jaOqB40SVX1T3l5BE+hgFQ9AAAAoB552ziiJuqfQguBEwAAAFCP/G0cIbFxbighcAIAAADqmbvGEYmJ3l1fXBz4McE31DgBAAAADcBV4wirVRo0yPO17dubtVK26zIyzJUsNBwCJwAAAKCB1GwcYbWa9U9FRVU1Ta7QcS/4SNUDAAAAgqS2+qfqz9113Fu2rH7HhyoETgAAAEAQuat/SkmR2rZ1fQ0d9xoeqXoAAABAkPlT/2TruFdQYK5cUf9UvwicAAAAgBBQs/5p8WLvrqP+qWGQqgcAAACEoORk786j/qlhEDgBAAA
AISgjw1w98mfTXIn6p0AjcAIAAABCUG0d9zyx1T+tXRv4cUUqAicAAAAgRLnruJeY6N31xcXmqlNBgVkzVVDAKpS/aA4BAAAAhDB/Ou7Z/PSTlJ4u7dxZdYzmEf4hcAIAAABCXM2Oe1arGQAVFVXVNLkya5bzMVvziLw8gidfkKoHAAAAhJna6p881UPRPMI/BE4AAABAGHJX/5SaKs2ZU/u1NI/wHYETAAAAEKYyM6XCQmnVKik313zctk065RTvri8urtfhNSrUOAEAAABhrGb9k+T95rnengcCJwAAAKDRsW2eW1vziE6dpN69zRbltm59GRlmIAZnBE4AAABAI2NrHjFmjNkswlXwdPSo1LmztHt31TFalbtHjRMAAADQCLlrHpGcLCUkSIcOOQZNUlWr8mXLGm6c4YLACQAAAGikXDWPKCyUmjd3fT6tyt0jVQ8AAABoxGo2j7DVNLlja1VeUGBeS/2TicAJAAAAiCDetiDPypIOHKh6Hun1T0FN1VuzZo2GDx+uTp06yWKx6K233vL62k8//VTR0dE699xz6218AAAAQGPjbQvy6kGT5Fj/ZLWaK1KLF5uPkZDWF9TA6ejRo+rRo4eefvppn64rKSnRxIkTddlll9XTyAAAAIDGydaq3GLx7Tpb/dOUKVJ6ujRwoJSdbT6mpzf+hhJBTdUbNmyYhg0b5vN1N910k7KzsxUVFeXTKhUAAAAQ6bxpVe6OYUj79zsft61G5eU13lS+sKtxWrhwobZu3apXX31V999/v8fzy8vLVV5ebn9eWloqSaqoqFBFRUW9jdNbtjGEwlgQPpg38AfzBv5i7sAfzJvQNny4tGSJRTNnRqmoqGrpKTHR0IEDPi5FyQyoLBZDM2ZIV1zxm99NJBp63vjyPmEVOP3000+68847tXbtWkVHezf0uXPnas6cOU7HP/zwQ8XHxwd6iH7Lz88P9hAQhpg38AfzBv5i7sAfzJvQFRsrPfmk9N13bXXwYJzatDmuykpp1qy+ft3PMCzauVN69NF/6+yzXSxL+aCh5k1ZWZnX54ZN4GS1WpWdna05c+bo1FNP9fq6u+66SzNnzrQ/Ly0tVVpamoYMGaKEhIT6GKpPKioqlJ+fr8GDBysmJibYw0GYYN7AH8wb+Iu5A38wb8LH8OFVv7ZapQULDO3aZQZC/ujc+RJdcYUP+X/VNPS8sWWjeSNsAqfDhw9r06ZN+uqrrzR9+nRJUmVlpQzDUHR0tD788ENdeumlTtfFxsYqNjbW6XhMTExI/RCH2ngQHpg38AfzBv5i7sAfzJvwEhNjrkL5U/9kk5YWrbr+ljfUvPHlPcImcEpISNDXX3/tcOyZZ57RJ598ory8PHXp0iVIIwMAAAAaj8xMs8nDjBnSzp1Vx1NTpWPHzDbl7gKq1FSza19jFNTA6ciRI/rf//5nf75t2zZt3rxZiYmJOumkk3TXXXepqKhIL7/8spo0aaKzzjrL4fr27dsrLi7O6TgAAAAA/2VmSiNGSGvXmhvmJiebAdGKFbWvRp10kpnuV/M6f5tFhJKgBk6bNm3SwIED7c9ttUiTJk3SokWLVFxcrO3btwdreAAAAEDEioqSBgxwPOZuNSopyVyJWr9eSkyUjh6tei011Wx/Hu5tyoO6Ae6AAQNkGIbT16JFiyRJixYtUkFBgdvrZ8+erc2bNzfIWAEAAACYAVBhobRqlZSbaz4WF0s5Oebr1YMmqWqPp3DfIDdsapwAAAAAhIaaq1FWq/T6667PNfd4MgOrESPCN20vqCtOAAAAAMLf2rWOqXs1GYa0Y4d5XrhixQkAAABAnRQXe3deUZFUUBCejSMInAAAAADUSXKyd+fl5Ej79lU9D6fGEaTqAQAAAKiTjAwzCLJYaj+vetAkhVfjCAInAAAAAHUSFWWuHEmeg6fqbHtB5eSYDSZCGYETAAAAgDqz7fGUkuJ4PCmp9uvCpXEENU4AAAAAAiIz02w5vnZtVQOIoiLp2ms9X+ttg4lgIXACAAAAEDA193gqKPDuOm8bTAQLqXoAAAAA6o2nxhEWi5SWZp4XygicAAAAANSb2hpH2J4//njo7+dE4AQAAACgXrlrHJGaah4Ph32cqHECAAAAUO9cNY7IyAj9lSYbAicAAAAADaJm44hwQqoeAAAAAHhA4AQAAAAAHhA4AQAAAIAHBE4AAAAA4AGBEwAAAAB4QOAEAAAAAB4QOAEAAACABwROAAAAAOABgRMAAAAAeEDgBAAAAAAeEDgBAAAAgAcETgAAAADgAYETAAAAAHgQHewBNDTDMCRJpaWlQR6JqaKiQmVlZSotLVVMTEywh4MwwbyBP5g38BdzB/5g3sAfDT1vbDGBLUaoTcQFTocPH5YkpaWlBXkkAAAAAELB4cOH1apVq1rPsRjehFeNSGVlpXbt2qWWLVvKYrEEezgqLS1VWlqaduzYoYSEhGAPB2GCeQN/MG/gL+YO/MG8gT8aet4YhqHDhw+rU6dOatKk9iqmiFtxatKkiVJTU4M9DCcJCQn8oQKfMW/gD+YN/MXcgT+YN/BHQ84bTytNNjSHAAAAAAAPCJwAAAAAwAMCpyCLjY3VrFmzFBsbG+yhIIwwb+AP5g38xdyBP5g38Ecoz5uIaw4BAAAAAL5ixQkAAAAAPCBwAgAAAAAPCJwAAAAAwAMCJwAAAADwgMApiJ5++mmlp6crLi5OF198sT777LNgDwkhZO7cubrwwgvVsmVLtW/fXiNHjtSWLVsczjl+/LimTZumtm3bqkWLFho9erR+/fXXII0Yoeihhx6SxWJRTk6O/RjzBu4UFRXp2muvVdu2bdWsWTOdffbZ2rRpk/11wzB07733Kjk5Wc2aNdOgQYP0008/BXHECDar1ap77rlHXbp0UbNmzdS1a1fdd999qt57jHkDSVqzZo2GDx+uTp06yWKx6K233nJ43Zt5cuDAAU2YMEEJCQlq3bq1brzxRh05cqTBvgcCpyB5/fXXNXPmTM2aNUtffvmlevTooaFDh2rPnj3BHhpCxOrVqzVt2jRt3LhR+fn5qqio0JAhQ3T06FH7ObfddpveeecdvfHGG1q9erV27dqlzMzMII4aoeTzzz/Xc889p3POOcfhOPMGrhw8eFB9+vRRTEyM3nvvPX333Xf629/+pjZt2tjPmTdvnp588kk9++yz+ve//63mzZtr6NChOn78eBBHjmB6+OGHNX/+fD311FP6/vvv9fDDD2vevHn6xz/+YT+HeQNJOnr0qHr06KGnn37a5evezJMJEybo22+/VX5+vlauXKk1a9ZoypQpDfUtSAaC4qKLLjKmTZtmf261Wo1OnToZc+fODeKoEMr27NljSDJWr15tGIZhlJSUGDExMcYbb7xhP+f77783JBkbNmwI1jARIg4fPmyccsopRn5+vtG/f39jxowZhmEwb+Den/70J6Nv375uX6+srDQ6duxoPPLII/ZjJSUlRmxsrLF48eKGGCJC0JVXXmnccMMNDscyMzONCRMmGIbBvIFrkozly5fbn3szT7777jtDkvH555/bz3nvvf
cMi8ViFBUVNci4WXEKghMnTuiLL77QoEGD7MeaNGmiQYMGacOGDUEcGULZoUOHJEmJiYmSpC+++EIVFRUO86h79+466aSTmEfQtGnTdOWVVzrMD4l5A/fefvtt9ezZU2PHjlX79u113nnn6fnnn7e/vm3bNu3evdth7rRq1UoXX3wxcyeC9e7dWx9//LF+/PFHSdJ//vMfrVu3TsOGDZPEvIF3vJknGzZsUOvWrdWzZ0/7OYMGDVKTJk3073//u0HGGd0g7wIH+/btk9VqVYcOHRyOd+jQQT/88EOQRoVQVllZqZycHPXp00dnnXWWJGn37t1q2rSpWrdu7XBuhw4dtHv37iCMEqFiyZIl+vLLL/X55587vca8gTtbt27V/PnzNXPmTP35z3/W559/rltvvVVNmzbVpEmT7PPD1d9dzJ3Ideedd6q0tFTdu3dXVFSUrFarHnjgAU2YMEGSmDfwijfzZPfu3Wrfvr3D69HR0UpMTGywuUTgBISBadOm6ZtvvtG6deuCPRSEuB07dmjGjBnKz89XXFxcsIeDMFJZWamePXvqwQcflCSdd955+uabb/Tss89q0qRJQR4dQtXSpUv12muvKTc3V2eeeaY2b96snJwcderUiXmDRodUvSBo166doqKinLpY/frrr+rYsWOQRoVQNX36dK1cuVKrVq1Samqq/XjHjh114sQJlZSUOJzPPIpsX3zxhfbs2aPzzz9f0dHRio6O1urVq/Xkk08qOjpaHTp0YN7ApeTkZJ1xxhkOx04//XRt375dkuzzg7+7UN3tt9+uO++8U+PGjdPZZ5+t6667Trfddpvmzp0riXkD73gzTzp27OjURO23337TgQMHGmwuETgFQdOmTXXBBRfo448/th+rrKzUxx9/rF69egVxZAglhmFo+vTpWr58uT755BN16dLF4fULLrhAMTExDvNoy5Yt2r59O/Mogl122WX6+uuvtXnzZvtXz549NWHCBPuvmTdwpU+fPk5bHvz444/q3LmzJKlLly7q2LGjw9wpLS3Vv//9b+ZOBCsrK1OTJo7/nIyKilJlZaUk5g2848086dWrl0pKSvTFF1/Yz/nkk09UWVmpiy++uGEG2iAtKOBkyZIlRmxsrLFo0SLju+++M6ZMmWK0bt3a2L17d7CHhhBx8803G61atTIKCgqM4uJi+1dZWZn9nJtuusk46aSTjE8++cTYtGmT0atXL6NXr15BHDVCUfWueobBvIFrn332mREdHW088MADxk8//WS89tprRnx8vPHqq6/az3nooYeM1q1bGytWrDD++9//GiNGjDC6dOliHDt2LIgjRzBNmjTJSElJMVauXGls27bNWLZsmdGuXTvjjjvusJ/DvIFhmN1ev/rqK+Orr74yJBmPPfaY8dVXXxm//PKLYRjezZPLL7/cOO+884x///vfxrp164xTTjnFGD9+fIN9DwROQfSPf/zDOOmkk4ymTZsaF110kbFx48ZgDwkhRJLLr4ULF9rPOXbsmDF16lSjTZs2Rnx8vDFq1CijuLg4eINGSKoZODFv4M4777xjnHXWWUZsbKzRvXt3Y8GCBQ6vV1ZWGvfcc4/RoUMHIzY21rjsssuMLVu2BGm0CAWlpaXGjBkzjJNOOsmIi4szTj75ZOPuu+82ysvL7ecwb2AYhrFq1SqX/66ZNGmSYRjezZP9+/cb48ePN1q0aGEkJCQY119/vXH48OEG+x4shlFta2cAAAAAgBNqnAAAAADAAwInAAAAAPCAwAkAAAAAPCBwAgAAAAAPCJwAAAAAwAMCJwAAAADwgMAJAAAAADwgcAIAAAAADwicAACohcVi0VtvvRXsYQAAgozACQAQsiZPniyLxeL0dfnllwd7aACACBMd7AEAAFCbyy+/XAsXLnQ4FhsbG6TRAAAiFStOAICQFhsbq44dOzp8tWnTRpKZRjd//nwNGzZMzZo108knn6y8vDyH67/++mtdeumlatasmdq2baspU6boyJEjDue8+OKLOvPMMxUbG6vk5GRNnz7d4fV9+/Zp1KhRio+P1ymnnKK3337b/trBgwc1YcIEJSUlqVmzZjrllFOcAj0AQPgjcAIAhLV77rlHo0eP1n/+8x9NmDBB48aN0/fffy9JOnr0qIYOHao2bdro888/1xtvvKGPPvrIITCaP3++pk2bpilTpujrr7/W22+/rW7dujm8x5w5c5SVlaX//ve/uuKKKzRhwgQdOHDA/v7fffed3nvvPX3//feaP3++2rVr13AfAACgQVgMwzCCPQgAAFyZPHmyXn31VcXFxTkc//Of/6w///nPslgsuummmzR//nz7a5dcconOP/98PfPMM3r++ef1pz/9STt27FDz5s0lSe+++66GDx+uXbt2qUOHDkpJSdH111+v+++/3+UYLBaL/vKXv+i+++6TZAZjLVq00HvvvafLL79cV199tdq1a6cXX3yxnj4FAEAooMYJABDSBg4c6BAYSVJiYqL917169XJ4rVevXtq8ebMk6fvvv1ePHj3sQZMk9enTR5WVldqyZYssFot27dqlyy67rNYxnHPOOfZfN2/eXAkJCdqzZ48k6eabb9bo0aP15ZdfasiQIRo5cqR69+7t1/cKAAhdBE4AgJDWvHlzp9S5QGnWrJlX58XExDg8t1gsqqyslCQNGzZMv/zyi959913l5+frsssu07Rp0/Too48GfLwAgOChxgkAENY2btzo9Pz000+XJJ1++un6z3/+o6NHj9pf//TTT9WkSROddtppatmypdLT0/Xxxx/XaQxJSUmaNGmSXn31VT3++ONasGBBne4HAAg9rDgBAEJaeXm5du/e7XAsOjra3oDhjTfeUM+ePdW3b1+99tpr+uyzz/TCCy9IkiZMmKBZs2Zp0qRJmj17tvbu3atbbrlF1113nTp06CBJmj17tm666Sa1b99ew4YN0+HDh/Xpp5/qlltu8Wp89957ry644AKdeeaZKi8v18qVK+2BGwCg8SBwAgCEtPfff1/JyckOx0477TT98MMPksyOd0uWLNHUqVOVnJysxYsX64wzzpAkxcfH64MPPtCMGTN04YUXKj4+XqNHj9Zjjz1mv9ekSZN0/Phx/f3vf9cf//hHtWvXTmPGjPF6fE2bNtVdd92lwsJCNWvWTBkZGVqyZEkAvnMAQCihqx4AIGxZLBYtX75cI0eODPZQAACNHDVOAAAAAOABgRMAAAAAeECNEwAgbJFtDgBoKKw4AQAAAIAHBE4AAAAA4AGBEwAAAAB4QOAEAAAAAB4QOAEAAACABwROAAAAAOABgRMAAAAAeEDgBAAAAAAe/D9KcbfSZkpy3gAAAABJRU5ErkJggg==", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "epochs = list(range(configs_dict[\"max_training_steps\"]))\n", + "loss_values = ft_res[0].finetuning_losses\n", + "\n", + "plt.figure(figsize=(10, 6))\n", + "plt.plot(epochs, loss_values, marker='o', linestyle='-', color='b')\n", + "\n", + "# Set plot labels and title\n", + "plt.xlabel('Epochs')\n", + "plt.ylabel('Loss Value')\n", + "plt.title('Loss Value vs. Number of Epochs')\n", + "\n", + "plt.grid(True)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save finetuned model to HuggingFace" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "subprocess.run(['python', '../../utils/upload_peft_model.py'] + f\"--peft-model-id {configs.finetuning_peft_model_id} --upload-peft-model-id {configs.finetuning_peft_model_id}-dolly\".split())\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Stop LLM Co-serving system" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-07-22 06:46:20 - ###PEFT DEBUGGING### Background serving task completed.\n", + "Background server stopped.\n" + ] + } + ], + "source": [ + "llm.stop_server()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Inference all over again with the finetuned model" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "/opt/conda/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. 
If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating directory /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b (if it doesn't exist)...\n", + "Saving meta-llama/Meta-Llama-3-8B configs to file /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b/config.json...\n", + "Saving goliaro/llama-3-8b-lora-dolly configs to file /root/.cache/flexflow/configs/goliaro/llama-3-8b-lora-dolly/config.json...\n", + "Loading tokenizer...\n", + "Creating directory /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b (if it doesn't exist)...\n", + "Saving meta-llama/Meta-Llama-3-8B configs to file /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b/config.json...\n", + "Saving goliaro/llama-3-8b-lora-dolly configs to file /root/.cache/flexflow/configs/goliaro/llama-3-8b-lora-dolly/config.json...\n", + "Loading tokenizer...\n", + "[0 - 7ff1caf83280] 0.270628 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "[0 - 7ff1caf83280] 0.270673 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "[0 - 7ff1caf83280] 0.270699 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "[0 - 7ff1caf83280] 0.270744 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "[0 - 7ff1caf83280] 0.270753 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "/opt/conda/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n", + "workSpaceSize (128 MB)\n", + "Creating directory /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b (if it doesn't exist)...\n", + "Saving meta-llama/Meta-Llama-3-8B configs to file /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b/config.json...\n", + "Saving goliaro/llama-3-8b-lora-dolly configs to file /root/.cache/flexflow/configs/goliaro/llama-3-8b-lora-dolly/config.json...\n", + "Loading tokenizer...\n", + "Adding layer layers.0.mlp.down_proj.lora\n", + "Adding layer layers.1.mlp.down_proj.lora\n", + "Adding layer layers.2.mlp.down_proj.lora\n", + "Adding layer layers.3.mlp.down_proj.lora\n", + "Adding layer layers.4.mlp.down_proj.lora\n", + "Adding layer layers.5.mlp.down_proj.lora\n", + "Adding layer layers.6.mlp.down_proj.lora\n", + "Adding layer layers.7.mlp.down_proj.lora\n", + "Adding layer layers.8.mlp.down_proj.lora\n", + "Adding layer layers.9.mlp.down_proj.lora\n", + "Adding layer layers.10.mlp.down_proj.lora\n", + "Adding layer layers.11.mlp.down_proj.lora\n", + "Adding layer layers.12.mlp.down_proj.lora\n", + "Adding layer layers.13.mlp.down_proj.lora\n", + "Adding layer layers.14.mlp.down_proj.lora\n", + "Adding layer layers.15.mlp.down_proj.lora\n", + "Adding layer layers.16.mlp.down_proj.lora\n", + "Adding layer layers.17.mlp.down_proj.lora\n", + "Adding layer layers.18.mlp.down_proj.lora\n", + "Adding layer layers.19.mlp.down_proj.lora\n", + "Adding layer layers.20.mlp.down_proj.lora\n", + "Adding layer layers.21.mlp.down_proj.lora\n", + "Adding layer layers.22.mlp.down_proj.lora\n", + "Adding layer layers.23.mlp.down_proj.lora\n", + "Adding layer layers.24.mlp.down_proj.lora\n", + "Adding layer layers.25.mlp.down_proj.lora\n", + "Adding layer layers.26.mlp.down_proj.lora\n", + "Adding layer layers.27.mlp.down_proj.lora\n", + "Adding layer layers.28.mlp.down_proj.lora\n", 
+ "Adding layer layers.29.mlp.down_proj.lora\n", + "Adding layer layers.30.mlp.down_proj.lora\n", + "Adding layer layers.31.mlp.down_proj.lora\n", + "Background server started.\n", + "[]\n", + "2024-07-22 06:42:43 - ###PEFT DEBUGGING### Starting background serving task.\n", + "2024-07-22 06:42:43 - ###PEFT DEBUGGING### Updated models' configuration.\n", + "###PEFT DEBUGGING### LLM Model object exists.\n", + "###PEFT DEBUGGING### Model object exists.\n", + "###PEFT DEBUGGING### Model object still exists.\n", + "###PEFT DEBUGGING### Entering compile_inference.\n", + "###PEFT DEBUGGING### Configuration check passed: At least four CPU cores per node.\n", + "###PEFT DEBUGGING### Launching graph optimization task.\n", + "num_nodes = 1 num_gpus_per_node = 1\n", + "[0]10445\n", + "[1]649\n", + "[2]6730\n", + "[3]2053\n", + "[4]18167\n", + "[5]369\n", + "[6]1317\n", + "[7]2085\n", + "[8]3090\n", + "[9]30\n", + "No small speculative model registered, using incremental decoding.\n", + "[0 - 7ff1caf83280] 1.100415 {3}{RequestManager}: [1000000]New request tokens: 128000 10445 649 6730 2053 18167 369 1317 2085 3090 30\n", + "optimal_views.size = 262\n", + "views.size() = 262\n", + "###PEFT DEBUGGING### Operators reconstructed from optimized graph.\n", + "###PEFT DEBUGGING### Starting inplace optimizations.\n", + "###PEFT DEBUGGING### Mapping output tensors.\n", + "ndim(1) dims[1 0 0 0]\n", + "###PEFT DEBUGGING### Setting up NCCL communications.\n", + "###PEFT DEBUGGING### compile_inference completed successfully.\n", + "Loading weight file embed_tokens.weight\n", + "Loading weight file layers.0.input_layernorm.weight\n", + "Loading weight file layers.0.self_attn.q_proj.weight\n", + "Loading weight file layers.0.self_attn.k_proj.weight\n", + "Loading weight file layers.0.self_attn.v_proj.weight\n", + "Loading weight file layers.0.self_attn.o_proj.weight\n", + "Loading weight file layers.0.post_attention_layernorm.weight\n", + "Loading weight file layers.0.mlp.gate_proj.weight\n", + "Loading weight file layers.0.mlp.up_proj.weight\n", + "Loading weight file layers.0.mlp.down_proj.weight\n", + "Loading weight file layers.1.input_layernorm.weight\n", + "Loading weight file layers.1.self_attn.q_proj.weight\n", + "Loading weight file layers.1.self_attn.k_proj.weight\n", + "Loading weight file layers.1.self_attn.v_proj.weight\n", + "Loading weight file layers.1.self_attn.o_proj.weight\n", + "Loading weight file layers.1.post_attention_layernorm.weight\n", + "Loading weight file layers.1.mlp.gate_proj.weight\n", + "Loading weight file layers.1.mlp.up_proj.weight\n", + "Loading weight file layers.1.mlp.down_proj.weight\n", + "Loading weight file layers.2.input_layernorm.weight\n", + "Loading weight file layers.2.self_attn.q_proj.weight\n", + "Loading weight file layers.2.self_attn.k_proj.weight\n", + "Loading weight file layers.2.self_attn.v_proj.weight\n", + "Loading weight file layers.2.self_attn.o_proj.weight\n", + "Loading weight file layers.2.post_attention_layernorm.weight\n", + "Loading weight file layers.2.mlp.gate_proj.weight\n", + "Loading weight file layers.2.mlp.up_proj.weight\n", + "Loading weight file layers.2.mlp.down_proj.weight\n", + "Loading weight file layers.3.input_layernorm.weight\n", + "Loading weight file layers.3.self_attn.q_proj.weight\n", + "Loading weight file layers.3.self_attn.k_proj.weight\n", + "Loading weight file layers.3.self_attn.v_proj.weight\n", + "Loading weight file layers.3.self_attn.o_proj.weight\n", + "Loading weight file layers.3.post_attention_layernorm.weight\n", + 
"Loading weight file layers.3.mlp.gate_proj.weight\n", + "Loading weight file layers.3.mlp.up_proj.weight\n", + "Loading weight file layers.3.mlp.down_proj.weight\n", + "Loading weight file layers.4.input_layernorm.weight\n", + "Loading weight file layers.4.self_attn.q_proj.weight\n", + "Loading weight file layers.4.self_attn.k_proj.weight\n", + "Loading weight file layers.4.self_attn.v_proj.weight\n", + "Loading weight file layers.4.self_attn.o_proj.weight\n", + "Loading weight file layers.4.post_attention_layernorm.weight\n", + "Loading weight file layers.4.mlp.gate_proj.weight\n", + "Loading weight file layers.4.mlp.up_proj.weight\n", + "Loading weight file layers.4.mlp.down_proj.weight\n", + "Loading weight file layers.5.input_layernorm.weight\n", + "Loading weight file layers.5.self_attn.q_proj.weight\n", + "Loading weight file layers.5.self_attn.k_proj.weight\n", + "Loading weight file layers.5.self_attn.v_proj.weight\n", + "Loading weight file layers.5.self_attn.o_proj.weight\n", + "Loading weight file layers.5.post_attention_layernorm.weight\n", + "Loading weight file layers.5.mlp.gate_proj.weight\n", + "Loading weight file layers.5.mlp.up_proj.weight\n", + "Loading weight file layers.5.mlp.down_proj.weight\n", + "Loading weight file layers.6.input_layernorm.weight\n", + "Loading weight file layers.6.self_attn.q_proj.weight\n", + "Loading weight file layers.6.self_attn.k_proj.weight\n", + "Loading weight file layers.6.self_attn.v_proj.weight\n", + "Loading weight file layers.6.self_attn.o_proj.weight\n", + "Loading weight file layers.6.post_attention_layernorm.weight\n", + "Loading weight file layers.6.mlp.gate_proj.weight\n", + "Loading weight file layers.6.mlp.up_proj.weight\n", + "Loading weight file layers.6.mlp.down_proj.weight\n", + "Loading weight file layers.7.input_layernorm.weight\n", + "Loading weight file layers.7.self_attn.q_proj.weight\n", + "Loading weight file layers.7.self_attn.k_proj.weight\n", + "Loading weight file layers.7.self_attn.v_proj.weight\n", + "Loading weight file layers.7.self_attn.o_proj.weight\n", + "Loading weight file layers.7.post_attention_layernorm.weight\n", + "Loading weight file layers.7.mlp.gate_proj.weight\n", + "Loading weight file layers.7.mlp.up_proj.weight\n", + "Loading weight file layers.7.mlp.down_proj.weight\n", + "Loading weight file layers.8.input_layernorm.weight\n", + "Loading weight file layers.8.self_attn.q_proj.weight\n", + "Loading weight file layers.8.self_attn.k_proj.weight\n", + "Loading weight file layers.8.self_attn.v_proj.weight\n", + "Loading weight file layers.8.self_attn.o_proj.weight\n", + "Loading weight file layers.8.post_attention_layernorm.weight\n", + "Loading weight file layers.8.mlp.gate_proj.weight\n", + "Loading weight file layers.8.mlp.up_proj.weight\n", + "Loading weight file layers.8.mlp.down_proj.weight\n", + "Loading weight file layers.9.input_layernorm.weight\n", + "Loading weight file layers.9.self_attn.q_proj.weight\n", + "Loading weight file layers.9.self_attn.k_proj.weight\n", + "Loading weight file layers.9.self_attn.v_proj.weight\n", + "Loading weight file layers.9.self_attn.o_proj.weight\n", + "Loading weight file layers.9.post_attention_layernorm.weight\n", + "Loading weight file layers.9.mlp.gate_proj.weight\n", + "Loading weight file layers.9.mlp.up_proj.weight\n", + "Loading weight file layers.9.mlp.down_proj.weight\n", + "Loading weight file layers.10.input_layernorm.weight\n", + "Loading weight file layers.10.self_attn.q_proj.weight\n", + "Loading weight file 
layers.10.self_attn.k_proj.weight\n", + "Loading weight file layers.10.self_attn.v_proj.weight\n", + "Loading weight file layers.10.self_attn.o_proj.weight\n", + "Loading weight file layers.10.post_attention_layernorm.weight\n", + "Loading weight file layers.10.mlp.gate_proj.weight\n", + "Loading weight file layers.10.mlp.up_proj.weight\n", + "Loading weight file layers.10.mlp.down_proj.weight\n", + "Loading weight file layers.11.input_layernorm.weight\n", + "Loading weight file layers.11.self_attn.q_proj.weight\n", + "Loading weight file layers.11.self_attn.k_proj.weight\n", + "Loading weight file layers.11.self_attn.v_proj.weight\n", + "Loading weight file layers.11.self_attn.o_proj.weight\n", + "Loading weight file layers.11.post_attention_layernorm.weight\n", + "Loading weight file layers.11.mlp.gate_proj.weight\n", + "Loading weight file layers.11.mlp.up_proj.weight\n", + "Loading weight file layers.11.mlp.down_proj.weight\n", + "Loading weight file layers.12.input_layernorm.weight\n", + "Loading weight file layers.12.self_attn.q_proj.weight\n", + "Loading weight file layers.12.self_attn.k_proj.weight\n", + "Loading weight file layers.12.self_attn.v_proj.weight\n", + "Loading weight file layers.12.self_attn.o_proj.weight\n", + "Loading weight file layers.12.post_attention_layernorm.weight\n", + "Loading weight file layers.12.mlp.gate_proj.weight\n", + "Loading weight file layers.12.mlp.up_proj.weight\n", + "Loading weight file layers.12.mlp.down_proj.weight\n", + "Loading weight file layers.13.input_layernorm.weight\n", + "Loading weight file layers.13.self_attn.q_proj.weight\n", + "Loading weight file layers.13.self_attn.k_proj.weight\n", + "Loading weight file layers.13.self_attn.v_proj.weight\n", + "Loading weight file layers.13.self_attn.o_proj.weight\n", + "Loading weight file layers.13.post_attention_layernorm.weight\n", + "Loading weight file layers.13.mlp.gate_proj.weight\n", + "Loading weight file layers.13.mlp.up_proj.weight\n", + "Loading weight file layers.13.mlp.down_proj.weight\n", + "Loading weight file layers.14.input_layernorm.weight\n", + "Loading weight file layers.14.self_attn.q_proj.weight\n", + "Loading weight file layers.14.self_attn.k_proj.weight\n", + "Loading weight file layers.14.self_attn.v_proj.weight\n", + "Loading weight file layers.14.self_attn.o_proj.weight\n", + "Loading weight file layers.14.post_attention_layernorm.weight\n", + "Loading weight file layers.14.mlp.gate_proj.weight\n", + "Loading weight file layers.14.mlp.up_proj.weight\n", + "Loading weight file layers.14.mlp.down_proj.weight\n", + "Loading weight file layers.15.input_layernorm.weight\n", + "Loading weight file layers.15.self_attn.q_proj.weight\n", + "Loading weight file layers.15.self_attn.k_proj.weight\n", + "Loading weight file layers.15.self_attn.v_proj.weight\n", + "Loading weight file layers.15.self_attn.o_proj.weight\n", + "Loading weight file layers.15.post_attention_layernorm.weight\n", + "Loading weight file layers.15.mlp.gate_proj.weight\n", + "Loading weight file layers.15.mlp.up_proj.weight\n", + "Loading weight file layers.15.mlp.down_proj.weight\n", + "Loading weight file layers.16.input_layernorm.weight\n", + "Loading weight file layers.16.self_attn.q_proj.weight\n", + "Loading weight file layers.16.self_attn.k_proj.weight\n", + "Loading weight file layers.16.self_attn.v_proj.weight\n", + "Loading weight file layers.16.self_attn.o_proj.weight\n", + "Loading weight file layers.16.post_attention_layernorm.weight\n", + "Loading weight file 
layers.16.mlp.gate_proj.weight\n", + "Loading weight file layers.16.mlp.up_proj.weight\n", + "Loading weight file layers.16.mlp.down_proj.weight\n", + "Loading weight file layers.17.input_layernorm.weight\n", + "Loading weight file layers.17.self_attn.q_proj.weight\n", + "Loading weight file layers.17.self_attn.k_proj.weight\n", + "Loading weight file layers.17.self_attn.v_proj.weight\n", + "Loading weight file layers.17.self_attn.o_proj.weight\n", + "Loading weight file layers.17.post_attention_layernorm.weight\n", + "Loading weight file layers.17.mlp.gate_proj.weight\n", + "Loading weight file layers.17.mlp.up_proj.weight\n", + "Loading weight file layers.17.mlp.down_proj.weight\n", + "Loading weight file layers.18.input_layernorm.weight\n", + "Loading weight file layers.18.self_attn.q_proj.weight\n", + "Loading weight file layers.18.self_attn.k_proj.weight\n", + "Loading weight file layers.18.self_attn.v_proj.weight\n", + "Loading weight file layers.18.self_attn.o_proj.weight\n", + "Loading weight file layers.18.post_attention_layernorm.weight\n", + "Loading weight file layers.18.mlp.gate_proj.weight\n", + "Loading weight file layers.18.mlp.up_proj.weight\n", + "Loading weight file layers.18.mlp.down_proj.weight\n", + "Loading weight file layers.19.input_layernorm.weight\n", + "Loading weight file layers.19.self_attn.q_proj.weight\n", + "Loading weight file layers.19.self_attn.k_proj.weight\n", + "Loading weight file layers.19.self_attn.v_proj.weight\n", + "Loading weight file layers.19.self_attn.o_proj.weight\n", + "Loading weight file layers.19.post_attention_layernorm.weight\n", + "Loading weight file layers.19.mlp.gate_proj.weight\n", + "Loading weight file layers.19.mlp.up_proj.weight\n", + "Loading weight file layers.19.mlp.down_proj.weight\n", + "Loading weight file layers.20.input_layernorm.weight\n", + "Loading weight file layers.20.self_attn.q_proj.weight\n", + "Loading weight file layers.20.self_attn.k_proj.weight\n", + "Loading weight file layers.20.self_attn.v_proj.weight\n", + "Loading weight file layers.20.self_attn.o_proj.weight\n", + "Loading weight file layers.20.post_attention_layernorm.weight\n", + "Loading weight file layers.20.mlp.gate_proj.weight\n", + "Loading weight file layers.20.mlp.up_proj.weight\n", + "Loading weight file layers.20.mlp.down_proj.weight\n", + "Loading weight file layers.21.input_layernorm.weight\n", + "Loading weight file layers.21.self_attn.q_proj.weight\n", + "Loading weight file layers.21.self_attn.k_proj.weight\n", + "Loading weight file layers.21.self_attn.v_proj.weight\n", + "Loading weight file layers.21.self_attn.o_proj.weight\n", + "Loading weight file layers.21.post_attention_layernorm.weight\n", + "Loading weight file layers.21.mlp.gate_proj.weight\n", + "Loading weight file layers.21.mlp.up_proj.weight\n", + "Loading weight file layers.21.mlp.down_proj.weight\n", + "Loading weight file layers.22.input_layernorm.weight\n", + "Loading weight file layers.22.self_attn.q_proj.weight\n", + "Loading weight file layers.22.self_attn.k_proj.weight\n", + "Loading weight file layers.22.self_attn.v_proj.weight\n", + "Loading weight file layers.22.self_attn.o_proj.weight\n", + "Loading weight file layers.22.post_attention_layernorm.weight\n", + "Loading weight file layers.22.mlp.gate_proj.weight\n", + "Loading weight file layers.22.mlp.up_proj.weight\n", + "Loading weight file layers.22.mlp.down_proj.weight\n", + "Loading weight file layers.23.input_layernorm.weight\n", + "Loading weight file layers.23.self_attn.q_proj.weight\n", + "Loading 
weight file layers.23.self_attn.k_proj.weight\n", + "Loading weight file layers.23.self_attn.v_proj.weight\n", + "Loading weight file layers.23.self_attn.o_proj.weight\n", + "Loading weight file layers.23.post_attention_layernorm.weight\n", + "Loading weight file layers.23.mlp.gate_proj.weight\n", + "Loading weight file layers.23.mlp.up_proj.weight\n", + "Loading weight file layers.23.mlp.down_proj.weight\n", + "Loading weight file layers.24.input_layernorm.weight\n", + "Loading weight file layers.24.self_attn.q_proj.weight\n", + "Loading weight file layers.24.self_attn.k_proj.weight\n", + "Loading weight file layers.24.self_attn.v_proj.weight\n", + "Loading weight file layers.24.self_attn.o_proj.weight\n", + "Loading weight file layers.24.post_attention_layernorm.weight\n", + "Loading weight file layers.24.mlp.gate_proj.weight\n", + "Loading weight file layers.24.mlp.up_proj.weight\n", + "Loading weight file layers.24.mlp.down_proj.weight\n", + "Loading weight file layers.25.input_layernorm.weight\n", + "Loading weight file layers.25.self_attn.q_proj.weight\n", + "Loading weight file layers.25.self_attn.k_proj.weight\n", + "Loading weight file layers.25.self_attn.v_proj.weight\n", + "Loading weight file layers.25.self_attn.o_proj.weight\n", + "Loading weight file layers.25.post_attention_layernorm.weight\n", + "Loading weight file layers.25.mlp.gate_proj.weight\n", + "Loading weight file layers.25.mlp.up_proj.weight\n", + "Loading weight file layers.25.mlp.down_proj.weight\n", + "Loading weight file layers.26.input_layernorm.weight\n", + "Loading weight file layers.26.self_attn.q_proj.weight\n", + "Loading weight file layers.26.self_attn.k_proj.weight\n", + "Loading weight file layers.26.self_attn.v_proj.weight\n", + "Loading weight file layers.26.self_attn.o_proj.weight\n", + "Loading weight file layers.26.post_attention_layernorm.weight\n", + "Loading weight file layers.26.mlp.gate_proj.weight\n", + "Loading weight file layers.26.mlp.up_proj.weight\n", + "Loading weight file layers.26.mlp.down_proj.weight\n", + "Loading weight file layers.27.input_layernorm.weight\n", + "Loading weight file layers.27.self_attn.q_proj.weight\n", + "Loading weight file layers.27.self_attn.k_proj.weight\n", + "Loading weight file layers.27.self_attn.v_proj.weight\n", + "Loading weight file layers.27.self_attn.o_proj.weight\n", + "Loading weight file layers.27.post_attention_layernorm.weight\n", + "Loading weight file layers.27.mlp.gate_proj.weight\n", + "Loading weight file layers.27.mlp.up_proj.weight\n", + "Loading weight file layers.27.mlp.down_proj.weight\n", + "Loading weight file layers.28.input_layernorm.weight\n", + "Loading weight file layers.28.self_attn.q_proj.weight\n", + "Loading weight file layers.28.self_attn.k_proj.weight\n", + "Loading weight file layers.28.self_attn.v_proj.weight\n", + "Loading weight file layers.28.self_attn.o_proj.weight\n", + "Loading weight file layers.28.post_attention_layernorm.weight\n", + "Loading weight file layers.28.mlp.gate_proj.weight\n", + "Loading weight file layers.28.mlp.up_proj.weight\n", + "Loading weight file layers.28.mlp.down_proj.weight\n", + "Loading weight file layers.29.input_layernorm.weight\n", + "Loading weight file layers.29.self_attn.q_proj.weight\n", + "Loading weight file layers.29.self_attn.k_proj.weight\n", + "Loading weight file layers.29.self_attn.v_proj.weight\n", + "Loading weight file layers.29.self_attn.o_proj.weight\n", + "Loading weight file layers.29.post_attention_layernorm.weight\n", + "Loading weight file 
layers.29.mlp.gate_proj.weight\n", + "Loading weight file layers.29.mlp.up_proj.weight\n", + "Loading weight file layers.29.mlp.down_proj.weight\n", + "Loading weight file layers.30.input_layernorm.weight\n", + "Loading weight file layers.30.self_attn.q_proj.weight\n", + "Loading weight file layers.30.self_attn.k_proj.weight\n", + "Loading weight file layers.30.self_attn.v_proj.weight\n", + "Loading weight file layers.30.self_attn.o_proj.weight\n", + "Loading weight file layers.30.post_attention_layernorm.weight\n", + "Loading weight file layers.30.mlp.gate_proj.weight\n", + "Loading weight file layers.30.mlp.up_proj.weight\n", + "Loading weight file layers.30.mlp.down_proj.weight\n", + "Loading weight file layers.31.input_layernorm.weight\n", + "Loading weight file layers.31.self_attn.q_proj.weight\n", + "Loading weight file layers.31.self_attn.k_proj.weight\n", + "Loading weight file layers.31.self_attn.v_proj.weight\n", + "Loading weight file layers.31.self_attn.o_proj.weight\n", + "Loading weight file layers.31.post_attention_layernorm.weight\n", + "Loading weight file layers.31.mlp.gate_proj.weight\n", + "Loading weight file layers.31.mlp.up_proj.weight\n", + "Loading weight file layers.31.mlp.down_proj.weight\n", + "Loading weight file norm.weight\n", + "Loading weight file lm_head.weight\n", + "Loading LORA weight layers.0.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.0.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.1.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.1.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.2.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.2.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.3.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.3.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.4.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.4.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.5.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.5.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.6.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.6.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.7.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.7.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.8.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.8.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight 
layers.9.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.9.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.10.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.10.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.11.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.11.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.12.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.12.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.13.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.13.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.14.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.14.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.15.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.15.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.16.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.16.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.17.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.17.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.18.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.18.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.19.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.19.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.20.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.20.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.21.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.21.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.22.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.22.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.23.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 
0\n", + "Loading LORA weight layers.23.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.24.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.24.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.25.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.25.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.26.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.26.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.27.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.27.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.28.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.28.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.29.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.29.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.30.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.30.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.31.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.31.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "[0 - 7ff1680b6740] 16.224181 {3}{RequestManager}: Output token is: 3639\n", + "[0 - 7ff1680b6740] 16.321885 {3}{RequestManager}: Output token is: 374\n", + "[0 - 7ff168092740] 16.407712 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7ff1680b6740] 16.492788 {3}{RequestManager}: Output token is: 2944\n", + "[0 - 7ff168092740] 16.563500 {3}{RequestManager}: Output token is: 4920\n", + "[0 - 7ff168092740] 16.624616 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7ff168092740] 16.675778 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 16.725625 {3}{RequestManager}: Output token is: 13272\n", + "[0 - 7ff168092740] 16.776205 {3}{RequestManager}: Output token is: 315\n", + "[0 - 7ff168092740] 16.827883 {3}{RequestManager}: Output token is: 41389\n", + "[0 - 7ff168092740] 16.878348 {3}{RequestManager}: Output token is: 2715\n", + "[0 - 7ff168092740] 16.929025 {3}{RequestManager}: Output token is: 288\n", + "[0 - 7ff168092740] 16.979287 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff1680b6740] 17.029879 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff1680b6740] 17.078696 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff1680b6740] 17.127942 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff1680b6740] 17.177796 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff1680b6740] 17.227023 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff1680b6740] 17.277136 
{3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff1680b6740] 17.328143 {3}{RequestManager}: Output token is: 64614\n", + "[0 - 7ff1680b6740] 17.378508 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 17.430618 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff168092740] 17.482129 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff168092740] 17.533479 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff168092740] 17.584503 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff168092740] 17.634591 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 17.685727 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 17.736768 {3}{RequestManager}: Output token is: 14535\n", + "[0 - 7ff168092740] 17.785909 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 17.836515 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff168092740] 17.886526 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff1680b6740] 17.936502 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff168092740] 17.986222 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff168092740] 18.037888 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 18.088468 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 18.138261 {3}{RequestManager}: Output token is: 25212\n", + "[0 - 7ff168092740] 18.187102 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 18.237270 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff168092740] 18.289979 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff168092740] 18.340895 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff168092740] 18.391145 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff168092740] 18.441155 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 18.499716 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff1680b6740] 18.552423 {3}{RequestManager}: Output token is: 97814\n", + "[0 - 7ff168092740] 18.603261 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 18.654986 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff168092740] 18.706227 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff168092740] 18.756543 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff168092740] 18.807690 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff1680b6740] 18.857508 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 18.907649 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 18.958208 {3}{RequestManager}: Output token is: 41759\n", + "[0 - 7ff168092740] 19.009971 {3}{RequestManager}: Output token is: 388\n", + "[0 - 7ff168092740] 19.060626 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 19.112370 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff168092740] 19.161425 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff168092740] 19.206435 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff168092740] 19.254004 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff168092740] 19.306102 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 19.356853 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 19.408861 {3}{RequestManager}: Output token is: 89435\n", + "[0 - 7ff1680b6740] 19.460391 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff1680b6740] 19.511207 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff1680b6740] 19.565692 
{3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff1680b6740] 19.617057 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff1680b6740] 19.669739 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff1680b6740] 19.722325 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff1680b6740] 19.773583 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff1680b6740] 19.824646 {3}{RequestManager}: Output token is: 68550\n", + "[0 - 7ff1680b6740] 19.876650 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff1680b6740] 19.926939 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff1680b6740] 19.977325 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff1680b6740] 20.028247 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff1680b6740] 20.078419 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff168092740] 20.128614 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 20.179748 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 20.230542 {3}{RequestManager}: Output token is: 18311\n", + "[0 - 7ff1680b6740] 20.281634 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 20.330089 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff168092740] 20.375491 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff1680b6740] 20.422220 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff168092740] 20.475078 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff168092740] 20.526058 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 20.577651 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 20.628505 {3}{RequestManager}: Output token is: 7013\n", + "[0 - 7ff168092740] 20.681354 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 20.734160 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff168092740] 20.786299 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff1680b6740] 20.837268 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff168092740] 20.888265 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff168092740] 20.939708 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 20.990707 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 21.041260 {3}{RequestManager}: Output token is: 18742\n", + "[0 - 7ff1680b6740] 21.091386 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 21.145432 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff168092740] 21.197149 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff168092740] 21.249242 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff168092740] 21.301514 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff168092740] 21.352632 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 21.404018 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 21.455101 {3}{RequestManager}: Output token is: 56994\n", + "[0 - 7ff1680b6740] 21.506371 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 21.559369 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff1680b6740] 21.611370 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff168092740] 21.663655 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff1680b6740] 21.715270 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff168092740] 21.766481 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 21.818563 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 21.872108 
{3}{RequestManager}: Output token is: 29505\n", + "[0 - 7ff168092740] 21.922670 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 21.973973 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff1680b6740] 22.024297 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff1680b6740] 22.076266 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff168092740] 22.127594 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff1680b6740] 22.179008 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff1680b6740] 22.230414 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff1680b6740] 22.281805 {3}{RequestManager}: Output token is: 993\n", + "[0 - 7ff1680b6740] 22.282235 {3}{RequestManager}: [Done] guid(1000000) final_length(128)\n", + "[0 - 7ff1680b6740] 22.282243 {3}{RequestManager}: Final output: <|begin_of_text|>Why can camels survive for long without water? What is the reason behind the long neck of giraffes? Why do some animals have long tails? Why do some animals have long legs? Why do some animals have long ears? Why do some animals have long noses? Why do some animals have long whiskers? Why do some animals have long tongues? Why do some animals have long claws? Why do some animals have long teeth? Why do some animals have long hair? Why do some animals have long fur? Why do some animals have long feathers? Why do some animals have long scales? Why do some animals have long sp\n", + "[0 - 7ff1680b6740] 22.282250 {3}{RequestManager}: [Profile] guid(1000000) llm_decoding_steps(117) start(15892528.0) finish(22282245.0) latency(6389717.0) ttft(15123707.0)\n", + "2024-07-22 06:43:05 - ###PEFT DEBUGGING### Background serving task completed.\n", + "Background server stopped.\n" + ] + } + ], + "source": [ + "import json, random, subprocess, os\n", + "from datasets import load_dataset\n", + "from types import SimpleNamespace\n", + "from huggingface_hub import HfFolder\n", + "import flexflow.serve as ff\n", + "import matplotlib.pyplot as plt\n", + "\n", + "configs_dict = {\n", + " \"num_gpus\": 1,\n", + " \"memory_per_gpu\": 21000,\n", + " \"zero_copy_memory_per_node\": 40000,\n", + " \"num_cpus\": 4,\n", + " \"legion_utility_processors\": 4,\n", + " \"data_parallelism_degree\": 1,\n", + " \"tensor_parallelism_degree\": 1,\n", + " \"pipeline_parallelism_degree\": 1,\n", + " \"offload\": False,\n", + " \"offload_reserve_space_size\": 8 * 1024, # 8GB\n", + " \"use_4bit_quantization\": False,\n", + " \"use_8bit_quantization\": False,\n", + " \"enable_peft\": True,\n", + " \"peft_activation_reserve_space_size\": 1024, # 1GB\n", + " \"peft_weight_reserve_space_size\": 1024, # 1GB\n", + " \"profiling\": False,\n", + " \"inference_debugging\": False,\n", + " \"fusion\": False,\n", + " \"max_requests_per_batch\": 1,\n", + " \"max_sequence_length\": 128,\n", + " \"max_tokens_per_batch\": 128,\n", + " \"max_training_steps\": 100,\n", + " \"seed\": 42,\n", + "}\n", + "model_configs = {\n", + " \"base_model\": \"meta-llama/Meta-Llama-3-8B\",\n", + " \"inference_peft_model_id\": \"goliaro/llama-3-8b-lora\",\n", + " \"finetuning_peft_model_id\": \"goliaro/llama-3-8b-lora\",\n", + " \"cache_path\": os.environ.get(\"FF_CACHE_PATH\", \"\"),\n", + " \"refresh_cache\": False,\n", + " \"full_precision\": False,\n", + " # relative paths\n", + " \"inference_dataset\": \"inference_dataset.json\",\n", + " \"finetuning_dataset\": \"/usr/FlexFlow/inference/prompt/peft_dataset.json\",\n", + " \"output_file\": \"peft_demo.txt\",\n", + "}\n", + "generation_configs = {\n", + " \"do_sample\": 
False,\n", + " \"temperature\": 0.9,\n", + " \"topp\": 0.8,\n", + " \"topk\": 1,\n", + "}\n", + "finetuning_configs = {\n", + " \"learning_rate\": 0.001,\n", + " \"momentum\": 0.0,\n", + " \"weight_decay\": 0.0,\n", + " \"nesterov\": False,\n", + "}\n", + "# Merge dictionaries\n", + "configs_dict.update(model_configs)\n", + "configs_dict.update(generation_configs)\n", + "configs_dict.update(finetuning_configs)\n", + "\n", + "configs = SimpleNamespace(**configs_dict)\n", + "\n", + "\n", + "args = [configs.finetuning_peft_model_id+\"-dolly\", '--base_model_name', configs.base_model]\n", + "subprocess.run(['python', '../../utils/download_peft_model.py'] + args)\n", + "\n", + "# Initialize the FlexFlow runtime. ff.init() takes a dictionary or the path to a JSON file with the configs\n", + "ff.init(configs_dict)\n", + "\n", + "# Create the FlexFlow LLM\n", + "ff_data_type = (\n", + " ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF\n", + ")\n", + "llm = ff.LLM(\n", + " configs.base_model,\n", + " data_type=ff_data_type,\n", + " cache_path=configs.cache_path,\n", + " refresh_cache=configs.refresh_cache,\n", + " output_file=configs.output_file,\n", + ")\n", + "\n", + "lora_inference_config2 = ff.LoraLinearConfig(\n", + " llm.cache_path, \n", + " configs.finetuning_peft_model_id+\"-dolly\",\n", + " base_model_name_or_path=configs.base_model\n", + ")\n", + "llm.add_peft(lora_inference_config2)\n", + "\n", + "\n", + "# Compile the LLM for inference and load the weights into memory\n", + "generation_config = ff.GenerationConfig(\n", + " do_sample=configs.do_sample,\n", + " temperature=configs.temperature,\n", + " topp=configs.topp,\n", + " topk=configs.topk\n", + ")\n", + "llm.compile(\n", + " generation_config,\n", + " max_requests_per_batch=configs.max_requests_per_batch,\n", + " max_seq_length=configs.max_sequence_length,\n", + " max_tokens_per_batch=configs.max_tokens_per_batch,\n", + ")\n", + "\n", + "llm.start_server()\n", + "\n", + "prompts = [s for s in json.load(open(configs.inference_dataset))]\n", + "inference_requests = [\n", + " ff.Request(\n", + " ff.RequestType.REQ_INFERENCE,\n", + " prompt=prompt,\n", + " max_sequence_length=configs.max_sequence_length,\n", + " peft_model_id=llm.get_ff_peft_id(lora_inference_config2),\n", + " )\n", + " for prompt in prompts\n", + "]\n", + "inf_req_res_2 = llm.generate(inference_requests)\n", + "\n", + "llm.stop_server()\n", + "\n", + "with open(\"after_finetuning.txt\", \"w\") as file:\n", + " file.write(str(inf_req_res_2[0].output_text))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/inference/python/peft_demo/demo.py b/inference/python/peft_demo/demo.py new file mode 100644 index 0000000000..9e01b4645b --- /dev/null +++ b/inference/python/peft_demo/demo.py @@ -0,0 +1,240 @@ +import json, random, subprocess +from datasets import load_dataset +from types import SimpleNamespace +from huggingface_hub import HfFolder +import os +import flexflow.serve as ff +import matplotlib.pyplot as plt + + +def create_datasets(finetune_dataset_size=2, 
inference_file_path='inference_dataset.json', finetuning_file_path='finetuning_dataset.json'): + """Creates the inference and finetuning datasets according to the data from https://huggingface.co/datasets/databricks/databricks-dolly-15k. + Only the 'open_qa' and 'closed_qa' prompts without context are kept. + The datasets are saved into the files given as arguments. + + Keyword arguments: + dataset_size -- the number of prompts to consider + inference_file_path -- the file in which to save the inference data + finetuning_file_path -- the file in which to save the finetuning data + """ + dataset = load_dataset("databricks/databricks-dolly-15k", split="train") + inference_data = [] + finetuning_data = [] + for row in dataset: + if len(finetuning_data) == finetune_dataset_size: + break + if ("open_qa" in row['category'] or "closed_qa" in row['category']) and len(row['context']) == 0: + inference_data.append(row['instruction']) + finetuning_data.append(row['instruction'] + " " + row['response']) + with open(inference_file_path, 'w') as file: + json.dump(inference_data[:1], file) + with open(finetuning_file_path, 'w') as file: + json.dump(finetuning_data[:1], file, indent=2, separators=(',', ': ')) + + +configs_dict = { + "num_gpus": 1, + "memory_per_gpu": 21000, + "zero_copy_memory_per_node": 40000, + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 1, + "offload": False, + "offload_reserve_space_size": 8 * 1024, # 8GB + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "enable_peft": True, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB + "profiling": False, + "inference_debugging": False, + "fusion": False, + "max_requests_per_batch": 1, + "max_sequence_length": 128, + "max_tokens_per_batch": 128, + "max_training_steps": 100, + "seed": 42, +} +model_configs = { + "base_model": "meta-llama/Meta-Llama-3-8B", + "inference_peft_model_id": "goliaro/llama-3-8b-lora", + "finetuning_peft_model_id": "goliaro/llama-3-8b-lora", + "cache_path": os.environ.get("FF_CACHE_PATH", ""), + "refresh_cache": False, + "full_precision": False, + # relative paths + "inference_dataset": "inference_dataset.json", + "finetuning_dataset": "/usr/FlexFlow/inference/prompt/peft_dataset.json", + "output_file": "peft_demo.txt", +} +generation_configs = { + "do_sample": False, + "temperature": 0.9, + "topp": 0.8, + "topk": 1, +} +finetuning_configs = { + "learning_rate": 0.001, + "momentum": 0.0, + "weight_decay": 0.0, + "nesterov": False, +} +# Merge dictionaries +configs_dict.update(model_configs) +configs_dict.update(generation_configs) +configs_dict.update(finetuning_configs) + + +random.seed(configs_dict["seed"]) + +create_datasets(inference_file_path=configs_dict["inference_dataset"], + finetuning_file_path=configs_dict["finetuning_dataset"]) + +configs = SimpleNamespace(**configs_dict) + +# Clear output file +with open(configs.output_file, 'w') as file: + file.write('') + +# Download base and peft inference models +args = [configs.inference_peft_model_id, '--base_model_name', configs.base_model] +# hf_token = input("Please enter your HuggingFace personal access token: ") +# subprocess.run(['huggingface-cli', 'login', '--token', hf_token]) +subprocess.run(['python', '../../utils/download_peft_model.py'] + args) + + +# Initialize the FlexFlow runtime. 
ff.init() takes a dictionary or the path to a JSON file with the configs +ff.init(configs_dict) + +# Create the FlexFlow LLM +ff_data_type = ( + ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF +) +llm = ff.LLM( + configs.base_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + output_file=configs.output_file, +) +# Add inference and/or finetuning lora +lora_inference_config = None +lora_finetuning_config = None +if len(configs.inference_dataset) > 0: + lora_inference_config = ff.LoraLinearConfig( + llm.cache_path, + configs.inference_peft_model_id, + base_model_name_or_path=configs.base_model + ) + llm.add_peft(lora_inference_config) +if len(configs.finetuning_dataset) > 0: + lora_finetuning_config = ff.LoraLinearConfig( + llm.cache_path, + configs.finetuning_peft_model_id, + trainable=True, + init_lora_weights=False, + rank=16, + lora_alpha=16.0, + # target_modules = ["down_proj"], + base_model_name_or_path=configs.base_model, + optimizer_type=ff.OptimizerType.OPTIMIZER_TYPE_SGD, + optimizer_kwargs={ + "learning_rate": configs.learning_rate, + "momentum": configs.momentum, + "weight_decay": configs.weight_decay, + "nesterov": configs.nesterov, + }, + ) + llm.add_peft(lora_finetuning_config) + +# Compile the LLM for inference and load the weights into memory +generation_config = ff.GenerationConfig( + do_sample=configs.do_sample, + temperature=configs.temperature, + topp=configs.topp, + topk=configs.topk +) +enable_peft_finetuning = len(configs.finetuning_dataset) > 0 +llm.compile( + generation_config, + enable_peft_finetuning=enable_peft_finetuning, + max_requests_per_batch=configs.max_requests_per_batch+int(enable_peft_finetuning), + max_seq_length=configs.max_sequence_length, + max_tokens_per_batch=configs.max_tokens_per_batch, +) + + +llm.start_server() + + +# prompts = [s for s in json.load(open(configs.inference_dataset))] +# inference_requests = [ +# ff.Request( +# ff.RequestType.REQ_INFERENCE, +# prompt=prompt, +# max_sequence_length=configs.max_sequence_length, +# peft_model_id=llm.get_ff_peft_id(lora_inference_config), +# ) +# for prompt in prompts +# ] +# inf_req_res_1 = llm.generate(inference_requests) + + +finetuning_request = ff.Request( + ff.RequestType.REQ_FINETUNING, + max_sequence_length=configs.max_sequence_length, + peft_model_id=llm.get_ff_peft_id(lora_finetuning_config), + dataset_filepath=os.path.join(os.getcwd(), configs.finetuning_dataset), + max_training_steps=configs.max_training_steps, +) +ft_res = llm.generate([finetuning_request]) +for res in ft_res: + print(res.finetuning_losses) + +# exit(0) +# hf_token = input("Please enter your HuggingFace personal access token: ") +# subprocess.run(['huggingface-cli', 'login', '--token', hf_token]) +subprocess.run(['python', '../../utils/upload_peft_model.py'] + f"--peft-model-id {configs.finetuning_peft_model_id} --upload-peft-model-id {configs.finetuning_peft_model_id}-dolly".split()) + + + +lora_inference_config = ff.LoraLinearConfig( + llm.cache_path, + configs.finetuning_peft_model_id, + base_model_name_or_path=configs.base_model +) +llm.add_peft(lora_inference_config) + +args = [configs.finetuning_peft_model_id, '--base_model_name', configs.base_model] +#hf_token = input("Please enter your HuggingFace personal access token: ") +# subprocess.run(['huggingface-cli', 'login', '--token', hf_token]) +# subprocess.run(['python', '../../utils/download_peft_model.py'] + args) + + +prompts = [s for s in json.load(open(configs.inference_dataset))] 
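+# Build one inference request per prompt, routing each request through the
+# finetuned LoRA adapter registered just above (lora_inference_config points at
+# the locally finetuned peft model); the resulting outputs (inf_req_res_2) are
+# compared against the pre-finetuning generations at the end of the script.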
+inference_requests = [ + ff.Request( + ff.RequestType.REQ_INFERENCE, + prompt=prompt, + max_sequence_length=configs.max_sequence_length, + peft_model_id=llm.get_ff_peft_id(lora_inference_config), + ) + for prompt in prompts +] +inf_req_res_2 = llm.generate(inference_requests) + + +llm.stop_server() + + +print("==Inference result before finetuning: ", inf_req_res_1[0].output_text) +print("==Inference result after finetuning: ", inf_req_res_2[0].output_text) + + +epochs = list(range(configs_dict["max_training_steps"])) +loss_values = ft_res[0].finetuning_losses + +plt.figure(figsize=(10, 6)) +plt.plot(epochs, loss_values, marker='o', linestyle='-', color='b') \ No newline at end of file diff --git a/inference/python/spec_infer.py b/inference/python/spec_infer.py index a6dfa8042e..39529abda3 100644 --- a/inference/python/spec_infer.py +++ b/inference/python/spec_infer.py @@ -51,9 +51,12 @@ def get_configs(): "tensor_parallelism_degree": 1, "pipeline_parallelism_degree": 2, "offload": False, - "offload_reserve_space_size": 1024**2, + "offload_reserve_space_size": 8 * 1024, # 8GB "use_4bit_quantization": False, "use_8bit_quantization": False, + "enable_peft": False, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "benchmarking": False, "inference_debugging": False, @@ -76,7 +79,7 @@ def get_configs(): "full_precision": False, } ], - # "prompt": "", + "prompt": "", "output_file": "", } # Merge dictionaries diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 60233ac8d1..9689080825 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -414,15 +414,18 @@ void FlexFlow::top_level_task(Task const *task, /*allow_exceptions */ true, /*ignore_comments */ true); - std::vector prompts; + std::vector requests; for (auto &prompt : prompt_json) { std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); + // Add inference request + Request inference_req; + inference_req.prompt = text; + inference_req.max_sequence_length = 128; + requests.push_back(inference_req); total_num_requests++; - prompts.push_back(text); - // tree_model.generate(text, 128 /*max_sequence_length*/); } - tree_model.generate(prompts, 128 /*max_sequence_length*/); + tree_model.generate(requests); } // terminate the request manager by stopping the background thread diff --git a/inference/utils/download_peft_model.py b/inference/utils/download_peft_model.py new file mode 100644 index 0000000000..38dd577574 --- /dev/null +++ b/inference/utils/download_peft_model.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python +import flexflow.serve as ff +import argparse, os + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--base_model_name", type=str, help="Name of the model to download" + ) + parser.add_argument( + "peft_model_ids", + type=str, + nargs="+", + help="Name of the PEFT model(s) to download", + ) + parser.add_argument( + "--cache-folder", + type=str, + help="Folder to use to store the model(s) assets in FlexFlow format", + default=os.environ.get("FF_CACHE_PATH", ""), + ) + parser.add_argument( + "--refresh-cache", + action="store_true", + help="Use this flag to force the refresh of the model(s) weights/tokenizer cache", + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--full-precision-only", + action="store_true", + help="Only download the full precision version of the weights", + ) + 
group.add_argument( + "--half-precision-only", + action="store_true", + help="Only download the half precision version of the weights", + ) + args = parser.parse_args() + return args + + +def main(args): + if args.full_precision_only: + data_types = (ff.DataType.DT_FLOAT,) + elif args.half_precision_only: + data_types = (ff.DataType.DT_HALF,) + else: + data_types = (ff.DataType.DT_FLOAT, ff.DataType.DT_HALF) + + for data_type in data_types: + llm = ff.LLM( + args.base_model_name, + data_type=data_type, + cache_path=args.cache_folder, + refresh_cache=args.refresh_cache, + ) + for peft_model_id in args.peft_model_ids: + lora_config = ff.LoraLinearConfig(llm.cache_path, peft_model_id) + llm.add_peft(lora_config) + llm.download_hf_weights_if_needed() + llm.download_hf_config() + llm.download_hf_tokenizer_if_needed() + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/inference/utils/upload_peft_model.py b/inference/utils/upload_peft_model.py new file mode 100644 index 0000000000..7098d72f98 --- /dev/null +++ b/inference/utils/upload_peft_model.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python +import argparse, os +from huggingface_hub import HfApi, HfFolder +from transformers import AutoModelForCausalLM +from peft import LoraConfig, PeftModel +import torch +import numpy as np +import flexflow.serve as ff +from peft import LoraConfig, get_peft_model + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Download a PEFT model with FlexFlow, process it, and upload it to the Hugging Face Hub." + ) + parser.add_argument( + "--peft-model-id", + type=str, + required=True, + help="(Local) Hugging Face model ID of the PEFT model to upload.", + ) + parser.add_argument( + "--upload-peft-model-id", + type=str, + required=True, + help="(Remote) Hugging Face model ID of the PEFT model to upload.", + ) + parser.add_argument( + "--cache-folder", + type=str, + default=os.environ.get( + "FF_CACHE_PATH", os.path.expanduser("~/.cache/flexflow") + ), + help="Path to the FlexFlow cache folder", + ) + parser.add_argument( + "--private", + action="store_true", + help="Whether to upload the processed PEFT model as a private model on Hugging Face Hub.", + ) + return parser.parse_args() + + +def main(): + args = parse_args() + + # Ensure Hugging Face CLI is logged in + if not HfFolder.get_token(): + raise RuntimeError( + "Hugging Face token not found. Please login using `huggingface-cli login`." 
+ ) + + lora_config_filepath = os.path.join( + args.cache_folder, + "finetuned_models", + args.peft_model_id, + "config", + "ff_config.json", + ) + peft_config = ff.LoraLinearConfig.from_jsonfile(lora_config_filepath) + print(peft_config) + hf_peft_config = peft_config.to_hf_config() + print(hf_peft_config) + if peft_config.precision != "fp32" and peft_config.precision != "fp16": + raise ValueError(f"Unsupported precision: {peft_config.precision}") + model = AutoModelForCausalLM.from_pretrained( + peft_config.base_model_name_or_path, + torch_dtype=torch.float32 if peft_config.precision == "fp32" else torch.float16, + device_map="auto", + ) + model = get_peft_model(model, hf_peft_config) + in_dim = model.config.intermediate_size + out_dim = model.config.hidden_size + + weight_folder = os.path.join( + args.cache_folder, "finetuned_models", args.peft_model_id, "weights", "shard_0" + ) + num_shards = 1 + while os.path.exists(weight_folder.replace("shard_0", f"shard_{num_shards}")): + num_shards += 1 + if not in_dim % num_shards == 0: + raise ValueError( + f"Number of shards ({num_shards}) must divide the input dimension ({in_dim})" + ) + lora_weight_files = os.listdir(weight_folder) + for lora_file in sorted(lora_weight_files): + lora_filename = ".weight".join(lora_file.split(".weight")[:-1]) + hf_parameter_name = f"base_model.model.model.{lora_filename}.default.weight" + if hf_parameter_name not in model.state_dict().keys(): + raise KeyError(f"Parameter {lora_file} not found in HF model.") + + ff_dtype = np.float32 if peft_config.precision == "fp32" else np.float16 + weight_path = os.path.join(weight_folder, lora_file) + # LoRA_A: [in_dim, rank] + # LoRA_B: [rank, out_dim] + if "lora_A" in lora_file: + weight_data = [] + for shard_id in range(num_shards): + weight_path_shard = weight_path.replace("shard_0", f"shard_{shard_id}") + weight_data_shard = np.fromfile(weight_path_shard, dtype=ff_dtype) + print("===in_dim:", in_dim) + print("===out_dim:", out_dim) + print("===rank:", peft_config.rank) + print("===num_shards:", num_shards) + weight_data_shard = weight_data_shard.reshape( + (in_dim // num_shards, peft_config.rank), order="F" + ) + weight_data.append(weight_data_shard) + weight_data = np.concatenate(weight_data, axis=0).T + elif "lora_B" in lora_file: + weight_data = np.fromfile(weight_path, dtype=ff_dtype) + weight_data = weight_data.reshape((peft_config.rank, out_dim), order="F").T + weight_tensor = torch.from_numpy(weight_data) + + param = model.state_dict()[hf_parameter_name] + + actual_numel = weight_tensor.numel() + expected_numel = param.numel() + if actual_numel != expected_numel: + raise ValueError( + f"Parameter {lora_file} has unexpected parameter count: {actual_numel} (actual) != {expected_numel} (expected)" + ) + + if weight_tensor.shape != param.shape: + raise ValueError( + f"Parameter {lora_file} has unexpected shape: {weight_tensor.shape} (actual) != {param.shape} (expected)" + ) + if weight_tensor.dtype != param.dtype: + raise ValueError( + f"Parameter {lora_file} has unexpected dtype: {weight_tensor.dtype} (actual) != {param.dtype} (expected)" + ) + + with torch.no_grad(): + param.copy_(weight_tensor) + + model.push_to_hub(f"{args.upload_peft_model_id}", use_auth_token=True, private=args.private) + + print("Upload process completed.") + + +if __name__ == "__main__": + main() diff --git a/python/flexflow/core/__init__.py b/python/flexflow/core/__init__.py index 2820cf485a..b8ed15eaea 100644 --- a/python/flexflow/core/__init__.py +++ b/python/flexflow/core/__init__.py @@ 
-88,7 +88,10 @@ "offload": "-offload", "offload_reserve_space_size": "-offload-reserve-space-size", "use_4bit_quantization": "--4bit-quantization", - "use_8bit_quantization": "--8bit-quantization" + "use_8bit_quantization": "--8bit-quantization", + "enable_peft": "-enable-peft", + "peft_activation_reserve_space_size": "-peft-activation-reserve-space-size", + "peft_weight_reserve_space_size": "-peft-weight-reserve-space-size", } diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 14cf4eebf7..7692ccb88f 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -28,6 +28,8 @@ CompMode, MetricsType, InferenceMode, + RequestType, + OptimizerType, ModelType, OpType, ParameterSyncType, @@ -36,6 +38,9 @@ ) from flexflow.config import * from .flexflowlib import ffi, flexflow_library +from typing import Union, List +from peft import LoraConfig +import json def ffc(): @@ -1243,1009 +1248,935 @@ def get_weights(self, ffmodel): # ----------------------------------------------------------------------- -# FFModel +# SGDOptimizer # ----------------------------------------------------------------------- -class FFModel(object): - """ """ +class SGDOptimizer(object): + __slots__ = ["handle", "_handle"] - __slots__ = [ - "handle", - "_handle", - "_layers", - "_nb_layers", - "_ffconfig", - "_tracing_id", - "initializers", - "attr_tensors", - ] + def __init__( + self, ffmodel, lr=0.01, momentum=0.0, nesterov=False, weight_decay=0.0 + ): + self.handle = ffc().flexflow_sgd_optimizer_create( + ffmodel.handle, lr, momentum, nesterov, weight_decay + ) + self._handle = ffi.gc(self.handle, ffc().flexflow_sgd_optimizer_destroy) - def __init__(self, ffconfig): - """Constructor of FFModel. + def set_learning_rate(self, learning_rate): + ffc().flexflow_sgd_optimizer_set_lr(self.handle, learning_rate) - :param ffconfig: configurations of FlexFlow and the created model. - :type ffconfig: FFConfig - :returns: FFModel -- the model. - """ - self.handle = ffc().flexflow_model_create(ffconfig.handle, ffconfig.cpu_offload) - self._handle = ffi.gc(self.handle, ffc().flexflow_model_destroy) - self._layers = dict() - self._nb_layers = 0 - self._ffconfig = ffconfig - global ff_tracing_id - self._tracing_id = ff_tracing_id - ff_tracing_id += 1 - self.initializers = {} - self.attr_tensors = {} +# ----------------------------------------------------------------------- +# AdamOptimizer +# ----------------------------------------------------------------------- - def get_layers(self): - return self._layers - def add_layer(self, op_type, name): - layer_id = self._nb_layers - op_handle = ffc().flexflow_model_get_last_layer(self.handle) - self._layers[self._nb_layers] = convert_op_handle_to_op( - op_type, op_handle, idx=layer_id, name=name +class AdamOptimizer(object): + __slots__ = ["handle", "_handle"] + + def __init__( + self, + ffmodel, + alpha=0.001, + beta1=0.9, + beta2=0.999, + weight_decay=0.0, + epsilon=1e-8, + ): + self.handle = ffc().flexflow_adam_optimizer_create( + ffmodel.handle, alpha, beta1, beta2, weight_decay, epsilon ) - self._nb_layers += 1 + self._handle = ffi.gc(self.handle, ffc().flexflow_adam_optimizer_destroy) - def create_tensor(self, dims, data_type, create_grad=True): - """Instantiate a FlexFlow tensor. + def set_learning_rate(self, learning_rate): + ffc().flexflow_adam_optimizer_set_lr(self.handle, learning_rate) - :param x: a shape tuple/list (integers), including the batch size. 
- :type x: list of int - :param data_type: the datatype of the created tensor. Options are - DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64, DT_BOOLEAN. - :type data_type: DataType +# ----------------------------------------------------------------------- +# Initializer +# ----------------------------------------------------------------------- +class Initializer(object): + __slots__ = ["handle", "p_handle"] - :param create_grad: weather the tensor creates a gradients vector. - If you don't specify anything, a gradients vector is used. - :type create_grad: bool + def __init__(self, handle, p_handle=0): + self.p_handle = ffi.new("flexflow_initializer_t *") + if handle == None: + self.p_handle.impl = ffi.NULL + else: + self.p_handle.impl = handle.impl + self.handle = self.p_handle[0] + assert ffi.typeof(self.handle) == ffi.typeof( + "flexflow_initializer_t" + ), "Initializer handle is wrong" - :returns: Tensor -- the output tensor. - """ - c_dims = ffi.new("int[]", dims) - c_data_type = enum_to_int(DataType, data_type) - num_dims = len(dims) - handle = ffc().flexflow_tensor_create( - self.handle, num_dims, c_dims, c_data_type, create_grad - ) - return Tensor(handle) - def map_tensor(self, tensor, parallel_op=None): - op_handle = self.__get_op_handle(parallel_op) - ffc().flexflow_tensor_map(self.handle, tensor.handle, op_handle) +# ----------------------------------------------------------------------- +# GlorotUniform +# ----------------------------------------------------------------------- - def create_constant(self, dims, value, data_type): - c_dims = ffi.new("int[]", dims) - c_data_type = enum_to_int(DataType, data_type) - num_dims = len(dims) - handle = ffc().flexflow_constant_create( - self.handle, num_dims, c_dims, value, c_data_type - ) - return Tensor(handle) - def exp(self, x, name=None): - """Exponential activation function. +class GlorotUniformInitializer(Initializer): + __slots__ = ["glorot_handle", "_glorot_handle"] - :param x: the input Tensor. - :type x: Tensor + def __init__(self, seed): + self.glorot_handle = ffc().flexflow_glorot_uniform_initializer_create(seed) + self._glorot_handle = ffi.gc( + self.glorot_handle, ffc().flexflow_glorot_uniform_initializer_destroy + ) + super(GlorotUniformInitializer, self).__init__(self.glorot_handle) - :param name: the name of the layer. Default is None. - :type name: string - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_exp(self.handle, x.handle, c_name) - self.add_layer(OpType.EXP, name) - return Tensor(handle, owner_op_type=OpType.EXP) +# ----------------------------------------------------------------------- +# ZeroInitializer +# ----------------------------------------------------------------------- - def sin(self, x, name=None): - """Elementwise sine function. - :param x: the input Tensor. - :type x: Tensor +class ZeroInitializer(Initializer): + __slots__ = ["zero_handle", "_zero_handle"] - :param name: the name of the layer. Default is None. - :type name: string + def __init__(self): + self.zero_handle = ffc().flexflow_zero_initializer_create() + self._zero_handle = ffi.gc( + self.zero_handle, ffc().flexflow_zero_initializer_destroy + ) + super(ZeroInitializer, self).__init__(self.zero_handle) - :returns: Tensor -- the output tensor. 
- """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_sin(self.handle, x.handle, c_name) - self.add_layer(OpType.SIN, name) - return Tensor(handle, owner_op_type=OpType.SIN) - def cos(self, x, name=None): - """Elementwise cosine function. +# ----------------------------------------------------------------------- +# UniformInitializer +# ----------------------------------------------------------------------- - :param x: the input Tensor. - :type x: Tensor - :param name: the name of the layer. Default is None. - :type name: string +class UniformInitializer(Initializer): + __slots__ = ["uniform_handle", "_uniform_handle"] - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_cos(self.handle, x.handle, c_name) - self.add_layer(OpType.COS, name) - return Tensor(handle, owner_op_type=OpType.COS) + def __init__(self, seed, minv, maxv): + self.uniform_handle = ffc().flexflow_uniform_initializer_create( + seed, minv, maxv + ) + self._uniform_handle = ffi.gc( + self.uniform_handle, ffc().flexflow_uniform_initializer_destroy + ) + super(UniformInitializer, self).__init__(self.uniform_handle) - def add(self, x, y, inplace_a=False, name=None): - """Layer that adds two input Tensors, :attr:`output = x + y`. - :param x: the first input Tensor. - :type x: Tensor +# ----------------------------------------------------------------------- +# NormInitializer +# ----------------------------------------------------------------------- - :param y: the second input Tensor. - :type y: Tensor - :param name: the name of the layer. Default is None. - :type name: string +class NormInitializer(Initializer): + __slots__ = ["norm_handle", "_norm_handle"] - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_add( - self.handle, x.handle, y.handle, inplace_a, c_name + def __init__(self, seed, mean, stddev): + self.norm_handle = ffc().flexflow_norm_initializer_create(seed, mean, stddev) + self._norm_handle = ffi.gc( + self.norm_handle, ffc().flexflow_norm_initializer_destroy ) - self.add_layer(OpType.ADD, name) - return Tensor(handle, owner_op_type=OpType.ADD) - - def subtract(self, x, y, inplace_a=False, name=None): - """Layer that subtracts two input Tensors, :attr:`output = x * y`. + super(NormInitializer, self).__init__(self.norm_handle) - :param x: the first input Tensor. - :type x: Tensor - :param y: the second input Tensor. - :type y: Tensor +# ----------------------------------------------------------------------- +# PerfMetrics +# ----------------------------------------------------------------------- - :param name: the name of the layer. Default is None. - :type name: string - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_subtract( - self.handle, x.handle, y.handle, inplace_a, c_name - ) - self.add_layer(OpType.SUBTRACT, name) - return Tensor(handle, owner_op_type=OpType.SUBTRACT) +class PerfMetrics(object): + __slots__ = ["handle", "_handle"] - def multiply(self, x, y, inplace_a=False, name=None): - """Layer that multiplies (element-wise) two input Tensors, :attr:`output = x * y`. + def __init__(self, handle): + self.handle = handle + self._handle = ffi.gc(self.handle, ffc().flexflow_per_metrics_destroy) - :param x: the first input Tensor. - :type x: Tensor + def get_accuracy(self): + return ffc().flexflow_per_metrics_get_accuracy(self.handle) - :param y: the second input Tensor. 
- :type y: Tensor - :param name: the name of the layer. Default is None. - :type name: string +# ----------------------------------------------------------------------- +# NetConfig +# ----------------------------------------------------------------------- - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_multiply( - self.handle, x.handle, y.handle, inplace_a, c_name - ) - self.add_layer(OpType.MULTIPLY, name) - return Tensor(handle, owner_op_type=OpType.MULTIPLY) - def divide(self, x, y, inplace_a=False, name=None): - """Layer that divides (element-wise) two input Tensors, :attr:`output = x / y`. - - :param x: the first input Tensor. - :type x: Tensor - - :param y: the second input Tensor. - :type y: Tensor +class NetConfig(object): + def __init__(self): + self.handle = ffc().flexflow_net_config_create() + self._handle = ffi.gc(self.handle, ffc().flexflow_net_config_destroy) + cpath = ffc().flexflow_net_config_get_dataset_path(self.handle) + self.dataset_path = ffi.string(cpath) - :param name: the name of the layer. Default is None. - :type name: string - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_divide( - self.handle, x.handle, y.handle, inplace_a, c_name - ) - self.add_layer(OpType.DIVIDE, name) - return Tensor(handle, owner_op_type=OpType.DIVIDE) +# ----------------------------------------------------------------------- +# DLRMConfig +# ----------------------------------------------------------------------- - def max(self, x, y, inplace_a=False, name=None): - """Layer that computes the max (element-wise) two input Tensors, :attr:`output = max(x,y)`. - :param x: the first input Tensor. - :type x: Tensor +class DLRMConfig(object): + def __init__(self): + self.handle = ffc().flexflow_dlrm_config_create() + self._handle = ffi.gc(self.handle, ffc().flexflow_dlrm_config_destroy) - :param y: the second input Tensor. - :type y: Tensor + cstr = ffc().flexflow_dlrm_config_get_dataset_path(self.handle) + self.dataset_path = ffi.string(cstr) - :param name: the name of the layer. Default is None. - :type name: string + cstr = ffc().flexflow_dlrm_config_get_arch_interaction_op(self.handle) + self.arch_interaction_op = ffi.string(cstr) - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_max( - self.handle, x.handle, y.handle, inplace_a, c_name + self.sparse_feature_size = ffc().flexflow_dlrm_config_get_sparse_feature_size( + self.handle ) - self.add_layer(OpType.MAX, name) - return Tensor(handle, owner_op_type=OpType.MAX) + self.sigmoid_bot = ffc().flexflow_dlrm_config_get_sigmoid_bot(self.handle) + self.sigmoid_top = ffc().flexflow_dlrm_config_get_sigmoid_top(self.handle) + self.embedding_bag_size = ffc().flexflow_dlrm_config_get_embedding_bag_size( + self.handle + ) + self.loss_threshold = ffc().flexflow_dlrm_config_get_loss_threshold(self.handle) - def min(self, x, y, inplace_a=False, name=None): - """Layer that computes the min (element-wise) two input Tensors, :attr:`output = min(x,y)`. + mlp_bot_c = ffc().flexflow_dlrm_config_get_mlp_bot(self.handle) + self.mlp_bot = [] + for i in range(0, mlp_bot_c[0]): + self.mlp_bot.append(mlp_bot_c[i + 1]) - :param x: the first input Tensor. - :type x: Tensor + mlp_top_c = ffc().flexflow_dlrm_config_get_mlp_top(self.handle) + self.mlp_top = [] + for i in range(0, mlp_top_c[0]): + self.mlp_top.append(mlp_top_c[i + 1]) - :param y: the second input Tensor. 
- :type y: Tensor + embedding_size_c = ffc().flexflow_dlrm_config_get_embedding_size(self.handle) + self.embedding_size = [] + for i in range(0, embedding_size_c[0]): + self.embedding_size.append(embedding_size_c[i + 1]) - :param name: the name of the layer. Default is None. - :type name: string - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_min( - self.handle, x.handle, y.handle, inplace_a, c_name - ) - self.add_layer(OpType.MIN, name) - return Tensor(handle, owner_op_type=OpType.MIN) +# ----------------------------------------------------------------------- +# Single DataLoader +# ----------------------------------------------------------------------- - def reduce_sum(self, input, axes, keepdims=False, name=None): - """Layer that computes the sum of the input Tensor along given axes. - :param input: the input Tensor. - :type input: Tensor +class SingleDataLoader(object): + __slots__ = ["handle", "_handle"] - :param axes: the axes along which reduction is applied - :type axes: List[int] + def __init__(self, ffmodel, input, full_input, num_samples, data_type): + assert type(ffmodel) is FFModel, "SingleDataLoader ffmodel is wrong" + assert type(input) is Tensor, "SingleDataLoader input is wrong" + if type(full_input) is Tensor: + self.init_from_tensor(ffmodel, input, full_input, num_samples, data_type) + else: + self.init_from_ptr(ffmodel, input, full_input, num_samples, data_type) + self._handle = ffi.gc(self.handle, ffc().flexflow_single_dataloader_destroy) - :param name: the name of the layer. Default is None. - :type name: string + def init_from_tensor(self, ffmodel, input, full_input, num_samples, data_type): + assert type(full_input) is Tensor, "SingleDataLoader full_input is wrong" + c_data_type = enum_to_int(DataType, data_type) + self.handle = ffc().flexflow_single_dataloader_create( + ffmodel.handle, input.handle, full_input.handle, num_samples, c_data_type + ) - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - c_axes = ffi.new("int[]", axes) - handle = ffc().flexflow_model_add_reduce_sum( - self.handle, input.handle, c_axes, len(axes), keepdims, c_name + def init_from_ptr(self, ffmodel, input, full_input, num_samples, data_type): + # assert type(full_input) is Tensor, "SingleDataLoader full_input is wrong" + c_data_type = enum_to_int(DataType, data_type) + self.handle = ffc().flexflow_single_dataloader_create2( + ffmodel.handle, input.handle, full_input, num_samples, c_data_type ) - self.add_layer(OpType.REDUCE_SUM, name) - return Tensor(handle, owner_op_type=OpType.REDUCE_SUM) - def rsqrt(self, input, name=None): - """Layer that computes the element-wise reciprocal square-root. + @property + def num_samples(self): + return ffc().flexflow_single_dataloader_get_num_samples(self.handle) - :param input: the input Tensor. - :type input: Tensor + @num_samples.setter + def num_samples(self, samples): + ffc().flexflow_single_dataloader_set_num_samples(self.handle, samples) - :param name: the name of the layer. Default is None. - :type name: string + def next_batch(self, ffmodel): + """Ask the dataloder to load the next batch to the :attr:`batch_tensor`. - :returns: Tensor -- the output tensor. + :returns: None -- no returns. 
""" - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_rsqrt(self.handle, input.handle, c_name) - self.add_layer(OpType.RSQRT, name) - return Tensor(handle, owner_op_type=OpType.RSQRT) + ffc().flowflow_single_dataloader_next_batch(self.handle, ffmodel.handle) - def pow(self, input, exponent, name=None): - """Layer that computes the element-wise power. + def reset(self): + """Reset the current position of the dataloder to 0. - :param input: the input Tensor. - :type input: Tensor + :returns: None -- no returns. + """ + ffc().flexflow_single_dataloader_reset(self.handle) - :param exponent: exponent to raise each element in the input tensor. - :type exponent: float - :param name: the name of the layer. Default is None. - :type name: string +class RegionNdarray(object): + __slots__ = ["__array_interface__"] - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_pow( - self.handle, input.handle, exponent, c_name - ) - self.add_layer(OpType.POW, name) - return Tensor(handle, owner_op_type=OpType.POW) + def __init__(self, shape, data_type, base_ptr, strides, read_only): + # See: https://docs.scipy.org/doc/numpy/reference/arrays.interface.html + if data_type == DataType.DT_HALF: + field_type = " 0: + raise ValueError( + "Target modules can only be specified when init_lora_weights=True" + ) + else: + if init_lora_weights: + raise ValueError( + "LORA weights initialization from scratch not supported in inference model" + ) + if len(target_modules) > 0: + raise ValueError( + "Target modules can only be specified when trainable=True" + ) + + # Check rank, lora_alpha, lora_dropout values + if rank is not None or lora_alpha is not None or lora_dropout is not None: + if not trainable or not init_lora_weights: + raise ValueError( + "rank, lora_alpha, and lora_dropout can only be set when trainable=True and init_lora_weights=True" + ) + rank = rank if rank is not None else 8 + lora_alpha = lora_alpha if lora_alpha is not None else 8.0 + lora_dropout = lora_dropout if lora_dropout is not None else 0.0 + + # If passed, check if the values of rank, lora_alpha, and lora_dropout are valid + if rank < 1 or type(rank) != int: + raise ValueError("Rank must be >= 1 and an integer") + if lora_alpha <= 0: + raise ValueError("Lora_alpha must be > 0") + if lora_dropout < 0 or lora_dropout > 1: + raise ValueError("Lora_dropout must be in the interval [0, 1]") + + self.ff_initialized = False + self._cache_folder = cache_folder + self._peft_model_id = peft_model_id + self._trainable = trainable + self._init_lora_weights = init_lora_weights + self._base_model_name_or_path = base_model_name_or_path + self._precision = precision + self._rank = rank + self._lora_alpha = lora_alpha + self._lora_dropout = lora_dropout + self._target_modules = target_modules + self.optimizer_type = optimizer_type + self.optimizer_kwargs = optimizer_kwargs + + def ff_compile(self): + c_cache_folder = get_c_name(os.path.expanduser(self.cache_folder)) + peft_model_id = get_c_name(self.peft_model_id) + base_model_name_or_path = get_c_name(self.base_model_name_or_path) + precision = get_c_name(self.precision) + c_target_modules = [ + get_c_name(target_module) for target_module in self.target_modules + ] + c_optimizer_type = enum_to_int(OptimizerType, self.optimizer_type) + # SGD optional optimizer args + sgd_learning_rate = self.optimizer_kwargs.get("learning_rate", 0.001) + sgd_momentum = self.optimizer_kwargs.get("momentum", 0.0) + sgd_nesterov = 
self.optimizer_kwargs.get("nesterov", False) + sgd_weight_decay = self.optimizer_kwargs.get("weight_decay", 0.0) + # Adam optional optimizer args + adam_alpha = self.optimizer_kwargs.get("alpha", 0.001) + adam_beta1 = self.optimizer_kwargs.get("beta1", 0.9) + adam_beta2 = self.optimizer_kwargs.get("beta2", 0.999) + adam_weight_decay = self.optimizer_kwargs.get("weight_decay", 0.0) + adam_epsilon = self.optimizer_kwargs.get("epsilon", 1e-8) + self.handle = ffc().flexflow_lora_linear_config_create( + c_cache_folder, + peft_model_id, + self.trainable, + self.init_lora_weights, + base_model_name_or_path, + precision, + self.rank, + self.lora_alpha, + self.lora_dropout, + len(self.target_modules), + c_target_modules, + c_optimizer_type, + sgd_learning_rate, + sgd_momentum, + sgd_nesterov, + sgd_weight_decay, + adam_alpha, + adam_beta1, + adam_beta2, + adam_weight_decay, + adam_epsilon, + ) + self._handle = ffi.gc(self.handle, ffc().flexflow_lora_linear_config_destroy) + self.ff_initialized = True + + @classmethod + def from_jsonfile(self, jsonfile: str): + with open(jsonfile, "r") as file: + config = json.load(file) + config_dict = dict(config) + config_dict["optimizer_type"] = OptimizerType.OPTIMIZER_TYPE_SGD + return LoraLinearConfig(**config_dict) + + def to_hf_config(self) -> LoraConfig: + return LoraConfig( + base_model_name_or_path=self.base_model_name_or_path, + r=self.rank, + target_modules=self.target_modules, + lora_alpha=self.lora_alpha, + lora_dropout=self.lora_dropout, + ) - :param padding_h: the amount of implicit zero-paddings along the height: :math:`P_{H}`. - :type padding_h: int + @property + def cache_folder(self): + if self.ff_initialized: + c_cache_folder = ffc().flexflow_lora_linear_config_get_cache_folder( + self.handle + ) + return ffi.string(c_cache_folder).decode("utf-8") + else: + return self._cache_folder - :param padding_w: the amount of implicit zero-paddings along the width: :math:`P_{W}`. - :type padding_w: int + @property + def peft_model_id(self): + if self.ff_initialized: + c_peft_model_id = ffc().flexflow_lora_linear_config_get_peft_model_id( + self.handle + ) + return ffi.string(c_peft_model_id).decode("utf-8") + else: + return self._peft_model_id - :param activation: Tyoe of pooling function to use. If you don't specify anything, PoolType.POOL_MAX is applied. - :type activation: PoolType + @property + def rank(self): + if self.ff_initialized: + return ffc().flexflow_lora_linear_config_get_rank(self.handle) + else: + return self._rank - :param activation: Activation function to use. Default is ActiMode.AC_MODE_NONE. - :type activation: ActiMode + @property + def lora_alpha(self): + if self.ff_initialized: + return ffc().flexflow_lora_linear_config_get_lora_alpha(self.handle) + else: + return self._lora_alpha - :param name: the name of the layer. Default is None. - :type name: string + @property + def lora_dropout(self): + if self.ff_initialized: + return ffc().flexflow_lora_linear_config_get_lora_dropout(self.handle) + else: + return self._lora_dropout - :returns: Tensor -- the output tensor. 
- """ - c_name = get_c_name(name) - c_pool_type = enum_to_int(PoolType, pool_type) - c_activation = enum_to_int(ActiMode, activation) - handle = ffc().flexflow_model_add_pool2d( - self.handle, - input.handle, - kernel_h, - kernel_w, - stride_h, - stride_w, - padding_h, - padding_w, - c_pool_type, - c_activation, - c_name, - ) - self.add_layer(OpType.POOL2D, name) - return Tensor(handle, owner_op_type=OpType.POOL2D) + @property + def trainable(self): + if self.ff_initialized: + return ffc().flexflow_lora_linear_config_get_trainable(self.handle) + else: + return self._trainable - def batch_norm(self, input, relu=True, name=None): - """Layer that normalizes its inputs. + @property + def init_lora_weights(self): + if self.ff_initialized: + return ffc().flexflow_lora_linear_config_get_init_lora_weights(self.handle) + else: + return self._init_lora_weights - Batch normalization applies a transformation that maintains the mean output close to 0 and the output standard deviation close to 1. + @property + def base_model_name_or_path(self): + if self.ff_initialized: + c_base_model_name_or_path = ( + ffc().flexflow_lora_linear_config_get_base_model_name_or_path( + self.handle + ) + ) + return ffi.string(c_base_model_name_or_path).decode("utf-8") + else: + return self._base_model_name_or_path - :param input: the list of input Tensors. - :type input: Tensor + @property + def precision(self): + if self.ff_initialized: + c_precision = ffc().flexflow_lora_linear_config_get_precision(self.handle) + return ffi.string(c_precision).decode("utf-8") + else: + return self._precision - :param relu: whether a ReLU function is applied. Default is True. - :type relu: bool + @property + def target_modules(self): + if self.ff_initialized: + num_target_modules = ffi.new("int *") + c_target_modules = ffc().flexflow_lora_linear_config_get_target_modules( + self.handle, num_target_modules + ) + target_modules = [] + for i in range(num_target_modules[0]): + target_modules.append(ffi.string(c_target_modules[i]).decode("utf-8")) + return target_modules + else: + return self._target_modules - :param name: the name of the layer. Default is None. - :type name: string + @cache_folder.setter + def cache_folder(self, value: str): + self._cache_folder = value + if self.ff_initialized: + ffc().flexflow_lora_linear_config_set_cache_folder(self.handle, value) - :returns: Tensor -- the output tensor. 
- """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_batch_norm( - self.handle, input.handle, relu, c_name - ) - self.add_layer(OpType.BATCH_NORM, name) - return Tensor(handle, owner_op_type=OpType.BATCH_NORM) + @peft_model_id.setter + def peft_model_id(self, value: str): + self._peft_model_id = value + if self.ff_initialized: + ffc().flexflow_lora_linear_config_set_peft_model_id(self.handle, value) - def layer_norm( - self, input, axes, elementwise_affine=True, eps=1e-5, use_bias=True, name=None - ): - """Add a LayerNorm layer + @rank.setter + def rank(self, value: int): + self._rank = value + if self.ff_initialized: + ffc().flexflow_lora_linear_config_set_rank(self.handle, value) - :param input: The input tensor - :type input: Tensor - :param axes: Indicate which axes (starting from the end) the LayerNorm should normalize over - :type axes: Union[int, List[int]] - :param elementwise_affine: Whether the LayerNorm should use the gamma weight for scaling, defaults to True - :type elementwise_affine: bool, optional - :param eps: A small float value added to the LayerNorm denominator for numerical stability, defaults to 1e-5 - :type eps: float, optional - :param use_bias: Whether to add a beta bias to the LayerNorm result, defaults to True - :type use_bias: bool, optional - :param name: Name of the operator, also used for loading weights in inference mode, defaults to None - :type name: _type_, optional - :return: The LayerNorm output tensor - :rtype: Tensor - """ - c_name = get_c_name(name) - c_axes = ffi.new("int[]", axes) - handle = ffc().flexflow_model_add_layer_norm( - self.handle, - input.handle, - len(axes), - c_axes, - elementwise_affine, - eps, - use_bias, - c_name, - ) - self.add_layer(OpType.LAYER_NORM, name) - return Tensor(handle, owner_op_type=OpType.LAYER_NORM) + @lora_alpha.setter + def lora_alpha(self, value: float): + self._lora_alpha = value + if self.ff_initialized: + ffc().flexflow_lora_linear_config_set_lora_alpha(self.handle, value) - def residual_layer_norm( - self, - input, - residual1, - residual2, - use_two_residuals, - axes, - elementwise_affine=True, - eps=1e-5, - use_bias=True, - name=None, - ): - """Add a fused LayerNorm + Residual layer. This operator uses a single kernel, resulting in - better efficiency compared to using separate element-wise add and LayerNorm operators. 
+ @lora_dropout.setter + def lora_dropout(self, value: float): + self._lora_dropout = value + if self.ff_initialized: + ffc().flexflow_lora_linear_config_set_lora_dropout(self.handle, value) - :param input: The input tensor - :type input: Tensor - :param residual1: The residual tensor to add to the input before computing the LayerNorm - :type residual1: Tensor - :param residual2: An optional second residual tensor to add to the input (in addition to residual1) before computing the LayerNorm - :type residual2: Tensor - :param use_two_residuals: A boolean that should be set to True if using the second optional residual, False otherwise - :type use_two_residuals: bool - :param axes: Indicate which axes (starting from the end) the LayerNorm should normalize over - :type axes: List[int] - :param elementwise_affine: Whether the LayerNorm should use the gamma weight for scaling, defaults to True - :type elementwise_affine: bool, optional - :param eps: A small float value added to the LayerNorm denominator for numerical stability, defaults to 1e-5 - :type eps: float, optional - :param use_bias: Whether to add a beta bias to the LayerNorm result, defaults to True - :type use_bias: bool, optional - :param name: Name of the operator, also used for loading weights in inference mode, defaults to None - :type name: str, optional - :return: A tensor with the sum of the input and residual(s), and the LayerNorm output - :rtype: (Tensor, Tensor) - """ - c_name = get_c_name(name) - c_axes = ffi.new("int[]", axes) - residual2_handle = ( - residual1.handle - ) # This is intentional. Data will be ignored, and we cannot pass None - if use_two_residuals: - assert residual2 is not None - residual2_handle = residual2.handle - handles_array = ffc().flexflow_model_add_residual_layer_norm( - self.handle, - input.handle, - residual1.handle, - residual2_handle, - use_two_residuals, - len(axes), - c_axes, - elementwise_affine, - eps, - use_bias, - c_name, - ) - self.add_layer(OpType.RESIDUAL_LAYERNORM, name) - return Tensor( - handles_array[0], owner_op_type=OpType.RESIDUAL_LAYERNORM - ), Tensor(handles_array[1], owner_op_type=OpType.RESIDUAL_LAYERNORM) + @trainable.setter + def trainable(self, value: bool): + self._trainable = value + if self.ff_initialized: + ffc().flexflow_lora_linear_config_set_trainable(self.handle, value) - def add_bias_residual_layer_norm( - self, - input, - residual, - axes, - elementwise_affine=True, - eps=1e-5, - use_bias=True, - name=None, - ): - """Add a Attention Bias + Residual + LayerNorm layer. This operator uses a single kernel, - resulting in better efficiency compared to using separate attention bias addition + - element-wise residual addition + LayerNorm operators. 
+ @init_lora_weights.setter + def init_lora_weights(self, value: bool): + self._init_lora_weights = value + if self.ff_initialized: + ffc().flexflow_lora_linear_config_set_init_lora_weights(self.handle, value) - :param input: The input tensor - :type input: Tensor - :param residual: The residual tensor - :type residual: Tensor - :param axes: Indicate which axes (starting from the end) the LayerNorm should normalize over - :type axes: Union[int, List[int]] - :param elementwise_affine: Whether the LayerNorm should use the gamma weight for scaling, defaults to True - :type elementwise_affine: bool, optional - :param eps: A small float value added to the LayerNorm denominator for numerical stability, defaults to 1e-5 - :type eps: float, optional - :param use_bias: Whether to add a beta bias to the LayerNorm result, defaults to True - :type use_bias: bool, optional - :param name: Name of the operator, also used for loading weights in inference mode, defaults to None - :type name: _type_, optional - :return: A tensor with the sum of the attention bias, input and residual(s), and the LayerNorm output - :rtype: (Tensor, Tensor) - """ - c_name = get_c_name(name) - c_axes = ffi.new("int[]", axes) - handles_array = ffc().flexflow_model_add_add_bias_residual_layer_norm( - self.handle, - input.handle, - residual.handle, - len(axes), - c_axes, - elementwise_affine, - eps, - use_bias, - c_name, - ) - self.add_layer(OpType.ADD_BIAS_RESIDUAL_LAYERNORM, name) - return Tensor( - handles_array[0], owner_op_type=OpType.ADD_BIAS_RESIDUAL_LAYERNORM - ), Tensor(handles_array[1], owner_op_type=OpType.ADD_BIAS_RESIDUAL_LAYERNORM) - def sigmoid_silu_multi(self, input1, input2, name=None): - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_sigmoid_silu_multi( - self.handle, input1.handle, input2.handle, c_name - ) - self.add_layer(OpType.SIGMOID_SILU_MULTI, name) - return Tensor(handle, owner_op_type=OpType.SIGMOID_SILU_MULTI) +# ----------------------------------------------------------------------- +# PEFTModelID +# ----------------------------------------------------------------------- - def batch_matmul( - self, A, B, a_seq_length_dim=None, b_seq_length_dim=None, name=None - ): - """Layer that applied batched matrix multiplication onto two input Tensors, :attr:`output = x * y`. - :param A: the first input Tensor. - :type A: Tensor +class PEFTModelID(object): + __slots__ = ["handle", "_handle"] - :param B: the second input Tensor. - :type B: Tensor + __no_id_h = None - :param a_seq_length_dim: an int when set indicating the a_seq_length_dim dimention of A is a sequence_length dimension - :type a_seq_length_dim: int + def __init__(self, id=None): + if id is None: + self.handle = ffc().flexflow_peft_model_id_create() + else: + self.handle = ffc().flexflow_peft_model_id_create_id(id) + self._handle = ffi.gc(self.handle, ffc().flexflow_peft_model_id_destroy) - :param b_seq_length_dim: an int when set indicating the b_seq_length_dim dimention of B is a sequence_length dimension - :type b_seq_length_dim: int + @staticmethod + def no_id_handle(): + if PEFTModelID.__no_id_h is None: + PEFTModelID.__no_id_h = ffc().flexflow_peft_model_id_no_id() + return PEFTModelID.__no_id_h - :param name: the name of the layer. Default is None. 
- :type name: string - :param name: Whether to add use bias in layer normalization - :type name: bool +# ----------------------------------------------------------------------- +# Request +# ----------------------------------------------------------------------- - :returns: Tensor -- the output tensor. - """ - if a_seq_length_dim is None: - a_seq_length_dim = -1 - if b_seq_length_dim is None: - b_seq_length_dim = -1 - handle = ffc().flexflow_model_add_batch_matmul( - self.handle, A.handle, B.handle, a_seq_length_dim, b_seq_length_dim - ) - self.add_layer(OpType.BATCH_MATMUL, name) - return Tensor(handle, owner_op_type=OpType.BATCH_MATMUL) - def dense( +class Request: + """A class to record the metadata of an inference or finetuning request.""" + + def __init__( self, - input, - out_dim, - activation=ActiMode.AC_MODE_NONE, - use_bias=True, - datatype=DataType.DT_NONE, - shared_op=None, - kernel_initializer=None, - bias_initializer=None, - kernel_regularizer=None, - name=None, + req_type: RequestType, + prompt: str = None, + max_sequence_length: int = 128, + peft_model_id: PEFTModelID = None, + dataset_filepath: str = None, + max_training_steps: int = 1, ): - """Dense implements the operation: :attr:`output = activation(dot(input, kernel) + bias)` where - :attr:`activation` is the element-wise activation function passed as the activation argument, - :attr:`kernel` is a weights matrix created by the layer, and - :attr:`bias` is a bias vector created by the layer (only applicable if :attr:`use_bias` is True). + self.req_type = req_type + self.prompt = prompt + self.max_sequence_length = max_sequence_length + self.peft_model_id = peft_model_id + self.dataset_filepath = dataset_filepath + self.max_training_steps = max_training_steps - The size of input tensor is :math:`(N, C_{in})` and the size of output tensor - is :math:`(N, C_{out})`, where :math:`C_{out} = out\_dim` - - :param input: the input Tensor. - :type input: Tensor - :param out\_dim: dimensionality of the output space. - :type out\_dim: int +# ----------------------------------------------------------------------- +# FFModel +# ----------------------------------------------------------------------- - :param activation: Activation function to use. Default is ActiMode.AC_MODE_NONE. - :type activation: ActiMode - :param use_bias: whether the layer uses a bias vector. Default is True. - :type use_bias: bool +class FFModel(object): + """ """ - :param shared_op: the layer whose parameters are shared with. Default is None. - :type shared_op: Op + __slots__ = [ + "handle", + "_handle", + "_layers", + "_nb_layers", + "_ffconfig", + "_tracing_id", + "initializers", + "attr_tensors", + ] - :param kernel_initializer: Initializer for the kernel weights matrix. If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer + def __init__(self, ffconfig): + """Constructor of FFModel. - :param bias_initializer: Initializer for the bias vector. If it is set to None, the ZeroInitializer is applied. - :type bias_initializer: Initializer + :param ffconfig: configurations of FlexFlow and the created model. + :type ffconfig: FFConfig - :param kernel_regularizer: Regularizer for the kernel weights matrix - :type bias_initializer: Regularizer + :returns: FFModel -- the model. 
+ """ + self.handle = ffc().flexflow_model_create(ffconfig.handle, ffconfig.cpu_offload) + self._handle = ffi.gc(self.handle, ffc().flexflow_model_destroy) + self._layers = dict() + self._nb_layers = 0 + self._ffconfig = ffconfig + global ff_tracing_id + self._tracing_id = ff_tracing_id + ff_tracing_id += 1 + self.initializers = {} + self.attr_tensors = {} - :param name: the name of the layer. Default is None. - :type name: string + def get_layers(self): + return self._layers - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - shared_op_handle = self.__get_op_handle(shared_op) - c_activation = enum_to_int(ActiMode, activation) - c_datatype = enum_to_int(DataType, datatype) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - bias_init_handle = self.__get_initializer_handle(bias_initializer) - if kernel_regularizer: - c_kernel_reg_type = enum_to_int(RegularizerMode, kernel_regularizer.type) - kernel_reg_lambda = kernel_regularizer._lambda - else: - c_kernel_reg_type = enum_to_int( - RegularizerMode, RegularizerMode.REG_MODE_NONE - ) - kernel_reg_lambda = 0.0 - handle = ffc().flexflow_model_add_dense( - self.handle, - input.handle, - out_dim, - c_activation, - use_bias, - c_datatype, - shared_op_handle, - kernel_init_handle, - bias_init_handle, - c_kernel_reg_type, - kernel_reg_lambda, - c_name, + def add_layer(self, op_type, name): + layer_id = self._nb_layers + op_handle = ffc().flexflow_model_get_last_layer(self.handle) + self._layers[self._nb_layers] = convert_op_handle_to_op( + op_type, op_handle, idx=layer_id, name=name ) - self.add_layer(OpType.LINEAR, name) - return Tensor(handle, owner_op_type=OpType.LINEAR) - - def concat(self, tensors, axis, name=None): - """Layer that concatenates a list of inputs. + self._nb_layers += 1 - It takes as input a list of tensors, all of the same shape except for the concatenation axis, and returns a single tensor that is the concatenation of all inputs. + def create_tensor(self, dims, data_type, create_grad=True): + """Instantiate a FlexFlow tensor. - :param input: the list of input Tensors. - :type input: List of Tensors + :param x: a shape tuple/list (integers), including the batch size. + :type x: list of int - :param axis: the dimension along which to concatenate. - :type axis: int + :param data_type: the datatype of the created tensor. Options are + DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64, DT_BOOLEAN. + :type data_type: DataType - :param name: the name of the layer. Default is None. - :type name: string + :param create_grad: weather the tensor creates a gradients vector. + If you don't specify anything, a gradients vector is used. + :type create_grad: bool :returns: Tensor -- the output tensor. 
""" - assert type(tensors) is list, "tensors should be a list" - tensor_handle_list = [] - n = len(tensors) - assert n <= 256, "Please increase MAX_NUM_INPUTS" - for tensor in tensors: - tensor_handle_list.append(tensor.handle) - c_tensor_handle_list = ffi.new("flexflow_tensor_t[]", tensor_handle_list) - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_concat( - self.handle, n, c_tensor_handle_list, axis, c_name + c_dims = ffi.new("int[]", dims) + c_data_type = enum_to_int(DataType, data_type) + num_dims = len(dims) + handle = ffc().flexflow_tensor_create( + self.handle, num_dims, c_dims, c_data_type, create_grad ) - self.add_layer(OpType.CONCAT, name) - return Tensor(handle, owner_op_type=OpType.CONCAT) + return Tensor(handle) - def split(self, input, sizes, axis, name=None): - """Layer that splits a :attr:`input` tensor into a list of tensors. + def map_tensor(self, tensor, parallel_op=None): + op_handle = self.__get_op_handle(parallel_op) + ffc().flexflow_tensor_map(self.handle, tensor.handle, op_handle) - :param input: the input Tensor. - :type input: Tensor + def create_constant(self, dims, value, data_type): + c_dims = ffi.new("int[]", dims) + c_data_type = enum_to_int(DataType, data_type) + num_dims = len(dims) + handle = ffc().flexflow_constant_create( + self.handle, num_dims, c_dims, value, c_data_type + ) + return Tensor(handle) - :param sizes: either an int indicating the number of splits along axis or a Python list containing the sizes of each output tensor along axis. If a scalar, then it must evenly divide :attr:`input.dims[axis]`; otherwise the sum of sizes along the split axis must match that of the :attr:`input`. - :type sizes: int or list of int + def exp(self, x, name=None): + """Exponential activation function. - :param axis: the dimension along which to split. - :type axis: int + :param x: the input Tensor. + :type x: Tensor :param name: the name of the layer. Default is None. :type name: string - :returns: list of Tensors -- the output tensors. + :returns: Tensor -- the output tensor. """ - if type(sizes) is list: - split = sizes - else: - assert input.dims[axis] % sizes == 0, "Split dimension is not divisible" - split = [input.dims[axis] // sizes for i in range(sizes)] - n = len(split) - assert n <= 256, "Please increase MAX_NUM_OUTPUTS" - c_split = ffi.new("int[]", split) - c_outputs_handle_list = ffi.new("flexflow_tensor_t[256]") c_name = get_c_name(name) - ffc().flexflow_model_add_split( - self.handle, input.handle, n, c_outputs_handle_list, c_split, axis, c_name - ) - output_tensor_list = [] - for i in range(n): - tensor_p_handle = ffi.new("flexflow_tensor_t*") - tensor_p_handle.impl = c_outputs_handle_list[i].impl - output_tensor_list.append( - Tensor(None, owner_op_type=OpType.SPLIT, p_handle=tensor_p_handle) - ) - self.add_layer(OpType.SPLIT, name) - del c_outputs_handle_list - return output_tensor_list + handle = ffc().flexflow_model_add_exp(self.handle, x.handle, c_name) + self.add_layer(OpType.EXP, name) + return Tensor(handle, owner_op_type=OpType.EXP) - def flat(self, input, name=None): - """Flattens the input. Does not affect the batch size. + def sin(self, x, name=None): + """Elementwise sine function. - :param input: the input Tensor. - :type input: Tensor + :param x: the input Tensor. + :type x: Tensor :param name: the name of the layer. Default is None. :type name: string @@ -2253,15 +2184,15 @@ def flat(self, input, name=None): :returns: Tensor -- the output tensor. 
""" c_name = get_c_name(name) - handle = ffc().flexflow_model_add_flat(self.handle, input.handle, c_name) - self.add_layer(OpType.FLAT, name) - return Tensor(handle, owner_op_type=OpType.FLAT) + handle = ffc().flexflow_model_add_sin(self.handle, x.handle, c_name) + self.add_layer(OpType.SIN, name) + return Tensor(handle, owner_op_type=OpType.SIN) - def softmax(self, input, axis=-1, name=None): - """Softmax activation function. + def cos(self, x, name=None): + """Elementwise cosine function. - :param input: the input Tensor. - :type input: Tensor + :param x: the input Tensor. + :type x: Tensor :param name: the name of the layer. Default is None. :type name: string @@ -2269,23 +2200,18 @@ def softmax(self, input, axis=-1, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_softmax( - self.handle, input.handle, axis, c_name - ) - self.add_layer(OpType.SOFTMAX, name) - return Tensor(handle, owner_op_type=OpType.SOFTMAX) - - def reshape(self, input, shape, name=None): - """Layer that reshapes inputs into the given shape. + handle = ffc().flexflow_model_add_cos(self.handle, x.handle, c_name) + self.add_layer(OpType.COS, name) + return Tensor(handle, owner_op_type=OpType.COS) - Given a :attr:`input` tensor, this operation returns a output tensor that has the same values as tensor in the same order, - except with a new shape given by :attr:`shape`. + def add(self, x, y, inplace_a=False, name=None): + """Layer that adds two input Tensors, :attr:`output = x + y`. - :param input: the input Tensor. - :type input: Tensor + :param x: the first input Tensor. + :type x: Tensor - :param shape: A list defining the shape of the output tensor. - :type shape: list of int + :param y: the second input Tensor. + :type y: Tensor :param name: the name of the layer. Default is None. :type name: string @@ -2293,45 +2219,41 @@ def reshape(self, input, shape, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - c_shape = ffi.new("int[]", shape) - handle = ffc().flexflow_model_add_reshape( - self.handle, input.handle, len(shape), c_shape, c_name + handle = ffc().flexflow_model_add_add( + self.handle, x.handle, y.handle, inplace_a, c_name ) - self.add_layer(OpType.RESHAPE, name) - return Tensor(handle, owner_op_type=OpType.RESHAPE) - - def gather(self, input, index, dim, name=None): - """Layer that gathers values along the dim axis. + self.add_layer(OpType.ADD, name) + return Tensor(handle, owner_op_type=OpType.ADD) - :param input: the input tensor - :type input: Tensor + def subtract(self, x, y, inplace_a=False, name=None): + """Layer that subtracts two input Tensors, :attr:`output = x * y`. - :param index: the index tensor, which specifies the indices of elements to gather - :type index: Tensor + :param x: the first input Tensor. + :type x: Tensor - :param dim: the axis along which to index - :type dim: int + :param y: the second input Tensor. + :type y: Tensor - :param name: the name of the layer. Default is None + :param name: the name of the layer. Default is None. :type name: string - :returns: Tensor -- the output tensor + :returns: Tensor -- the output tensor. 
""" c_name = get_c_name(name) - handle = ffc().flexflow_model_add_gather( - self.handle, input.handle, index.handle, dim, c_name + handle = ffc().flexflow_model_add_subtract( + self.handle, x.handle, y.handle, inplace_a, c_name ) - self.add_layer(OpType.GATHER, name) - return Tensor(handle, owner_op_type=OpType.GATHER) + self.add_layer(OpType.SUBTRACT, name) + return Tensor(handle, owner_op_type=OpType.SUBTRACT) - def transpose(self, input, perm, name=None): - """Transposes the :attr:`input` tensor. Permutes the dimensions according to perm + def multiply(self, x, y, inplace_a=False, name=None): + """Layer that multiplies (element-wise) two input Tensors, :attr:`output = x * y`. - :param input: the input Tensor. - :type input: Tensor + :param x: the first input Tensor. + :type x: Tensor - :param perm: A permutation of the dimensions of a. - :type perm: List of int + :param y: the second input Tensor. + :type y: Tensor :param name: the name of the layer. Default is None. :type name: string @@ -2339,23 +2261,20 @@ def transpose(self, input, perm, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - c_perm = ffi.new("int[]", perm) - handle = ffc().flexflow_model_add_transpose( - self.handle, input.handle, len(perm), c_perm, c_name + handle = ffc().flexflow_model_add_multiply( + self.handle, x.handle, y.handle, inplace_a, c_name ) - self.add_layer(OpType.TRANSPOSE, name) - return Tensor(handle, owner_op_type=OpType.TRANSPOSE) - - def reverse(self, input, axis, name=None): - """Layer that reverses specific dimensions of a tensor. + self.add_layer(OpType.MULTIPLY, name) + return Tensor(handle, owner_op_type=OpType.MULTIPLY) - Given a :attr:`input` tensor, this operation reverses the dimension :attr:`axis`. + def divide(self, x, y, inplace_a=False, name=None): + """Layer that divides (element-wise) two input Tensors, :attr:`output = x / y`. - :param input: the input Tensor. - :type input: Tensor + :param x: the first input Tensor. + :type x: Tensor - :param axis: the dimension to reverse. - :type axis: int + :param y: the second input Tensor. + :type y: Tensor :param name: the name of the layer. Default is None. :type name: string @@ -2363,20 +2282,20 @@ def reverse(self, input, axis, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_reverse( - self.handle, input.handle, axis, c_name + handle = ffc().flexflow_model_add_divide( + self.handle, x.handle, y.handle, inplace_a, c_name ) - self.add_layer(OpType.REVERSE, name) - return Tensor(handle, owner_op_type=OpType.REVERSE) + self.add_layer(OpType.DIVIDE, name) + return Tensor(handle, owner_op_type=OpType.DIVIDE) - def scalar_multiply(self, input, scalar, inplace=True, name=None): - """Scalar multiplication of a tensor by an scalar. + def max(self, x, y, inplace_a=False, name=None): + """Layer that computes the max (element-wise) two input Tensors, :attr:`output = max(x,y)`. - :param input: the input Tensor. - :type input: Tensor + :param x: the first input Tensor. + :type x: Tensor - :param input: the scalar - :type scalar: float + :param y: the second input Tensor. + :type y: Tensor :param name: the name of the layer. Default is None. :type name: string @@ -2384,20 +2303,20 @@ def scalar_multiply(self, input, scalar, inplace=True, name=None): :returns: Tensor -- the output tensor. 
""" c_name = get_c_name(name) - handle = ffc().flexflow_model_add_scalar_multiply( - self.handle, input.handle, scalar, inplace, c_name + handle = ffc().flexflow_model_add_max( + self.handle, x.handle, y.handle, inplace_a, c_name ) - self.add_layer(OpType.SCALAR_MULTIPLY, name) - return Tensor(handle, owner_op_type=OpType.SCALAR_MULTIPLY) + self.add_layer(OpType.MAX, name) + return Tensor(handle, owner_op_type=OpType.MAX) - def scalar_add(self, input, scalar, inplace=True, name=None): - """Scalar addition of a scalar to each entry of a tensor. + def min(self, x, y, inplace_a=False, name=None): + """Layer that computes the min (element-wise) two input Tensors, :attr:`output = min(x,y)`. - :param input: the input Tensor. - :type input: Tensor + :param x: the first input Tensor. + :type x: Tensor - :param input: the scalar - :type scalar: float + :param y: the second input Tensor. + :type y: Tensor :param name: the name of the layer. Default is None. :type name: string @@ -2405,20 +2324,20 @@ def scalar_add(self, input, scalar, inplace=True, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_scalar_add( - self.handle, input.handle, scalar, inplace, c_name + handle = ffc().flexflow_model_add_min( + self.handle, x.handle, y.handle, inplace_a, c_name ) - self.add_layer(OpType.SCALAR_ADD, name) - return Tensor(handle, owner_op_type=OpType.SCALAR_ADD) + self.add_layer(OpType.MIN, name) + return Tensor(handle, owner_op_type=OpType.MIN) - def scalar_sub(self, input, scalar, inplace=True, name=None): - """Scalar subtraction of a scalar to each entry of a tensor. + def reduce_sum(self, input, axes, keepdims=False, name=None): + """Layer that computes the sum of the input Tensor along given axes. :param input: the input Tensor. :type input: Tensor - :param input: the scalar - :type scalar: float + :param axes: the axes along which reduction is applied + :type axes: List[int] :param name: the name of the layer. Default is None. :type name: string @@ -2426,215 +2345,234 @@ def scalar_sub(self, input, scalar, inplace=True, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_scalar_sub( - self.handle, input.handle, scalar, inplace, c_name + c_axes = ffi.new("int[]", axes) + handle = ffc().flexflow_model_add_reduce_sum( + self.handle, input.handle, c_axes, len(axes), keepdims, c_name ) - self.add_layer(OpType.SCALAR_SUB, name) - return Tensor(handle, owner_op_type=OpType.SCALAR_SUB) + self.add_layer(OpType.REDUCE_SUM, name) + return Tensor(handle, owner_op_type=OpType.REDUCE_SUM) - def scalar_true_divide(self, input, scalar, inplace=True, name=None): - """Scalar regular division of a tensor by an scalar. + def rsqrt(self, input, name=None): + """Layer that computes the element-wise reciprocal square-root. :param input: the input Tensor. :type input: Tensor - :param input: the scalar - :type scalar: float - :param name: the name of the layer. Default is None. :type name: string :returns: Tensor -- the output tensor. 
""" c_name = get_c_name(name) - handle = ffc().flexflow_model_add_scalar_truediv( - self.handle, input.handle, scalar, inplace, c_name - ) - self.add_layer(OpType.SCALAR_TRUEDIV, name) - return Tensor(handle, owner_op_type=OpType.SCALAR_TRUEDIV) + handle = ffc().flexflow_model_add_rsqrt(self.handle, input.handle, c_name) + self.add_layer(OpType.RSQRT, name) + return Tensor(handle, owner_op_type=OpType.RSQRT) - def gelu(self, input, inplace=True, name=None): - """Gaussian Error Linear Unit activation function. + def pow(self, input, exponent, name=None): + """Layer that computes the element-wise power. :param input: the input Tensor. :type input: Tensor + :param exponent: exponent to raise each element in the input tensor. + :type exponent: float + :param name: the name of the layer. Default is None. :type name: string :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_gelu(self.handle, input.handle, c_name) - self.add_layer(OpType.GELU, name) - return Tensor(handle, owner_op_type=OpType.GELU) + handle = ffc().flexflow_model_add_pow( + self.handle, input.handle, exponent, c_name + ) + self.add_layer(OpType.POW, name) + return Tensor(handle, owner_op_type=OpType.POW) - def relu(self, input, inplace=True, name=None): - """Rectified Linear Unit activation function. + def mean(self, input, dims, keepdims=False, name=None): + """Layer that computes the mean of the input tensor across the given + dimensions. :param input: the input Tensor. :type input: Tensor + :param dims: dimensions to take the mean over. + :type dims: list + + :param keepdims: keeps the dimensions in :attr:`dims` as size 1 if True and + collapses the dimension if False. Default is False. + :type keepdims: bool + :param name: the name of the layer. Default is None. :type name: string :returns: Tensor -- the output tensor. """ + dims = list(dims) + c_dims = ffi.new("int[]", dims) c_name = get_c_name(name) - handle = ffc().flexflow_model_add_relu( - self.handle, input.handle, inplace, c_name + handle = ffc().flexflow_model_add_mean( + self.handle, input.handle, c_dims, len(dims), keepdims, c_name ) - self.add_layer(OpType.RELU, name) - return Tensor(handle, owner_op_type=OpType.RELU) + self.add_layer(OpType.MEAN, name) + return Tensor(handle, owner_op_type=OpType.MEAN) - def identity(self, input, name=None): - """Identity function. + def conv2d( + self, + input, + out_channels, + kernel_h, + kernel_w, + stride_h, + stride_w, + padding_h, + padding_w, + activation=ActiMode.AC_MODE_NONE, + groups=1, + use_bias=True, + shared_op=None, + kernel_initializer=None, + bias_initializer=None, + name=None, + ): + """This layer creates a 2D convolution kernel that is convolved with the layer :attr:`input` + to produce a tensor of :attr:`output`. - :param input: the input Tensor. - :type input: Tensor + The size of input tensor is :math:`(N, C_{in}, H, W)` and the size of output tensor + is :math:`(N, C_{out}, H_{out}, W_{out})`, which can be calculated by: - :param name: the name of the layer. Default is None. - :type name: string + .. math:: + C_{out} = out\_channels - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_identity(self.handle, input.handle, c_name) - self.add_layer(OpType.IDENTITY, name) - return Tensor(handle, owner_op_type=OpType.IDENTITY) + .. math:: + K_{H} = kernel\_h - def sigmoid(self, input, name=None): - """Sigmoid activation function, :math:`sigmoid(x) = 1 / (1 + exp(-x))`. + .. 
math:: + K_{W} = kernel\_w - :param input: the input Tensor. - :type input: Tensor + .. math:: + S_{H} = stride\_h - :param name: the name of the layer. Default is None. - :type name: string + .. math:: + S_{W} = stride\_w - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_sigmoid(self.handle, input.handle, c_name) - self.add_layer(OpType.SIGMOID, name) - return Tensor(handle, owner_op_type=OpType.SIGMOID) + .. math:: + P_{H} = padding\_h - def tanh(self, input, name=None): - """Hyperbolic tangent activation function. + .. math:: + P_{S} = padding\_s + + .. math:: + H_{out} = (H - K_{H} + 2 * P_{H}) / S_{H} + 1 + + .. math:: + W_{out} = (W - K_{W} + 2 * P_{W}) / S_{W} + 1 :param input: the input Tensor. :type input: Tensor - :param name: the name of the layer. Default is None. - :type name: string + :param out\_channels: the dimensionality of the output space (i.e. the number of output filters in the convolution). + :type out\_channels: int - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_tanh(self.handle, input.handle, c_name) - self.add_layer(OpType.TANH, name) - return Tensor(handle, owner_op_type=OpType.TANH) + :param kernel_h: the height of the 2D convolution window: :math:`K_{H}`. + :type kernel_h: int - def elu(self, input, inplace=True, name=None): - """Exponential Linear Unit. activation function. + :param kernel_w: the width of the 2D convolution window: :math:`K_{W}`. + :type kernel_w: int - :param input: the input Tensor. - :type input: Tensor + :param stride_h: the stride of the convolution along the height: :math:`S_{H}`. + :type stride_h: int - :param name: the name of the layer. Default is None. - :type name: string + :param stride_w: the stride of the convolution along the width: :math:`S_{W}`. + :type stride_w: int - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_elu( - self.handle, input.handle, inplace, c_name - ) - self.add_layer(OpType.ELU, name) - return Tensor(handle, owner_op_type=OpType.ELU) + :param padding_h: the amount of implicit zero-paddings along the height: :math:`P_{H}`. + :type padding_h: int - def dropout(self, input, rate, seed, name=None): - """The Dropout layer randomly sets input units to 0 with - a frequency of :attr:`rate` at each step during training time, - which helps prevent overfitting. - Inputs not set to 0 are scaled up by 1/(1 - rate) such that the - sum over all inputs is unchanged. + :param padding_w: the amount of implicit zero-paddings along the width: :math:`P_{W}`. + :type padding_w: int - :param input: the input Tensor. - :type input: Tensor + :param activation: Activation function to use. Default is ActiMode.AC_MODE_NONE. + :type activation: ActiMode - :param rate: Fraction of the input units to drop. - :type rate: float(0-1) + :param groups: the number of groups in this convolution + :type groups: int - :param seed: random seed. - :type seed: int + :param use_bias: whether the layer uses a bias vector. Default is True. + :type use_bias: bool + + :param shared_op: the layer whose parameters are shared with. Default is None. + :type shared_op: Op + + :param kernel_initializer: Initializer for the kernel weights matrix. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param bias_initializer: Initializer for the bias vector. If it is set to None, the ZeroInitializer is applied. 
+ :type bias_initializer: Initializer :param name: the name of the layer. Default is None. :type name: string :returns: Tensor -- the output tensor. """ + shared_op_handle = self.__get_op_handle(shared_op) + c_activation = enum_to_int(ActiMode, activation) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + bias_init_handle = self.__get_initializer_handle(bias_initializer) c_name = get_c_name(name) - handle = ffc().flexflow_model_add_dropout( - self.handle, input.handle, rate, seed, c_name + handle = ffc().flexflow_model_add_conv2d( + self.handle, + input.handle, + out_channels, + kernel_h, + kernel_w, + stride_h, + stride_w, + padding_h, + padding_w, + c_activation, + groups, + use_bias, + shared_op_handle, + kernel_init_handle, + bias_init_handle, + c_name, ) - self.add_layer(OpType.DROPOUT, name) - return Tensor(handle, owner_op_type=OpType.DROPOUT) + self.add_layer(OpType.CONV2D, name) + return Tensor(handle, owner_op_type=OpType.CONV2D) - def multihead_attention( + def embedding( self, - query, - key, - value, - embed_dim, - num_heads, - kdim=0, - vdim=0, - dropout=0.0, - bias=True, - add_bias_kv=False, - add_zero_attn=False, + input, + num_embeddings, + embedding_dim, + aggr, + dtype=DataType.DT_FLOAT, + shared_op=None, kernel_initializer=None, name=None, ): - """Defines the MultiHead Attention operation as described in Attention Is All You Need - which takes in the tensors :attr:`query`, :attr:`key`, and :attr:`value`, - and returns the dot-product attention between them:. - - :param query: the query Tensor. - :type query: Tensor - - :param key: the key Tensor. - :type key: Tensor - - :param value: the value Tensor. - :type value: Tensor - - :param embed_dim: total dimension of the model - :type embed_dim: int - - :param num_heads: Number of attention heads. - :type num_heads: int + """Layer that turns positive integers into dense vectors of fixed size - :param kdim: total number of features in key. Default is 0 - :type kdim: int + :param input: the input Tensor. + :type input: Tensor - :param vdim: total number of features in value. Default is 0 - :type vdim: int + :param num_embeddings: size of the vocabulary, i.e. maximum integer index + 1 + :type num_embeddings: int - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) + :param embedding_dim: dimension of the dense embedding. + :type embedding_dim: int - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool + :param aggr: aggregation mode. Options are AGGR_MODE_NONE, AGGR_MODE_SUM and AGGR_MODE_AVG. + :type aggr: AggrMode - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool + :param dtype: the tensor data type. Options are DT_BOOLEAN, DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_INT4, DT_INT8, DT_NONE + :type dtype: DataType - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. - :type add_zero_attn: bool + :param shared_op: the layer whose parameters are shared with. Default is None. + :type shared_op: Op - :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :param kernel_initializer: Initializer for the kernel weights matrix. If it is set to None, the GlorotUniformInitializer is applied. :type kernel_initializer: Initializer :param name: the name of the layer. Default is None. 
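# Worked instance of the shape formula above: with H = W = 32, a 3x3 kernel,
# stride 1, and padding 1, H_out = (32 - 3 + 2 * 1) / 1 + 1 = 32, so the spatial
# size is preserved. In the sketch below, `ffmodel` and the 4-D `input_tensor`
# are assumed to exist already; 64 output channels is an arbitrary choice.
conv1 = ffmodel.conv2d(input_tensor, 64, 3, 3, 1, 1, 1, 1)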
@@ -2643,97 +2581,105 @@ def multihead_attention( :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - handle = ffc().flexflow_model_add_multihead_attention( + shared_op_handle = self.__get_op_handle(shared_op) + c_aggr = enum_to_int(AggrMode, aggr) + c_dtype = enum_to_int(DataType, dtype) + if kernel_initializer is None: + kernel_initializer = GlorotUniformInitializer(42) + assert ( + (type(kernel_initializer) is GlorotUniformInitializer) + or (type(kernel_initializer) is ZeroInitializer) + or (type(kernel_initializer) is UniformInitializer) + or (type(kernel_initializer) is NormInitializer) + ), f"Unknown initializer type: {kernel_initializer}" + handle = ffc().flexflow_model_add_embedding( self.handle, - query.handle, - key.handle, - value.handle, - embed_dim, - num_heads, - kdim, - vdim, - dropout, - bias, - add_bias_kv, - add_zero_attn, - kernel_init_handle, + input.handle, + num_embeddings, + embedding_dim, + c_aggr, + c_dtype, + shared_op_handle, + kernel_initializer.handle, c_name, ) - self.add_layer(OpType.MULTIHEAD_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.MULTIHEAD_ATTENTION) + # NOTE: We must keep a reference to the initializer or else it will be + # immediately destructed + self.initializers[name] = kernel_initializer + self.add_layer(OpType.EMBEDDING, name) + return Tensor(handle, owner_op_type=OpType.EMBEDDING) - def inc_multihead_self_attention( + def pool2d( self, input, - embed_dim, - num_heads, - kdim=0, - vdim=0, - dropout=0.0, - bias=True, - add_bias_kv=False, - add_zero_attn=False, - data_type=DataType.DT_NONE, - kernel_initializer=None, - apply_rotary_embedding=False, - scaling_query=False, - scaling_factor=1.0, - qk_prod_scaling=True, - position_bias=False, + kernel_h, + kernel_w, + stride_h, + stride_w, + padding_h, + padding_w, + pool_type=PoolType.POOL_MAX, + activation=ActiMode.AC_MODE_NONE, name=None, ): - """Defines the MultiHead Attention operation as described in Attention Is All You Need - which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. - In inference mode, the attention is computed using incremental decoding. + """Pooling operation for 2D spatial data. - :param input: the input Tensor. - :type input: Tensor + The size of input tensor is :math:`(N, C_{in}, H, W)` and the size of output tensor + is :math:`(N, C_{out}, H_{out}, W_{out})`, which can be calculated by: - :param embed_dim: total dimension of the model - :type embed_dim: int + .. math:: + C_{out} = out\_channels - :param num_heads: Number of attention heads. - :type num_heads: int + .. math:: + K_{H} = kernel\_h - :param kdim: total number of features in key. Default is 0 - :type kdim: int + .. math:: + K_{W} = kernel\_w - :param vdim: total number of features in value. Default is 0 - :type vdim: int + .. math:: + S_{H} = stride\_h - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) + .. math:: + S_{W} = stride\_w - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool + .. math:: + P_{H} = padding\_h - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool + .. math:: + P_{S} = padding\_s - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. - :type add_zero_attn: bool + .. 
math:: + H_{out} = (H - K_{H} + 2 * P_{H}) / S_{H} + 1 - :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. - :type data_type: DataType + .. math:: + W_{out} = (W - K_{W} + 2 * P_{W}) / S_{W} + 1 - :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer + :param input: the input Tensor. + :type input: Tensor - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + :param kernel_h: the height of the 2D pooling window: :math:`K_{H}`. + :type kernel_h: int - :param scaling_query: Whether to apply scaling query. Default is False. - :type scaling_query: bool + :param kernel_w: the width of the 2D pooling window: :math:`K_{W}`. + :type kernel_w: int - :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. - :type scaling_factor: float + :param stride_h: the stride of the pooling along the height: :math:`S_{H}`. + :type stride_h: int - :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. - :type qk_prod_scaling: bool + :param stride_w: the stride of the pooling along the width: :math:`S_{W}`. + :type stride_w: int - :param position_bias: Whether to add position bias to the QK product. Default is False. - :type position_bias: bool + :param padding_h: the amount of implicit zero-paddings along the height: :math:`P_{H}`. + :type padding_h: int + + :param padding_w: the amount of implicit zero-paddings along the width: :math:`P_{W}`. + :type padding_w: int + + :param activation: Tyoe of pooling function to use. If you don't specify anything, PoolType.POOL_MAX is applied. + :type activation: PoolType + + :param activation: Activation function to use. Default is ActiMode.AC_MODE_NONE. + :type activation: ActiMode :param name: the name of the layer. Default is None. :type name: string @@ -2741,102 +2687,34 @@ def inc_multihead_self_attention( :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_inc_multihead_self_attention( + c_pool_type = enum_to_int(PoolType, pool_type) + c_activation = enum_to_int(ActiMode, activation) + handle = ffc().flexflow_model_add_pool2d( self.handle, input.handle, - embed_dim, - num_heads, - kdim, - vdim, - dropout, - bias, - add_bias_kv, - add_zero_attn, - c_data_type, - kernel_init_handle, - apply_rotary_embedding, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, + kernel_h, + kernel_w, + stride_h, + stride_w, + padding_h, + padding_w, + c_pool_type, + c_activation, c_name, ) - self.add_layer(OpType.INC_MULTIHEAD_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.INC_MULTIHEAD_ATTENTION) - - def spec_inc_multihead_self_attention( - self, - input, - embed_dim, - num_heads, - kdim=0, - vdim=0, - dropout=0.0, - bias=True, - add_bias_kv=False, - add_zero_attn=False, - data_type=DataType.DT_NONE, - kernel_initializer=None, - apply_rotary_embedding=False, - scaling_query=False, - scaling_factor=1.0, - qk_prod_scaling=True, - position_bias=False, - name=None, - ): - """Defines the MultiHead Attention operation as described in Attention Is All You Need - which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. 
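# A small sketch of the pooling call documented above, assuming `ffmodel` and a
# 4-D Tensor `conv1` already exist. With a 2x2 window, stride 2, and no padding,
# each spatial dimension is halved: (32 - 2 + 0) / 2 + 1 = 16.
pool1 = ffmodel.pool2d(conv1, 2, 2, 2, 2, 0, 0)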
- This operator only supports computing the attention in inference (beam search) mode. - - :param input: the input Tensor. - :type input: Tensor - - :param embed_dim: total dimension of the model - :type embed_dim: int - - :param num_heads: Number of attention heads. - :type num_heads: int - - :param kdim: total number of features in key. Default is 0 - :type kdim: int - - :param vdim: total number of features in value. Default is 0 - :type vdim: int - - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) - - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool - - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool - - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. - :type add_zero_attn: bool - - :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. - :type data_type: DataType - - :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer - - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + self.add_layer(OpType.POOL2D, name) + return Tensor(handle, owner_op_type=OpType.POOL2D) - :param scaling_query: Whether to apply scaling query. Default is False. - :type scaling_query: bool + def batch_norm(self, input, relu=True, name=None): + """Layer that normalizes its inputs. - :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. - :type scaling_factor: float + Batch normalization applies a transformation that maintains the mean output close to 0 and the output standard deviation close to 1. - :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. - :type qk_prod_scaling: bool + :param input: the list of input Tensors. + :type input: Tensor - :param position_bias: Whether to add position bias to the QK product. Default is False. - :type position_bias: bool + :param relu: whether a ReLU function is applied. Default is True. + :type relu: bool :param name: the name of the layer. Default is None. :type name: string @@ -2844,209 +2722,255 @@ def spec_inc_multihead_self_attention( :returns: Tensor -- the output tensor. 
""" c_name = get_c_name(name) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_spec_inc_multihead_self_attention( + handle = ffc().flexflow_model_add_batch_norm( + self.handle, input.handle, relu, c_name + ) + self.add_layer(OpType.BATCH_NORM, name) + return Tensor(handle, owner_op_type=OpType.BATCH_NORM) + + def layer_norm( + self, input, axes, elementwise_affine=True, eps=1e-5, use_bias=True, name=None + ): + """Add a LayerNorm layer + + :param input: The input tensor + :type input: Tensor + :param axes: Indicate which axes (starting from the end) the LayerNorm should normalize over + :type axes: Union[int, List[int]] + :param elementwise_affine: Whether the LayerNorm should use the gamma weight for scaling, defaults to True + :type elementwise_affine: bool, optional + :param eps: A small float value added to the LayerNorm denominator for numerical stability, defaults to 1e-5 + :type eps: float, optional + :param use_bias: Whether to add a beta bias to the LayerNorm result, defaults to True + :type use_bias: bool, optional + :param name: Name of the operator, also used for loading weights in inference mode, defaults to None + :type name: _type_, optional + :return: The LayerNorm output tensor + :rtype: Tensor + """ + c_name = get_c_name(name) + c_axes = ffi.new("int[]", axes) + handle = ffc().flexflow_model_add_layer_norm( self.handle, input.handle, - embed_dim, - num_heads, - kdim, - vdim, - dropout, - bias, - add_bias_kv, - add_zero_attn, - c_data_type, - kernel_init_handle, - apply_rotary_embedding, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, + len(axes), + c_axes, + elementwise_affine, + eps, + use_bias, c_name, ) - self.add_layer(OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION) + self.add_layer(OpType.LAYER_NORM, name) + return Tensor(handle, owner_op_type=OpType.LAYER_NORM) - def inc_multihead_self_attention_verify( + def residual_layer_norm( self, input, - embed_dim, - num_heads, - kdim=0, - vdim=0, - dropout=0.0, - bias=True, - add_bias_kv=False, - add_zero_attn=False, - data_type=DataType.DT_NONE, - kernel_initializer=None, - apply_rotary_embedding=False, - scaling_query=False, - scaling_factor=1.0, - qk_prod_scaling=True, - position_bias=False, + residual1, + residual2, + use_two_residuals, + axes, + elementwise_affine=True, + eps=1e-5, + use_bias=True, + inplace_residual=False, name=None, ): - """Defines the MultiHead Attention operation as described in Attention Is All You Need - which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. - This operator only supports computing the attention in inference (tree verify) mode. + """Add a fused LayerNorm + Residual layer. This operator uses a single kernel, resulting in + better efficiency compared to using separate element-wise add and LayerNorm operators. - :param input: the input Tensor. 
+ :param input: The input tensor :type input: Tensor + :param residual1: The residual tensor to add to the input before computing the LayerNorm + :type residual1: Tensor + :param residual2: An optional second residual tensor to add to the input (in addition to residual1) before computing the LayerNorm + :type residual2: Tensor + :param use_two_residuals: A boolean that should be set to True if using the second optional residual, False otherwise + :type use_two_residuals: bool + :param axes: Indicate which axes (starting from the end) the LayerNorm should normalize over + :type axes: List[int] + :param elementwise_affine: Whether the LayerNorm should use the gamma weight for scaling, defaults to True + :type elementwise_affine: bool, optional + :param eps: A small float value added to the LayerNorm denominator for numerical stability, defaults to 1e-5 + :type eps: float, optional + :param use_bias: Whether to add a beta bias to the LayerNorm result, defaults to True + :type use_bias: bool, optional + :param inplace_residual: Whether to perform the residual computation inplace in the input tensor, defaults to False + :type inplace_residual: bool, optional + :param name: Name of the operator, also used for loading weights in inference mode, defaults to None + :type name: str, optional + :return: A tensor with the sum of the input and residual(s), and the LayerNorm output + :rtype: (Tensor, Tensor) + """ + c_name = get_c_name(name) + c_axes = ffi.new("int[]", axes) + residual2_handle = ( + residual1.handle + ) # This is intentional. Data will be ignored, and we cannot pass None + if use_two_residuals: + assert residual2 is not None + residual2_handle = residual2.handle + handles_array = ffc().flexflow_model_add_residual_layer_norm( + self.handle, + input.handle, + residual1.handle, + residual2_handle, + use_two_residuals, + len(axes), + c_axes, + elementwise_affine, + eps, + use_bias, + inplace_residual, + c_name, + ) + self.add_layer(OpType.RESIDUAL_LAYERNORM, name) + return ( + Tensor(handles_array[0], owner_op_type=OpType.RESIDUAL_LAYERNORM), + Tensor(handles_array[1], owner_op_type=OpType.RESIDUAL_LAYERNORM), + ) - :param embed_dim: total dimension of the model - :type embed_dim: int - - :param num_heads: Number of attention heads. - :type num_heads: int - - :param kdim: total number of features in key. Default is 0 - :type kdim: int - - :param vdim: total number of features in value. Default is 0 - :type vdim: int - - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) - - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool - - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool - - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. - :type add_zero_attn: bool - - :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. - :type data_type: DataType + def add_bias_residual_layer_norm( + self, + input, + residual, + axes, + elementwise_affine=True, + eps=1e-5, + use_bias=True, + inplace_residual=False, + name=None, + ): + """Add a Attention Bias + Residual + LayerNorm layer. This operator uses a single kernel, + resulting in better efficiency compared to using separate attention bias addition + + element-wise residual addition + LayerNorm operators. - :param kernel_initializer: Initializer for dense layer kernels. 
If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer + :param input: The input tensor + :type input: Tensor + :param residual: The residual tensor + :type residual: Tensor + :param axes: Indicate which axes (starting from the end) the LayerNorm should normalize over + :type axes: Union[int, List[int]] + :param elementwise_affine: Whether the LayerNorm should use the gamma weight for scaling, defaults to True + :type elementwise_affine: bool, optional + :param eps: A small float value added to the LayerNorm denominator for numerical stability, defaults to 1e-5 + :type eps: float, optional + :param use_bias: Whether to add a beta bias to the LayerNorm result, defaults to True + :type use_bias: bool, optional + :param inplace_residual: Whether to perform the residual computation inplace in the input tensor, defaults to False + :type inplace_residual: bool, optional + :param name: Name of the operator, also used for loading weights in inference mode, defaults to None + :type name: _type_, optional + :return: A tensor with the sum of the attention bias, input and residual(s), and the LayerNorm output + :rtype: (Tensor, Tensor) + """ + c_name = get_c_name(name) + c_axes = ffi.new("int[]", axes) + handles_array = ffc().flexflow_model_add_add_bias_residual_layer_norm( + self.handle, + input.handle, + residual.handle, + len(axes), + c_axes, + elementwise_affine, + eps, + use_bias, + inplace_residual, + c_name, + ) + self.add_layer(OpType.ADD_BIAS_RESIDUAL_LAYERNORM, name) + return ( + Tensor(handles_array[0], owner_op_type=OpType.ADD_BIAS_RESIDUAL_LAYERNORM), + Tensor(handles_array[1], owner_op_type=OpType.ADD_BIAS_RESIDUAL_LAYERNORM), + ) - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + def sigmoid_silu_multi(self, input1, input2, name=None): + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_sigmoid_silu_multi( + self.handle, input1.handle, input2.handle, c_name + ) + self.add_layer(OpType.SIGMOID_SILU_MULTI, name) + return Tensor(handle, owner_op_type=OpType.SIGMOID_SILU_MULTI) - :param scaling_query: Whether to apply scaling query. Default is False. - :type scaling_query: bool + def batch_matmul( + self, A, B, a_seq_length_dim=None, b_seq_length_dim=None, name=None + ): + """Layer that applied batched matrix multiplication onto two input Tensors, :attr:`output = x * y`. - :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. - :type scaling_factor: float + :param A: the first input Tensor. + :type A: Tensor - :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. - :type qk_prod_scaling: bool + :param B: the second input Tensor. + :type B: Tensor - :param position_bias: Whether to add position bias to the QK product. Default is False. - :type position_bias: bool + :param a_seq_length_dim: an int when set indicating the a_seq_length_dim dimention of A is a sequence_length dimension + :type a_seq_length_dim: int + + :param b_seq_length_dim: an int when set indicating the b_seq_length_dim dimention of B is a sequence_length dimension + :type b_seq_length_dim: int :param name: the name of the layer. Default is None. :type name: string + :param name: Whether to add use bias in layer normalization + :type name: bool + :returns: Tensor -- the output tensor. 
""" - c_name = get_c_name(name) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_inc_multihead_self_attention_verify( - self.handle, - input.handle, - embed_dim, - num_heads, - kdim, - vdim, - dropout, - bias, - add_bias_kv, - add_zero_attn, - c_data_type, - kernel_init_handle, - apply_rotary_embedding, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, - c_name, + if a_seq_length_dim is None: + a_seq_length_dim = -1 + if b_seq_length_dim is None: + b_seq_length_dim = -1 + handle = ffc().flexflow_model_add_batch_matmul( + self.handle, A.handle, B.handle, a_seq_length_dim, b_seq_length_dim ) - self.add_layer(OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION) + self.add_layer(OpType.BATCH_MATMUL, name) + return Tensor(handle, owner_op_type=OpType.BATCH_MATMUL) - def inc_multiquery_self_attention( + def dense( self, input, - embed_dim, - num_q_heads, - num_kv_heads, - kdim=0, - vdim=0, - dropout=0.0, - bias=True, - add_bias_kv=False, - add_zero_attn=False, - data_type=DataType.DT_NONE, + out_dim, + activation=ActiMode.AC_MODE_NONE, + use_bias=True, + datatype=DataType.DT_NONE, + shared_op=None, kernel_initializer=None, - apply_rotary_embedding=False, - scaling_query=False, - scaling_factor=1.0, - qk_prod_scaling=True, - position_bias=False, + bias_initializer=None, + kernel_regularizer=None, name=None, ): - """Defines the multi-query head attention, which allows a different number of Q and KV heads, - and takes in the tensors :attr:`input`, and uses it for all three of query, key and values. - In inference mode, the attention is computed using incremental decoding. + """Dense implements the operation: :attr:`output = activation(dot(input, kernel) + bias)` where + :attr:`activation` is the element-wise activation function passed as the activation argument, + :attr:`kernel` is a weights matrix created by the layer, and + :attr:`bias` is a bias vector created by the layer (only applicable if :attr:`use_bias` is True). + + The size of input tensor is :math:`(N, C_{in})` and the size of output tensor + is :math:`(N, C_{out})`, where :math:`C_{out} = out\_dim` :param input: the input Tensor. :type input: Tensor - :param embed_dim: total dimension of the model - :type embed_dim: int - - :param num_q_heads: Number of query attention heads. - :type num_q_heads: int - - :param num_kv_heads: Number of key/value attention heads. - :type num_kv_heads: int - - :param kdim: total number of features in key. Default is 0 - :type kdim: int - - :param vdim: total number of features in value. Default is 0 - :type vdim: int - - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) - - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool + :param out\_dim: dimensionality of the output space. + :type out\_dim: int - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool + :param activation: Activation function to use. Default is ActiMode.AC_MODE_NONE. + :type activation: ActiMode - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. - :type add_zero_attn: bool + :param use_bias: whether the layer uses a bias vector. Default is True. + :type use_bias: bool - :param data_type: the data type of the tensors. 
Default is DataType.DT_NONE, which means using the data type of the input tensors. - :type data_type: DataType + :param shared_op: the layer whose parameters are shared with. Default is None. + :type shared_op: Op - :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :param kernel_initializer: Initializer for the kernel weights matrix. If it is set to None, the GlorotUniformInitializer is applied. :type kernel_initializer: Initializer - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool - - :param scaling_query: Whether to apply scaling query. Default is False. - :type scaling_query: bool - - :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. - :type scaling_factor: float - - :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. - :type qk_prod_scaling: bool + :param bias_initializer: Initializer for the bias vector. If it is set to None, the ZeroInitializer is applied. + :type bias_initializer: Initializer - :param position_bias: Whether to add position bias to the QK product. Default is False. - :type position_bias: bool + :param kernel_regularizer: Regularizer for the kernel weights matrix + :type bias_initializer: Regularizer :param name: the name of the layer. Default is None. :type name: string @@ -3054,107 +2978,128 @@ def inc_multiquery_self_attention( :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) + shared_op_handle = self.__get_op_handle(shared_op) + c_activation = enum_to_int(ActiMode, activation) + c_datatype = enum_to_int(DataType, datatype) kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_inc_multiquery_self_attention( - self.handle, - input.handle, - embed_dim, - num_q_heads, - num_kv_heads, - kdim, - vdim, - dropout, - bias, - add_bias_kv, - add_zero_attn, - c_data_type, + bias_init_handle = self.__get_initializer_handle(bias_initializer) + if kernel_regularizer: + c_kernel_reg_type = enum_to_int(RegularizerMode, kernel_regularizer.type) + kernel_reg_lambda = kernel_regularizer._lambda + else: + c_kernel_reg_type = enum_to_int( + RegularizerMode, RegularizerMode.REG_MODE_NONE + ) + kernel_reg_lambda = 0.0 + handle = ffc().flexflow_model_add_dense( + self.handle, + input.handle, + out_dim, + c_activation, + use_bias, + c_datatype, + shared_op_handle, kernel_init_handle, - apply_rotary_embedding, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, + bias_init_handle, + c_kernel_reg_type, + kernel_reg_lambda, c_name, ) - self.add_layer(OpType.INC_MULTIHEAD_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.INC_MULTIHEAD_ATTENTION) + self.add_layer(OpType.LINEAR, name) + return Tensor(handle, owner_op_type=OpType.LINEAR) - def spec_inc_multiquery_self_attention( - self, - input, - embed_dim, - num_q_heads, - num_kv_heads, - kdim=0, - vdim=0, - dropout=0.0, - bias=True, - add_bias_kv=False, - add_zero_attn=False, - data_type=DataType.DT_NONE, - kernel_initializer=None, - apply_rotary_embedding=False, - scaling_query=False, - scaling_factor=1.0, - qk_prod_scaling=True, - position_bias=False, - name=None, - ): - """Defines the multi-query head attention, which allows a different number of Q and KV heads, - and takes in the tensors :attr:`input`, and uses it for all three of query, key and values. 
- This operator only supports computing the attention in inference (beam search) mode. + def concat(self, tensors, axis, name=None): + """Layer that concatenates a list of inputs. - :param input: the input Tensor. - :type input: Tensor + It takes as input a list of tensors, all of the same shape except for the concatenation axis, and returns a single tensor that is the concatenation of all inputs. - :param embed_dim: total dimension of the model - :type embed_dim: int + :param input: the list of input Tensors. + :type input: List of Tensors - :param num_q_heads: Number of query attention heads. - :type num_q_heads: int + :param axis: the dimension along which to concatenate. + :type axis: int - :param num_kv_heads: Number of key/value attention heads. - :type num_kv_heads: int + :param name: the name of the layer. Default is None. + :type name: string - :param kdim: total number of features in key. Default is 0 - :type kdim: int + :returns: Tensor -- the output tensor. + """ + assert type(tensors) is list, "tensors should be a list" + tensor_handle_list = [] + n = len(tensors) + assert n <= 256, "Please increase MAX_NUM_INPUTS" + for tensor in tensors: + tensor_handle_list.append(tensor.handle) + c_tensor_handle_list = ffi.new("flexflow_tensor_t[]", tensor_handle_list) + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_concat( + self.handle, n, c_tensor_handle_list, axis, c_name + ) + self.add_layer(OpType.CONCAT, name) + return Tensor(handle, owner_op_type=OpType.CONCAT) - :param vdim: total number of features in value. Default is 0 - :type vdim: int + def split(self, input, sizes, axis, name=None): + """Layer that splits a :attr:`input` tensor into a list of tensors. - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) + :param input: the input Tensor. + :type input: Tensor - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool + :param sizes: either an int indicating the number of splits along axis or a Python list containing the sizes of each output tensor along axis. If a scalar, then it must evenly divide :attr:`input.dims[axis]`; otherwise the sum of sizes along the split axis must match that of the :attr:`input`. + :type sizes: int or list of int - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool + :param axis: the dimension along which to split. + :type axis: int - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. - :type add_zero_attn: bool + :param name: the name of the layer. Default is None. + :type name: string - :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. - :type data_type: DataType + :returns: list of Tensors -- the output tensors. 
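# A small sketch of the concat/split pair documented above, assuming `ffmodel`,
# `t1`, and `t2` already exist and agree on every dimension except axis 1, and
# that axis 1 of the concatenated tensor is evenly divisible by 2.
merged = ffmodel.concat([t1, t2], 1)
parts = ffmodel.split(merged, 2, 1)  # two equal-sized tensors along axis 1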
+ """ + if type(sizes) is list: + split = sizes + else: + assert input.dims[axis] % sizes == 0, "Split dimension is not divisible" + split = [input.dims[axis] // sizes for i in range(sizes)] + n = len(split) + assert n <= 256, "Please increase MAX_NUM_OUTPUTS" + c_split = ffi.new("int[]", split) + c_outputs_handle_list = ffi.new("flexflow_tensor_t[256]") + c_name = get_c_name(name) + ffc().flexflow_model_add_split( + self.handle, input.handle, n, c_outputs_handle_list, c_split, axis, c_name + ) + output_tensor_list = [] + for i in range(n): + tensor_p_handle = ffi.new("flexflow_tensor_t*") + tensor_p_handle.impl = c_outputs_handle_list[i].impl + output_tensor_list.append( + Tensor(None, owner_op_type=OpType.SPLIT, p_handle=tensor_p_handle) + ) + self.add_layer(OpType.SPLIT, name) + del c_outputs_handle_list + return output_tensor_list - :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer + def flat(self, input, name=None): + """Flattens the input. Does not affect the batch size. - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + :param input: the input Tensor. + :type input: Tensor - :param scaling_query: Whether to apply scaling query. Default is False. - :type scaling_query: bool + :param name: the name of the layer. Default is None. + :type name: string - :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. - :type scaling_factor: float + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_flat(self.handle, input.handle, c_name) + self.add_layer(OpType.FLAT, name) + return Tensor(handle, owner_op_type=OpType.FLAT) - :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. - :type qk_prod_scaling: bool + def softmax(self, input, axis=-1, name=None): + """Softmax activation function. - :param position_bias: Whether to add position bias to the QK product. Default is False. - :type position_bias: bool + :param input: the input Tensor. + :type input: Tensor :param name: the name of the layer. Default is None. :type name: string @@ -3162,107 +3107,93 @@ def spec_inc_multiquery_self_attention( :returns: Tensor -- the output tensor. 
""" c_name = get_c_name(name) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_spec_inc_multiquery_self_attention( - self.handle, - input.handle, - embed_dim, - num_q_heads, - num_kv_heads, - kdim, - vdim, - dropout, - bias, - add_bias_kv, - add_zero_attn, - c_data_type, - kernel_init_handle, - apply_rotary_embedding, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, - c_name, + handle = ffc().flexflow_model_add_softmax( + self.handle, input.handle, axis, c_name ) - self.add_layer(OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION) + self.add_layer(OpType.SOFTMAX, name) + return Tensor(handle, owner_op_type=OpType.SOFTMAX) - def inc_multiquery_self_attention_verify( - self, - input, - embed_dim, - num_q_heads, - num_kv_heads, - kdim=0, - vdim=0, - dropout=0.0, - bias=True, - add_bias_kv=False, - add_zero_attn=False, - data_type=DataType.DT_NONE, - kernel_initializer=None, - apply_rotary_embedding=False, - scaling_query=False, - scaling_factor=1.0, - qk_prod_scaling=True, - position_bias=False, - name=None, - ): - """Defines the multi-query head attention, which allows a different number of Q and KV heads, - and takes in the tensors :attr:`input`, and uses it for all three of query, key and values. - This operator only supports computing the attention in inference (tree verify) mode. + def reshape(self, input, shape, name=None): + """Layer that reshapes inputs into the given shape. + + Given a :attr:`input` tensor, this operation returns a output tensor that has the same values as tensor in the same order, + except with a new shape given by :attr:`shape`. :param input: the input Tensor. :type input: Tensor - :param embed_dim: total dimension of the model - :type embed_dim: int + :param shape: A list defining the shape of the output tensor. + :type shape: list of int - :param num_q_heads: Number of query attention heads. - :type num_q_heads: int + :param name: the name of the layer. Default is None. + :type name: string - :param num_kv_heads: Number of key/value attention heads. - :type num_kv_heads: int + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + c_shape = ffi.new("int[]", shape) + handle = ffc().flexflow_model_add_reshape( + self.handle, input.handle, len(shape), c_shape, c_name + ) + self.add_layer(OpType.RESHAPE, name) + return Tensor(handle, owner_op_type=OpType.RESHAPE) - :param kdim: total number of features in key. Default is 0 - :type kdim: int + def gather(self, input, index, dim, name=None): + """Layer that gathers values along the dim axis. - :param vdim: total number of features in value. Default is 0 - :type vdim: int + :param input: the input tensor + :type input: Tensor - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) + :param index: the index tensor, which specifies the indices of elements to gather + :type index: Tensor - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool + :param dim: the axis along which to index + :type dim: int - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool + :param name: the name of the layer. Default is None + :type name: string - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. 
- :type add_zero_attn: bool + :returns: Tensor -- the output tensor + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_gather( + self.handle, input.handle, index.handle, dim, c_name + ) + self.add_layer(OpType.GATHER, name) + return Tensor(handle, owner_op_type=OpType.GATHER) - :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. - :type data_type: DataType + def transpose(self, input, perm, name=None): + """Transposes the :attr:`input` tensor. Permutes the dimensions according to perm - :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer + :param input: the input Tensor. + :type input: Tensor - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + :param perm: A permutation of the dimensions of a. + :type perm: List of int - :param scaling_query: Whether to apply scaling query. Default is False. - :type scaling_query: bool + :param name: the name of the layer. Default is None. + :type name: string - :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. - :type scaling_factor: float + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + c_perm = ffi.new("int[]", perm) + handle = ffc().flexflow_model_add_transpose( + self.handle, input.handle, len(perm), c_perm, c_name + ) + self.add_layer(OpType.TRANSPOSE, name) + return Tensor(handle, owner_op_type=OpType.TRANSPOSE) - :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. - :type qk_prod_scaling: bool + def reverse(self, input, axis, name=None): + """Layer that reverses specific dimensions of a tensor. - :param position_bias: Whether to add position bias to the QK product. Default is False. - :type position_bias: bool + Given a :attr:`input` tensor, this operation reverses the dimension :attr:`axis`. + + :param input: the input Tensor. + :type input: Tensor + + :param axis: the dimension to reverse. + :type axis: int :param name: the name of the layer. Default is None. :type name: string @@ -3270,43 +3201,20 @@ def inc_multiquery_self_attention_verify( :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_inc_multiquery_self_attention_verify( - self.handle, - input.handle, - embed_dim, - num_q_heads, - num_kv_heads, - kdim, - vdim, - dropout, - bias, - add_bias_kv, - add_zero_attn, - c_data_type, - kernel_init_handle, - apply_rotary_embedding, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, - c_name, + handle = ffc().flexflow_model_add_reverse( + self.handle, input.handle, axis, c_name ) - self.add_layer(OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION) + self.add_layer(OpType.REVERSE, name) + return Tensor(handle, owner_op_type=OpType.REVERSE) - def rms_norm(self, input, eps, dim, name=None): - """Defines the RMS Norm layer. + def scalar_multiply(self, input, scalar, inplace=True, name=None): + """Scalar multiplication of a tensor by an scalar. :param input: the input Tensor. 
:type input: Tensor - :param eps: a value added to the denominator for numerical stability - :type eps: float - - :param dim: The dimension with respect to which to take the norm - :type dim: int + :param input: the scalar + :type scalar: float :param name: the name of the layer. Default is None. :type name: string @@ -3314,26 +3222,20 @@ def rms_norm(self, input, eps, dim, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_rms_norm( - self.handle, input.handle, eps, dim, c_name + handle = ffc().flexflow_model_add_scalar_multiply( + self.handle, input.handle, scalar, inplace, c_name ) - self.add_layer(OpType.RMS_NORM, name) - return Tensor(handle, owner_op_type=OpType.RMS_NORM) - - def residual_rms_norm(self, input1, input2, eps, dim, name=None): - """Defines the Residual RMS Norm layer. + self.add_layer(OpType.SCALAR_MULTIPLY, name) + return Tensor(handle, owner_op_type=OpType.SCALAR_MULTIPLY) - :param input: the input 1 Tensor. - :type input: Tensor + def scalar_add(self, input, scalar, inplace=True, name=None): + """Scalar addition of a scalar to each entry of a tensor. - :param input: the input 2 Tensor. + :param input: the input Tensor. :type input: Tensor - :param eps: a value added to the denominator for numerical stability - :type eps: float - - :param dim: The dimension with respect to which to take the norm - :type dim: int + :param input: the scalar + :type scalar: float :param name: the name of the layer. Default is None. :type name: string @@ -3341,28 +3243,20 @@ def residual_rms_norm(self, input1, input2, eps, dim, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handles_array = ffc().flexflow_model_add_residual_rms_norm( - self.handle, input1.handle, input2.handle, eps, dim, c_name - ) - self.add_layer(OpType.RESIDUAL_RMS_NORM, name) - return Tensor(handles_array[0], owner_op_type=OpType.RESIDUAL_RMS_NORM), Tensor( - handles_array[1], owner_op_type=OpType.RESIDUAL_RMS_NORM + handle = ffc().flexflow_model_add_scalar_add( + self.handle, input.handle, scalar, inplace, c_name ) + self.add_layer(OpType.SCALAR_ADD, name) + return Tensor(handle, owner_op_type=OpType.SCALAR_ADD) - def arg_top_k(self, input, k, sorted, speculative_decoding, name=None): - """Defines the Arg TopK layer. + def scalar_sub(self, input, scalar, inplace=True, name=None): + """Scalar subtraction of a scalar to each entry of a tensor. :param input: the input Tensor. :type input: Tensor - :param k: the top k indices to select - :type k: int - - :param sorted: Whether the entries should be sorted - :type sorted: bool - - :param speculative_decoding: Whether you need to perform beam search - :type speculative_decoding: bool + :param input: the scalar + :type scalar: float :param name: the name of the layer. Default is None. :type name: string @@ -3370,23 +3264,20 @@ def arg_top_k(self, input, k, sorted, speculative_decoding, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_arg_top_k( - self.handle, input.handle, k, sorted, c_name + handle = ffc().flexflow_model_add_scalar_sub( + self.handle, input.handle, scalar, inplace, c_name ) - self.add_layer(OpType.ARG_TOPK, name) - return Tensor(handle, owner_op_type=OpType.ARG_TOPK) + self.add_layer(OpType.SCALAR_SUB, name) + return Tensor(handle, owner_op_type=OpType.SCALAR_SUB) - def beam_top_k(self, input, max_beam_size, sorted, name=None): - """Defines the Beam TopK layer. 
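# Illustrative sketch (not part of this patch): chaining a few of the tensor and
# scalar element-wise layers documented above. `ffmodel`, `x`, and `idx` are
# hypothetical placeholders for an FFModel instance and its input tensors.
g = ffmodel.gather(x, idx, dim=1)
t = ffmodel.transpose(g, [1, 0])
r = ffmodel.reverse(t, axis=0)
y = ffmodel.scalar_multiply(r, 2.0)
y = ffmodel.scalar_add(y, 1.0, inplace=False)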
+ def scalar_true_divide(self, input, scalar, inplace=True, name=None): + """Scalar regular division of a tensor by an scalar. :param input: the input Tensor. :type input: Tensor - :param max_beam_size: the top max_beam_size indices to select - :type max_beam_size: int - - :param sorted: Whether the entries should be sorted - :type sorted: bool + :param input: the scalar + :type scalar: float :param name: the name of the layer. Default is None. :type name: string @@ -3394,889 +3285,1498 @@ def beam_top_k(self, input, max_beam_size, sorted, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_beam_top_k( - self.handle, input.handle, max_beam_size, sorted, c_name + handle = ffc().flexflow_model_add_scalar_truediv( + self.handle, input.handle, scalar, inplace, c_name ) - self.add_layer(OpType.BEAM_TOPK, name) - return Tensor(handle, owner_op_type=OpType.BEAM_TOPK) + self.add_layer(OpType.SCALAR_TRUEDIV, name) + return Tensor(handle, owner_op_type=OpType.SCALAR_TRUEDIV) - def sampling(self, input, top_p, name=None): - """Defines the Sampling layer. + def gelu(self, input, inplace=True, name=None): + """Gaussian Error Linear Unit activation function. :param input: the input Tensor. :type input: Tensor - :param top_p: The top_p parameter of the sampling - :type top_p: float - :param name: the name of the layer. Default is None. :type name: string :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_sampling( - self.handle, input.handle, top_p, c_name - ) - self.add_layer(OpType.SAMPLING, name) - return Tensor(handle, owner_op_type=OpType.SAMPLING) + handle = ffc().flexflow_model_add_gelu(self.handle, input.handle, c_name) + self.add_layer(OpType.GELU, name) + return Tensor(handle, owner_op_type=OpType.GELU) - def argmax(self, input, beam_search, name=None): - """Defines the Sampling layer. + def relu(self, input, inplace=True, name=None): + """Rectified Linear Unit activation function. :param input: the input Tensor. :type input: Tensor - :param beam_search: Whether you need to perform beam search - :type beam_search: bool - :param name: the name of the layer. Default is None. :type name: string :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_argmax( - self.handle, input.handle, beam_search, c_name + handle = ffc().flexflow_model_add_relu( + self.handle, input.handle, inplace, c_name ) - self.add_layer(OpType.ARGMAX, name) - return Tensor(handle, owner_op_type=OpType.ARGMAX) + self.add_layer(OpType.RELU, name) + return Tensor(handle, owner_op_type=OpType.RELU) - def reset_metrics(self): - """Reset performance metrics. + def identity(self, input, name=None): + """Identity function. - :returns: None -- no returns. - """ - ffc().flexflow_model_reset_metrics(self.handle) + :param input: the input Tensor. + :type input: Tensor - def init_layers(self): - """Initialize layers. + :param name: the name of the layer. Default is None. + :type name: string - :returns: None -- no returns. + :returns: Tensor -- the output tensor. 
""" - ffc().flexflow_model_init_layers(self.handle) + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_identity(self.handle, input.handle, c_name) + self.add_layer(OpType.IDENTITY, name) + return Tensor(handle, owner_op_type=OpType.IDENTITY) - def prefetch(self): - ffc().flexflow_model_prefetch(self.handle) + def sigmoid(self, input, name=None): + """Sigmoid activation function, :math:`sigmoid(x) = 1 / (1 + exp(-x))`. - def forward(self, seq_length=None): - """Forward propagation of all layers. + :param input: the input Tensor. + :type input: Tensor - :returns: None -- no returns. + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. """ - if seq_length is None: - seq_length = -1 - ffc().flexflow_model_forward(self.handle, seq_length) + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_sigmoid(self.handle, input.handle, c_name) + self.add_layer(OpType.SIGMOID, name) + return Tensor(handle, owner_op_type=OpType.SIGMOID) - # TODO: seperate compute_metrics from backward - def backward(self, seq_length=None): - """Backward propagation of all layers. + def tanh(self, input, name=None): + """Hyperbolic tangent activation function. - :returns: None -- no returns. - """ - if seq_length is None: - seq_length = -1 - ffc().flexflow_model_backward(self.handle, seq_length) + :param input: the input Tensor. + :type input: Tensor - def compute_metrics(self): - """Compute performance metrics. + :param name: the name of the layer. Default is None. + :type name: string - :returns: None -- no returns. + :returns: Tensor -- the output tensor. """ - ffc().flexflow_model_compute_metrics(self.handle) - - def update(self): - """Update weights and biases of all layers. - - :returns: None -- no returns. - """ - ffc().flexflow_model_update(self.handle) - - def compile(self, optimizer=None, loss_type=None, metrics=None, comp_mode=None): - """Configure the model for trainting. FlexFlow uses lazy initialization, - so the actual creating of all operations (including creating and partitioning - of weight, bias and output tensors) happen during compile. - - :param optimizer: optimizer instance. - :type optimizer: Optimizer + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_tanh(self.handle, input.handle, c_name) + self.add_layer(OpType.TANH, name) + return Tensor(handle, owner_op_type=OpType.TANH) - :param loss_type: Enum of LossType. - Options are LOSS_CATEGORICAL_CROSSENTROPY, LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, - LOSS_MEAN_SQUARED_ERROR_AVG_REDUCE and LOSS_MEAN_SQUARED_ERROR_SUM_REDUCE. - :type loss_type: LossType + def elu(self, input, inplace=True, name=None): + """Exponential Linear Unit. activation function. - :param metrics: List of metrics to be evaluated by the model during training and testing. - Each of this is a Enum of MetricsType. Options are METRICS_ACCURACY, - METRICS_CATEGORICAL_CROSSENTROPY, METRICS_SPARSE_CATEGORICAL_CROSSENTROPY, - METRICS_MEAN_SQUARED_ERROR, METRICS_ROOT_MEAN_SQUARED_ERROR, METRICS_MEAN_ABSOLUTE_ERROR - :type metrics: MetricsType + :param input: the input Tensor. + :type input: Tensor - :param comp_mode: Enum of CompMode. - Options are COMP_MODE_TRAINING, COMP_MODE_INFERENCE - :type comp_mode: CompMode + :param name: the name of the layer. Default is None. + :type name: string - :returns: None -- no returns. + :returns: Tensor -- the output tensor. 
""" - self.optimizer = optimizer - - c_loss_type = enum_to_int(LossType, loss_type) - metrics_int = [] - for metric in metrics: - metrics_int.append(enum_to_int(MetricsType, metric)) - c_metrics = ffi.new("int[]", metrics_int) - if comp_mode == None: - comp_mode = CompMode.TRAINING - c_comp_mode = enum_to_int(CompMode, comp_mode) - ffc().flexflow_model_compile( - self.handle, c_loss_type, c_metrics, len(metrics), c_comp_mode + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_elu( + self.handle, input.handle, inplace, c_name ) - for ff_tensor, np_tensor in self.attr_tensors.items(): - ff_tensor.set_tensor(self, np_tensor) - print("Compiled ffmodel!") - - def fit(self, x=None, y=None, batch_size=None, epochs=1): - """Trains the model for a fixed number of epochs (iterations on a dataset). - - :param x: Input data. It can be a Dataloader instance or a list of Dataloader instances. - :type x: Dataloader - - :param y: Target data (label). It can be a Dataloader instance or a list of Dataloader instances. - :type y: Dataloader - - :param batch_size: Number of samples per gradient update. It must be identical with :attr:`-b` - or :attr:`--batch-size` from the command line. - :type batch_size: int - - :param epochs: Number of epochs to train the model. - An epoch is an iteration over the entire :attr:`x` and :attr:`y` data provided. - The default value is 1. - :type epochs: int - - :returns: None -- no returns. - """ - if isinstance(x, list) == False: - dataloaders = [x] - else: - dataloaders = x - dataloaders.append(y) - - num_samples = y.num_samples - batch_size = self._ffconfig.batch_size - self._tracing_id += 1 # get a new tracing id - for epoch in range(0, epochs): - for d in dataloaders: - d.reset() - self.reset_metrics() - iterations = num_samples / batch_size - for iter in range(0, int(iterations)): - self._ffconfig.begin_trace(self._tracing_id) - for d in dataloaders: - d.next_batch(self) - self.forward() - self.zero_gradients() - self.backward() - self.update() - self._ffconfig.end_trace(self._tracing_id) + self.add_layer(OpType.ELU, name) + return Tensor(handle, owner_op_type=OpType.ELU) - def eval(self, x=None, y=None, batch_size=None): - """Returns the loss value & metrics values for the model in test mode. + def dropout(self, input, rate, seed, name=None): + """The Dropout layer randomly sets input units to 0 with + a frequency of :attr:`rate` at each step during training time, + which helps prevent overfitting. + Inputs not set to 0 are scaled up by 1/(1 - rate) such that the + sum over all inputs is unchanged. - :param x: Input data. It can be a Dataloader instance or a list of Dataloader instances. - :type x: Dataloader + :param input: the input Tensor. + :type input: Tensor - :param y: Target data (label). It can be a Dataloader instance or a list of Dataloader instances. - :type y: Dataloader + :param rate: Fraction of the input units to drop. + :type rate: float(0-1) - :param batch_size: Number of samples per gradient update. It must be identical with :attr:`-b` - or :attr:`--batch-size` from the command line. - :type batch_size: int + :param seed: random seed. + :type seed: int - :param epochs: Number of epochs to train the model. - An epoch is an iteration over the entire :attr:`x` and :attr:`y` data provided. - The default value is 1. - :type epochs: int + :param name: the name of the layer. Default is None. + :type name: string - :returns: None -- no returns. + :returns: Tensor -- the output tensor. 
""" - if isinstance(x, list) == False: - dataloaders = [x] - else: - dataloaders = x - dataloaders.append(y) - - num_samples = y.num_samples - batch_size = self._ffconfig.batch_size - for d in dataloaders: - d.reset() - self.reset_metrics() - iterations = num_samples / batch_size - self._tracing_id += 1 # get a new tracing id - for iter in range(0, int(iterations)): - for d in dataloaders: - d.next_batch(self) - self._ffconfig.begin_trace(self._tracing_id) - self.forward() - self.compute_metrics() - self._ffconfig.end_trace(self._tracing_id) - - def zero_gradients(self): - """Empty the gradients of all layers. + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_dropout( + self.handle, input.handle, rate, seed, c_name + ) + self.add_layer(OpType.DROPOUT, name) + return Tensor(handle, owner_op_type=OpType.DROPOUT) - :returns: None -- no returns. - """ - ffc().flexflow_model_zero_gradients(self.handle) + def multihead_attention( + self, + query, + key, + value, + embed_dim, + num_heads, + kdim=0, + vdim=0, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + kernel_initializer=None, + name=None, + ): + """Defines the MultiHead Attention operation as described in Attention Is All You Need + which takes in the tensors :attr:`query`, :attr:`key`, and :attr:`value`, + and returns the dot-product attention between them:. - def set_optimizer(self, optimizer): - if isinstance(optimizer, SGDOptimizer) == True: - ffc().flexflow_model_set_sgd_optimizer(self.handle, optimizer.handle) - elif isinstance(optimizer, AdamOptimizer) == True: - ffc().flexflow_model_set_adam_optimizer(self.handle, optimizer.handle) - elif optimizer == None: - pass - else: - assert 0, "[Model]: unknown optimizer" + :param query: the query Tensor. + :type query: Tensor - optimizer = property(fset=set_optimizer) + :param key: the key Tensor. + :type key: Tensor - def print_layers(self, id=-1): - ffc().flexflow_model_print_layers(self.handle, id) + :param value: the value Tensor. + :type value: Tensor - def get_layer_by_id(self, layer_id): - return self._layers[layer_id] + :param embed_dim: total dimension of the model + :type embed_dim: int - def get_last_layer(self): - return self._layers[self._nb_layers - 1] + :param num_heads: Number of attention heads. + :type num_heads: int - def get_layer_by_name(self, layer_name): - for layer_id in self._layers: - layer = self._layers[layer_id] - if layer.name == layer_name: - return layer - assert 0, f"Cannot find the layer with name {layer_name}" - return None + :param kdim: total number of features in key. Default is 0 + :type kdim: int - def get_tensor_by_id(self, id): - handle = ffc().flexflow_model_get_parameter_by_id(self.handle, id) - return Parameter(handle) + :param vdim: total number of features in value. Default is 0 + :type vdim: int - @property - def label_tensor(self): - handle = ffc().flexflow_model_get_label_tensor(self.handle) - return Tensor(handle, deallocate=False) + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) - def get_perf_metrics(self): - handle = ffc().flexflow_model_get_perf_metrics(self.handle) - return PerfMetrics(handle) + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool - def set_transformer_layer_id(self, id): - ffc().flexflow_model_set_transformer_layer_id(self.handle, id) + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. 
+ :type add_bias_kv: bool - def create_data_loader(self, batch_tensor, full_array): - """Create a SingleDataloader instance. + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool - :param batch_tensor: a batch-sized tensor. Usually it is a input tensor of the model. - :type batch_tensor: Tensor + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer - :param full_array: the entire data. - :type full_array: Numpy Array + :param name: the name of the layer. Default is None. + :type name: string - :returns: SingleDataloader -- returns a dataloader instance. + :returns: Tensor -- the output tensor. """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + handle = ffc().flexflow_model_add_multihead_attention( + self.handle, + query.handle, + key.handle, + value.handle, + embed_dim, + num_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + kernel_init_handle, + c_name, + ) + self.add_layer(OpType.MULTIHEAD_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.MULTIHEAD_ATTENTION) - if self._ffconfig.enable_control_replication: - assert ( - self._ffconfig.python_data_loader_type != 1 - ), "To enable control replication, please set --python-data-loader-type 2" - return self.__create_data_loader_ptr(batch_tensor, full_array) - else: - if self._ffconfig.python_data_loader_type == 1: - return self.__create_data_loader_attach(batch_tensor, full_array) - else: - return self.__create_data_loader_ptr(batch_tensor, full_array) - - def __create_data_loader_attach(self, batch_tensor, full_array): - full_array_shape = full_array.shape - num_samples = full_array_shape[0] - num_dim = len(full_array_shape) - if full_array.dtype == "float16": - datatype = DataType.DT_HALF - elif full_array.dtype == "float32": - datatype = DataType.DT_FLOAT - elif full_array.dtype == "int32": - datatype = DataType.DT_INT32 - elif full_array.dtype == "int64": - datatype = DataType.DT_INT64 - else: - assert 0, "unsupported datatype" + def inc_multihead_self_attention( + self, + input, + embed_dim, + num_heads, + kdim=0, + vdim=0, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + name=None, + ): + """Defines the MultiHead Attention operation as described in Attention Is All You Need + which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. + In inference mode, the attention is computed using incremental decoding. + + :param input: the input Tensor. + :type input: Tensor + + :param embed_dim: total dimension of the model + :type embed_dim: int + + :param num_heads: Number of attention heads. + :type num_heads: int + + :param kdim: total number of features in key. Default is 0 + :type kdim: int + + :param vdim: total number of features in value. Default is 0 + :type vdim: int + + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) + + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool + + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. 
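# Illustrative sketch (not part of this patch): the multi-head attention entry
# point above takes separate query/key/value tensors, while the incremental
# decoding variants that follow take a single input tensor. `ffmodel`, `q`,
# `k`, and `v` are hypothetical placeholders.
attn = ffmodel.multihead_attention(q, k, v, embed_dim=512, num_heads=8, dropout=0.1)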
+ :type add_bias_kv: bool + + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool + + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType + + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. + :type apply_rotary_embedding: bool + + :param scaling_query: Whether to apply scaling query. Default is False. + :type scaling_query: bool + + :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. + :type scaling_factor: float + + :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. + :type qk_prod_scaling: bool + + :param position_bias: Whether to add position bias to the QK product. Default is False. + :type position_bias: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc().flexflow_model_add_inc_multihead_self_attention( + self.handle, + input.handle, + embed_dim, + num_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + c_data_type, + kernel_init_handle, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + c_name, + ) + self.add_layer(OpType.INC_MULTIHEAD_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.INC_MULTIHEAD_ATTENTION) + + def spec_inc_multihead_self_attention( + self, + input, + embed_dim, + num_heads, + kdim=0, + vdim=0, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + name=None, + ): + """Defines the MultiHead Attention operation as described in Attention Is All You Need + which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. + This operator only supports computing the attention in inference (beam search) mode. + + :param input: the input Tensor. + :type input: Tensor + + :param embed_dim: total dimension of the model + :type embed_dim: int + + :param num_heads: Number of attention heads. + :type num_heads: int + + :param kdim: total number of features in key. Default is 0 + :type kdim: int + + :param vdim: total number of features in value. Default is 0 + :type vdim: int + + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) + + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool + + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. + :type add_bias_kv: bool + + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool + + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType + + :param kernel_initializer: Initializer for dense layer kernels. 
If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. + :type apply_rotary_embedding: bool + + :param scaling_query: Whether to apply scaling query. Default is False. + :type scaling_query: bool + + :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. + :type scaling_factor: float + + :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. + :type qk_prod_scaling: bool + + :param position_bias: Whether to add position bias to the QK product. Default is False. + :type position_bias: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc().flexflow_model_add_spec_inc_multihead_self_attention( + self.handle, + input.handle, + embed_dim, + num_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + c_data_type, + kernel_init_handle, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + c_name, + ) + self.add_layer(OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION) + + def inc_multihead_self_attention_verify( + self, + input, + embed_dim, + num_heads, + kdim=0, + vdim=0, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + name=None, + ): + """Defines the MultiHead Attention operation as described in Attention Is All You Need + which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. + This operator only supports computing the attention in inference (tree verify) mode. + + :param input: the input Tensor. + :type input: Tensor + + :param embed_dim: total dimension of the model + :type embed_dim: int + + :param num_heads: Number of attention heads. + :type num_heads: int + + :param kdim: total number of features in key. Default is 0 + :type kdim: int + + :param vdim: total number of features in value. Default is 0 + :type vdim: int + + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) + + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool + + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. + :type add_bias_kv: bool + + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool + + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType + + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. + :type apply_rotary_embedding: bool + + :param scaling_query: Whether to apply scaling query. Default is False. + :type scaling_query: bool + + :param scaling_factor: The scaling factor to use for scaling. 
Default is 1.0. + :type scaling_factor: float + + :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. + :type qk_prod_scaling: bool + + :param position_bias: Whether to add position bias to the QK product. Default is False. + :type position_bias: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc().flexflow_model_add_inc_multihead_self_attention_verify( + self.handle, + input.handle, + embed_dim, + num_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + c_data_type, + kernel_init_handle, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + c_name, + ) + self.add_layer(OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION) + + def inc_multiquery_self_attention( + self, + input, + embed_dim, + num_q_heads, + num_kv_heads, + kdim=0, + vdim=0, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + name=None, + ): + """Defines the multi-query head attention, which allows a different number of Q and KV heads, + and takes in the tensors :attr:`input`, and uses it for all three of query, key and values. + In inference mode, the attention is computed using incremental decoding. + + :param input: the input Tensor. + :type input: Tensor + + :param embed_dim: total dimension of the model + :type embed_dim: int + + :param num_q_heads: Number of query attention heads. + :type num_q_heads: int + + :param num_kv_heads: Number of key/value attention heads. + :type num_kv_heads: int + + :param kdim: total number of features in key. Default is 0 + :type kdim: int + + :param vdim: total number of features in value. Default is 0 + :type vdim: int + + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) + + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool + + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. + :type add_bias_kv: bool + + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool + + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType + + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. + :type apply_rotary_embedding: bool + + :param scaling_query: Whether to apply scaling query. Default is False. + :type scaling_query: bool + + :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. + :type scaling_factor: float + + :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. + :type qk_prod_scaling: bool + + :param position_bias: Whether to add position bias to the QK product. Default is False. 
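# Illustrative sketch (not part of this patch): calling the multi-query
# incremental-decoding attention documented above, which allows fewer KV heads
# than Q heads. `ffmodel`, `hidden`, and the head counts are hypothetical.
attn_out = ffmodel.inc_multiquery_self_attention(
    hidden,
    embed_dim=4096,
    num_q_heads=32,
    num_kv_heads=8,
    apply_rotary_embedding=True,
)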
+ :type position_bias: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc().flexflow_model_add_inc_multiquery_self_attention( + self.handle, + input.handle, + embed_dim, + num_q_heads, + num_kv_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + c_data_type, + kernel_init_handle, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + c_name, + ) + self.add_layer(OpType.INC_MULTIHEAD_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.INC_MULTIHEAD_ATTENTION) + + def spec_inc_multiquery_self_attention( + self, + input, + embed_dim, + num_q_heads, + num_kv_heads, + kdim=0, + vdim=0, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + name=None, + ): + """Defines the multi-query head attention, which allows a different number of Q and KV heads, + and takes in the tensors :attr:`input`, and uses it for all three of query, key and values. + This operator only supports computing the attention in inference (beam search) mode. - if num_dim == 2: - full_tensor = self.create_tensor( - [num_samples, full_array_shape[1]], datatype - ) - self.map_tensor(full_tensor) - elif num_dim == 4: - full_tensor = self.create_tensor( - [ - num_samples, - full_array_shape[1], - full_array_shape[2], - full_array_shape[3], - ], - datatype, - ) - self.map_tensor(full_tensor) - else: - assert 0, "unsupported dims" + :param input: the input Tensor. + :type input: Tensor - full_tensor.attach_numpy_array(self._ffconfig, full_array) - dataloader = SingleDataLoader( - self, batch_tensor, full_tensor, num_samples, datatype - ) - full_tensor.detach_numpy_array(self._ffconfig) + :param embed_dim: total dimension of the model + :type embed_dim: int - return dataloader + :param num_q_heads: Number of query attention heads. + :type num_q_heads: int - def __create_data_loader_ptr(self, batch_tensor, full_array): - full_array_shape = full_array.shape - num_samples = full_array_shape[0] - if full_array.dtype == "float16": - datatype = DataType.DT_HALF - elif full_array.dtype == "float32": - datatype = DataType.DT_FLOAT - elif full_array.dtype == "int32": - datatype = DataType.DT_INT32 - elif full_array.dtype == "int64": - datatype = DataType.DT_INT64 - else: - assert 0, "unsupported datatype" - np_raw_ptr = full_array.__array_interface__["data"] - raw_ptr = ffi.cast("float*", np_raw_ptr[0]) - print( - "numpy array: %s, %s, %s" - % (str(np_raw_ptr), str(raw_ptr), hex(np_raw_ptr[0])) - ) - dataloader = SingleDataLoader( - self, batch_tensor, raw_ptr, num_samples, datatype - ) + :param num_kv_heads: Number of key/value attention heads. + :type num_kv_heads: int - return dataloader + :param kdim: total number of features in key. Default is 0 + :type kdim: int - def __get_initializer_handle(self, initializer): - if initializer == None: - null_initializer = Initializer(None) - return null_initializer.handle - else: - return initializer.handle + :param vdim: total number of features in value. 
Default is 0 + :type vdim: int - def __get_op_handle(self, shared_op): - if shared_op == None: - op_handle = ffi.new("flexflow_op_t *") - op_handle.impl = ffi.NULL - op = Op(op_handle[0]) - else: - op = shared_op - return op.handle + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) - def get_output_tensor(self, ffmodel, data_type): - shape = self.dims - if data_type == DataType.DT_HALF: - np_array = np.empty(shape, dtype=np.float16) - elif data_type == DataType.DT_FLOAT: - np_array = np.empty(shape, dtype=np.float32) - elif self.data_type == DataType.DT_INT32: - np_array = np.empty(shape, dtype=np.int32) - elif self.data_type == DataType.DT_INT64: - np_array = np.empty(shape, dtype=np.int64) - else: - assert 0, f"Unsupported datatype: {self.data_type}" - np_raw_ptr = np_array.__array_interface__["data"] - if np_array.dtype == np.float32: - raw_ptr = ffi.cast("float*", np_raw_ptr[0]) - ret_val = ffc().flexflow_tensor_get_tensor_float( - self.handle, ffmodel.handle, raw_ptr, False - ) - elif np_array.dtype == np.int32: - raw_ptr = ffi.cast("int*", np_raw_ptr[0]) - ret_val = ffc().flexflow_tensor_get_tensor_int( - self.handle, ffmodel.handle, raw_ptr, False - ) - elif np_array.dtype == np.int64: - raw_ptr = ffi.cast("int64_t*", np_raw_ptr[0]) - ret_val = ffc().flexflow_tensor_get_tensor_int64( - self.handle, ffmodel.handle, raw_ptr, False - ) - fflogger.debug( - "get weights raw_ptr: %s, %s, %s, %s" - % (str(raw_ptr), str(np_raw_ptr[0]), hex(np_raw_ptr[0]), str(shape)) - ) - assert ret_val == True - return np_array + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool - def generate(self, prompt_list, max_sequence_length): - assert isinstance(prompt_list, list) - c_input_texts = [get_c_name(prompt) for prompt in prompt_list] - max_num_chars = 5 * (max_sequence_length + 100) - c_output_texts = [ffi.new("char[]", max_num_chars) for prompt in prompt_list] - c_output_length_and_tokens = [ffi.new("int[]", max_sequence_length + 100) for prompt in prompt_list] - ffc().flexflow_model_generate( - self.handle, - len(prompt_list), - c_input_texts, - max_num_chars, - c_output_texts, - max_sequence_length, - c_output_length_and_tokens, - ) - #output_length = c_output_length_and_tokens[0] - #output_tokens = [] - #for i in range(output_length): - # output_tokens.append(c_output_length_and_tokens[i + 1]) - from flexflow.serve import GenerationResult + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. + :type add_bias_kv: bool - return [GenerationResult(ffi.string(c_output_text), []) for c_output_text in c_output_texts] + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool - def set_position_offset(self, offset): - ffc().flexflow_model_set_position_offset(self.handle, offset) + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer -# ----------------------------------------------------------------------- -# SGDOptimizer -# ----------------------------------------------------------------------- + :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. 
+ :type apply_rotary_embedding: bool + :param scaling_query: Whether to apply scaling query. Default is False. + :type scaling_query: bool -class SGDOptimizer(object): - __slots__ = ["handle", "_handle"] + :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. + :type scaling_factor: float - def __init__( - self, ffmodel, lr=0.01, momentum=0.0, nesterov=False, weight_decay=0.0 - ): - self.handle = ffc().flexflow_sgd_optimizer_create( - ffmodel.handle, lr, momentum, nesterov, weight_decay + :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. + :type qk_prod_scaling: bool + + :param position_bias: Whether to add position bias to the QK product. Default is False. + :type position_bias: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc().flexflow_model_add_spec_inc_multiquery_self_attention( + self.handle, + input.handle, + embed_dim, + num_q_heads, + num_kv_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + c_data_type, + kernel_init_handle, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + c_name, ) - self._handle = ffi.gc(self.handle, ffc().flexflow_sgd_optimizer_destroy) + self.add_layer(OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION) - def set_learning_rate(self, learning_rate): - ffc().flexflow_sgd_optimizer_set_lr(self.handle, learning_rate) + def inc_multiquery_self_attention_verify( + self, + input, + embed_dim, + num_q_heads, + num_kv_heads, + kdim=0, + vdim=0, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + name=None, + ): + """Defines the multi-query head attention, which allows a different number of Q and KV heads, + and takes in the tensors :attr:`input`, and uses it for all three of query, key and values. + This operator only supports computing the attention in inference (tree verify) mode. + + :param input: the input Tensor. + :type input: Tensor + :param embed_dim: total dimension of the model + :type embed_dim: int -# ----------------------------------------------------------------------- -# AdamOptimizer -# ----------------------------------------------------------------------- + :param num_q_heads: Number of query attention heads. + :type num_q_heads: int + :param num_kv_heads: Number of key/value attention heads. + :type num_kv_heads: int -class AdamOptimizer(object): - __slots__ = ["handle", "_handle"] + :param kdim: total number of features in key. Default is 0 + :type kdim: int - def __init__( - self, - ffmodel, - alpha=0.001, - beta1=0.9, - beta2=0.999, - weight_decay=0.0, - epsilon=1e-8, - ): - self.handle = ffc().flexflow_adam_optimizer_create( - ffmodel.handle, alpha, beta1, beta2, weight_decay, epsilon - ) - self._handle = ffi.gc(self.handle, ffc().flexflow_adam_optimizer_destroy) + :param vdim: total number of features in value. Default is 0 + :type vdim: int + + :param dropout: a Dropout layer on attn_output_weights. 
Default is 0.0 + :type dropout: float(0-1) + + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool + + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. + :type add_bias_kv: bool + + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool - def set_learning_rate(self, learning_rate): - ffc().flexflow_adam_optimizer_set_lr(self.handle, learning_rate) + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer -# ----------------------------------------------------------------------- -# Initializer -# ----------------------------------------------------------------------- -class Initializer(object): - __slots__ = ["handle", "p_handle"] + :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. + :type apply_rotary_embedding: bool - def __init__(self, handle, p_handle=0): - self.p_handle = ffi.new("flexflow_initializer_t *") - if handle == None: - self.p_handle.impl = ffi.NULL - else: - self.p_handle.impl = handle.impl - self.handle = self.p_handle[0] - assert ffi.typeof(self.handle) == ffi.typeof( - "flexflow_initializer_t" - ), "Initializer handle is wrong" + :param scaling_query: Whether to apply scaling query. Default is False. + :type scaling_query: bool + :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. + :type scaling_factor: float -# ----------------------------------------------------------------------- -# GlorotUniform -# ----------------------------------------------------------------------- + :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. + :type qk_prod_scaling: bool + :param position_bias: Whether to add position bias to the QK product. Default is False. + :type position_bias: bool -class GlorotUniformInitializer(Initializer): - __slots__ = ["glorot_handle", "_glorot_handle"] + :param name: the name of the layer. Default is None. + :type name: string - def __init__(self, seed): - self.glorot_handle = ffc().flexflow_glorot_uniform_initializer_create(seed) - self._glorot_handle = ffi.gc( - self.glorot_handle, ffc().flexflow_glorot_uniform_initializer_destroy + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc().flexflow_model_add_inc_multiquery_self_attention_verify( + self.handle, + input.handle, + embed_dim, + num_q_heads, + num_kv_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + c_data_type, + kernel_init_handle, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + c_name, ) - super(GlorotUniformInitializer, self).__init__(self.glorot_handle) + self.add_layer(OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION) + def rms_norm(self, input, eps, dim, name=None): + """Defines the RMS Norm layer. 
-# ----------------------------------------------------------------------- -# ZeroInitializer -# ----------------------------------------------------------------------- + :param input: the input Tensor. + :type input: Tensor + + :param eps: a value added to the denominator for numerical stability + :type eps: float + :param dim: The dimension with respect to which to take the norm + :type dim: int -class ZeroInitializer(Initializer): - __slots__ = ["zero_handle", "_zero_handle"] + :param name: the name of the layer. Default is None. + :type name: string - def __init__(self): - self.zero_handle = ffc().flexflow_zero_initializer_create() - self._zero_handle = ffi.gc( - self.zero_handle, ffc().flexflow_zero_initializer_destroy + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_rms_norm( + self.handle, input.handle, eps, dim, c_name ) - super(ZeroInitializer, self).__init__(self.zero_handle) + self.add_layer(OpType.RMS_NORM, name) + return Tensor(handle, owner_op_type=OpType.RMS_NORM) + def residual_rms_norm( + self, input1, input2, eps, dim, inplace_residual=False, name=None + ): + """Defines the Residual RMS Norm layer. -# ----------------------------------------------------------------------- -# UniformInitializer -# ----------------------------------------------------------------------- + :param input: the input 1 Tensor. + :type input: Tensor + :param input: the input 2 Tensor. + :type input: Tensor -class UniformInitializer(Initializer): - __slots__ = ["uniform_handle", "_uniform_handle"] + :param eps: a value added to the denominator for numerical stability + :type eps: float - def __init__(self, seed, minv, maxv): - self.uniform_handle = ffc().flexflow_uniform_initializer_create( - seed, minv, maxv + :param dim: The dimension with respect to which to take the norm + :type dim: int + + :param name: the name of the layer. Default is None. + :type name: string + + :param inplace_residual: whether to compute the residual inplace using the input tensor. Default is False. + :type inplace_residual: bool + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handles_array = ffc().flexflow_model_add_residual_rms_norm( + self.handle, + input1.handle, + input2.handle, + eps, + dim, + inplace_residual, + c_name, ) - self._uniform_handle = ffi.gc( - self.uniform_handle, ffc().flexflow_uniform_initializer_destroy + self.add_layer(OpType.RESIDUAL_RMS_NORM, name) + return ( + Tensor(handles_array[0], owner_op_type=OpType.RESIDUAL_RMS_NORM), + Tensor(handles_array[1], owner_op_type=OpType.RESIDUAL_RMS_NORM), ) - super(UniformInitializer, self).__init__(self.uniform_handle) + def arg_top_k(self, input, k, sorted, speculative_decoding, name=None): + """Defines the Arg TopK layer. -# ----------------------------------------------------------------------- -# NormInitializer -# ----------------------------------------------------------------------- + :param input: the input Tensor. 
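# Illustrative sketch (not part of this patch): the RMS-norm entry points
# documented above. `ffmodel`, `hidden`, and `attn_out` are hypothetical
# placeholders; residual_rms_norm returns a pair of tensors, matching the
# handles_array unpacking shown above.
normed = ffmodel.rms_norm(hidden, eps=1e-6, dim=4096)
res, res_normed = ffmodel.residual_rms_norm(
    hidden, attn_out, eps=1e-6, dim=4096, inplace_residual=False
)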
+ :type input: Tensor + :param k: the top k indices to select + :type k: int -class NormInitializer(Initializer): - __slots__ = ["norm_handle", "_norm_handle"] + :param sorted: Whether the entries should be sorted + :type sorted: bool - def __init__(self, seed, mean, stddev): - self.norm_handle = ffc().flexflow_norm_initializer_create(seed, mean, stddev) - self._norm_handle = ffi.gc( - self.norm_handle, ffc().flexflow_norm_initializer_destroy - ) - super(NormInitializer, self).__init__(self.norm_handle) + :param speculative_decoding: Whether you need to perform beam search + :type speculative_decoding: bool + :param name: the name of the layer. Default is None. + :type name: string -# ----------------------------------------------------------------------- -# PerfMetrics -# ----------------------------------------------------------------------- + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_arg_top_k( + self.handle, input.handle, k, sorted, c_name + ) + self.add_layer(OpType.ARG_TOPK, name) + return Tensor(handle, owner_op_type=OpType.ARG_TOPK) + def beam_top_k(self, input, max_beam_size, sorted, name=None): + """Defines the Beam TopK layer. -class PerfMetrics(object): - __slots__ = ["handle", "_handle"] + :param input: the input Tensor. + :type input: Tensor - def __init__(self, handle): - self.handle = handle - self._handle = ffi.gc(self.handle, ffc().flexflow_per_metrics_destroy) + :param max_beam_size: the top max_beam_size indices to select + :type max_beam_size: int - def get_accuracy(self): - return ffc().flexflow_per_metrics_get_accuracy(self.handle) + :param sorted: Whether the entries should be sorted + :type sorted: bool + :param name: the name of the layer. Default is None. + :type name: string -# ----------------------------------------------------------------------- -# NetConfig -# ----------------------------------------------------------------------- + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_beam_top_k( + self.handle, input.handle, max_beam_size, sorted, c_name + ) + self.add_layer(OpType.BEAM_TOPK, name) + return Tensor(handle, owner_op_type=OpType.BEAM_TOPK) + def sampling(self, input, top_p, name=None): + """Defines the Sampling layer. -class NetConfig(object): - def __init__(self): - self.handle = ffc().flexflow_net_config_create() - self._handle = ffi.gc(self.handle, ffc().flexflow_net_config_destroy) - cpath = ffc().flexflow_net_config_get_dataset_path(self.handle) - self.dataset_path = ffi.string(cpath) + :param input: the input Tensor. + :type input: Tensor + :param top_p: The top_p parameter of the sampling + :type top_p: float -# ----------------------------------------------------------------------- -# DLRMConfig -# ----------------------------------------------------------------------- + :param name: the name of the layer. Default is None. + :type name: string + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_sampling( + self.handle, input.handle, top_p, c_name + ) + self.add_layer(OpType.SAMPLING, name) + return Tensor(handle, owner_op_type=OpType.SAMPLING) -class DLRMConfig(object): - def __init__(self): - self.handle = ffc().flexflow_dlrm_config_create() - self._handle = ffi.gc(self.handle, ffc().flexflow_dlrm_config_destroy) + def argmax(self, input, beam_search, name=None): + """Defines the Sampling layer. 
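# Illustrative sketch (not part of this patch): the decoding heads documented
# above applied to a logits tensor. `ffmodel` and `logits` are hypothetical.
ids = ffmodel.argmax(logits, beam_search=False)
sampled_ids = ffmodel.sampling(logits, top_p=0.9)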
- cstr = ffc().flexflow_dlrm_config_get_dataset_path(self.handle) - self.dataset_path = ffi.string(cstr) + :param input: the input Tensor. + :type input: Tensor - cstr = ffc().flexflow_dlrm_config_get_arch_interaction_op(self.handle) - self.arch_interaction_op = ffi.string(cstr) + :param beam_search: Whether you need to perform beam search + :type beam_search: bool - self.sparse_feature_size = ffc().flexflow_dlrm_config_get_sparse_feature_size( - self.handle - ) - self.sigmoid_bot = ffc().flexflow_dlrm_config_get_sigmoid_bot(self.handle) - self.sigmoid_top = ffc().flexflow_dlrm_config_get_sigmoid_top(self.handle) - self.embedding_bag_size = ffc().flexflow_dlrm_config_get_embedding_bag_size( - self.handle - ) - self.loss_threshold = ffc().flexflow_dlrm_config_get_loss_threshold(self.handle) + :param name: the name of the layer. Default is None. + :type name: string - mlp_bot_c = ffc().flexflow_dlrm_config_get_mlp_bot(self.handle) - self.mlp_bot = [] - for i in range(0, mlp_bot_c[0]): - self.mlp_bot.append(mlp_bot_c[i + 1]) + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_argmax( + self.handle, input.handle, beam_search, c_name + ) + self.add_layer(OpType.ARGMAX, name) + return Tensor(handle, owner_op_type=OpType.ARGMAX) - mlp_top_c = ffc().flexflow_dlrm_config_get_mlp_top(self.handle) - self.mlp_top = [] - for i in range(0, mlp_top_c[0]): - self.mlp_top.append(mlp_top_c[i + 1]) + def add_lora_layer(self, peft_config): + return ffc().flexflow_model_add_lora_layer(self.handle, peft_config.handle) - embedding_size_c = ffc().flexflow_dlrm_config_get_embedding_size(self.handle) - self.embedding_size = [] - for i in range(0, embedding_size_c[0]): - self.embedding_size.append(embedding_size_c[i + 1]) + def reset_metrics(self): + """Reset performance metrics. + :returns: None -- no returns. + """ + ffc().flexflow_model_reset_metrics(self.handle) -# ----------------------------------------------------------------------- -# Single DataLoader -# ----------------------------------------------------------------------- + def init_layers(self): + """Initialize layers. + :returns: None -- no returns. + """ + ffc().flexflow_model_init_layers(self.handle) -class SingleDataLoader(object): - __slots__ = ["handle", "_handle"] + def prefetch(self): + ffc().flexflow_model_prefetch(self.handle) - def __init__(self, ffmodel, input, full_input, num_samples, data_type): - assert type(ffmodel) is FFModel, "SingleDataLoader ffmodel is wrong" - assert type(input) is Tensor, "SingleDataLoader input is wrong" - if type(full_input) is Tensor: - self.init_from_tensor(ffmodel, input, full_input, num_samples, data_type) - else: - self.init_from_ptr(ffmodel, input, full_input, num_samples, data_type) - self._handle = ffi.gc(self.handle, ffc().flexflow_single_dataloader_destroy) + def forward(self, seq_length=None): + """Forward propagation of all layers. - def init_from_tensor(self, ffmodel, input, full_input, num_samples, data_type): - assert type(full_input) is Tensor, "SingleDataLoader full_input is wrong" - c_data_type = enum_to_int(DataType, data_type) - self.handle = ffc().flexflow_single_dataloader_create( - ffmodel.handle, input.handle, full_input.handle, num_samples, c_data_type - ) + :returns: None -- no returns. 
+ """ + if seq_length is None: + seq_length = -1 + ffc().flexflow_model_forward(self.handle, seq_length) - def init_from_ptr(self, ffmodel, input, full_input, num_samples, data_type): - # assert type(full_input) is Tensor, "SingleDataLoader full_input is wrong" - c_data_type = enum_to_int(DataType, data_type) - self.handle = ffc().flexflow_single_dataloader_create2( - ffmodel.handle, input.handle, full_input, num_samples, c_data_type - ) + # TODO: seperate compute_metrics from backward + def backward(self, seq_length=None): + """Backward propagation of all layers. - @property - def num_samples(self): - return ffc().flexflow_single_dataloader_get_num_samples(self.handle) + :returns: None -- no returns. + """ + if seq_length is None: + seq_length = -1 + ffc().flexflow_model_backward(self.handle, seq_length) - @num_samples.setter - def num_samples(self, samples): - ffc().flexflow_single_dataloader_set_num_samples(self.handle, samples) + def compute_metrics(self): + """Compute performance metrics. - def next_batch(self, ffmodel): - """Ask the dataloder to load the next batch to the :attr:`batch_tensor`. + :returns: None -- no returns. + """ + ffc().flexflow_model_compute_metrics(self.handle) + + def update(self): + """Update weights and biases of all layers. :returns: None -- no returns. """ - ffc().flowflow_single_dataloader_next_batch(self.handle, ffmodel.handle) + ffc().flexflow_model_update(self.handle) - def reset(self): - """Reset the current position of the dataloder to 0. + def compile(self, optimizer=None, loss_type=None, metrics=None, comp_mode=None): + """Configure the model for trainting. FlexFlow uses lazy initialization, + so the actual creating of all operations (including creating and partitioning + of weight, bias and output tensors) happen during compile. + + :param optimizer: optimizer instance. + :type optimizer: Optimizer + + :param loss_type: Enum of LossType. + Options are LOSS_CATEGORICAL_CROSSENTROPY, LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + LOSS_MEAN_SQUARED_ERROR_AVG_REDUCE and LOSS_MEAN_SQUARED_ERROR_SUM_REDUCE. + :type loss_type: LossType + + :param metrics: List of metrics to be evaluated by the model during training and testing. + Each of this is a Enum of MetricsType. Options are METRICS_ACCURACY, + METRICS_CATEGORICAL_CROSSENTROPY, METRICS_SPARSE_CATEGORICAL_CROSSENTROPY, + METRICS_MEAN_SQUARED_ERROR, METRICS_ROOT_MEAN_SQUARED_ERROR, METRICS_MEAN_ABSOLUTE_ERROR + :type metrics: MetricsType + + :param comp_mode: Enum of CompMode. + Options are COMP_MODE_TRAINING, COMP_MODE_INFERENCE + :type comp_mode: CompMode :returns: None -- no returns. """ - ffc().flexflow_single_dataloader_reset(self.handle) + self.optimizer = optimizer + c_loss_type = enum_to_int(LossType, loss_type) + metrics_int = [] + for metric in metrics: + metrics_int.append(enum_to_int(MetricsType, metric)) + c_metrics = ffi.new("int[]", metrics_int) + if comp_mode == None: + comp_mode = CompMode.TRAINING + c_comp_mode = enum_to_int(CompMode, comp_mode) + ffc().flexflow_model_compile( + self.handle, c_loss_type, c_metrics, len(metrics), c_comp_mode + ) + for ff_tensor, np_tensor in self.attr_tensors.items(): + ff_tensor.set_tensor(self, np_tensor) + print("Compiled ffmodel!") -class RegionNdarray(object): - __slots__ = ["__array_interface__"] + def fit(self, x=None, y=None, batch_size=None, epochs=1): + """Trains the model for a fixed number of epochs (iterations on a dataset). 
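# Illustrative sketch (not part of this patch): the training-mode flow built
# from the compile()/fit() methods documented above. `sgd_opt`, `input_loader`,
# and `label_loader` are hypothetical placeholders; the enum members are the
# ones listed in the compile() docstring.
ffmodel.compile(
    optimizer=sgd_opt,
    loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
    metrics=[MetricsType.METRICS_ACCURACY],
)
ffmodel.fit(x=input_loader, y=label_loader, epochs=1)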
- def __init__(self, shape, data_type, base_ptr, strides, read_only): - # See: https://docs.scipy.org/doc/numpy/reference/arrays.interface.html - if data_type == DataType.DT_HALF: - field_type = " 0: + finetuning_losses = [ + c_finetuning_losses[i] for i in range(num_finetuning_losses[0]) + ] + results = [] + for c_output_text in c_output_texts: + results.append( + GenerationResult( + text=( + ffi.string(c_output_text) if c_output_text != ffi.NULL else None + ), + tokens=[], + finetuning_losses=finetuning_losses, + ) + ) + return results + + def set_position_offset(self, offset): + ffc().flexflow_model_set_position_offset(self.handle, offset) diff --git a/python/flexflow/serve/__init__.py b/python/flexflow/serve/__init__.py index 5af077273d..fd29080a6a 100644 --- a/python/flexflow/serve/__init__.py +++ b/python/flexflow/serve/__init__.py @@ -15,7 +15,16 @@ from typing import Optional from ..type import * from flexflow.core import * -from .serve import LLM, SSM, GenerationConfig, GenerationResult +from .serve import ( + LLM, + SSM, + GenerationConfig, + GenerationResult, + LoraLinearConfig, + PEFTModelID, + Request, + RequestType, +) def __check_positive_int(configs_dict: dict, key: str): @@ -44,6 +53,9 @@ def init( offload_reserve_space_size: Optional[int] = None, use_4bit_quantization: Optional[bool] = None, use_8bit_quantization: Optional[bool] = None, + enable_peft: Optional[bool] = None, + peft_activation_reserve_space_size: Optional[int] = None, + peft_weight_reserve_space_size: Optional[int] = None, profiling: Optional[bool] = None, benchmarking: Optional[bool] = None, inference_debugging: Optional[bool] = None, @@ -69,9 +81,12 @@ def init( - tensor_parallelism_degree: the degree of parallelization in the tensor parallel dimension (using the Megatron technique), defaults to 1 - pipeline_parallelism_degree: the degree of parallelization in the pipeline parallel dimension, defaults to 1 - offload: whether to enable offloading of the weights to CPU, defaults to False - - offload_reserve_space_size: the space (in MB) to reserve on CPU for offloading, default to 1024^2 + - offload_reserve_space_size: the space (in MB) to reserve on CPU for offloading, defaults to 8 GB - use_4bit_quantization: whether to use 4-bit quantization, defaults to False - use_8bit_quantization: whether to use 8-bit quantization, defaults to False + - enable_peft: whether to enable the use of PEFT, defaults to False + - peft_activation_reserve_space_size: the space (in MB) to reserve on GPU for PEFT activations, default to 1 GB + - peft_weight_reserve_space_size: the space (in MB) to reserve on GPU for PEFT weights, default to 1 GB - profiling: whether to enable the FlexFlow profiling mode, defaults to False - benchmarking: whether to run benchmaking only, without loading real weights, defaults to False - inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False @@ -100,12 +115,18 @@ def init( :type pipeline_parallelism_degree: Optional[int], optional :param offload: whether to enable offloading of the weights to CPU, defaults to False :type offload: Optional[bool], optional - :param offload_reserve_space_size: the space (in MB) to reserve on CPU for offloading, default to 1024^2 + :param offload_reserve_space_size: the space (in MB) to reserve on CPU for offloading, defaults to 8 GB :type offload_reserve_space_size: Optional[int], optional :param use_4bit_quantization: whether to use 4-bit quantization, defaults to False :type 
use_4bit_quantization: Optional[bool], optional :param use_8bit_quantization: whether to use 8-bit quantization, defaults to False :type use_8bit_quantization: Optional[bool], optional + :param enable_peft: whether to enable the use of PEFT, defaults to False + :type enable_peft: Optional[bool], optional + :param peft_activation_reserve_space_size: the space (in MB) to reserve on GPU for PEFT activations, default to 1 GB + :type peft_activation_reserve_space_size: Optional[int], optional + :param peft_weight_reserve_space_size: the space (in MB) to reserve on GPU for PEFT weights, default to 1 GB + :type peft_weight_reserve_space_size: Optional[int], optional :param profiling: whether to enable the FlexFlow profiling mode, defaults to False :type profiling: Optional[bool], optional :param benchmarking: whether to run benchmaking only, without loading real weights, defaults to False @@ -135,6 +156,9 @@ def init( offload_reserve_space_size is not None, use_4bit_quantization is not None, use_8bit_quantization is not None, + enable_peft is not None, + peft_activation_reserve_space_size is not None, + peft_weight_reserve_space_size is not None, profiling is not None, benchmarking is not None, inference_debugging is not None, @@ -161,6 +185,9 @@ def init( "offload_reserve_space_size": offload_reserve_space_size, "use_4bit_quantization": use_4bit_quantization, "use_8bit_quantization": use_8bit_quantization, + "enable_peft": enable_peft, + "peft_activation_reserve_space_size": peft_activation_reserve_space_size, + "peft_weight_reserve_space_size": peft_weight_reserve_space_size, "profiling": profiling, "benchmarking": benchmarking, "inference_debugging": inference_debugging, @@ -182,6 +209,8 @@ def init( "tensor_parallelism_degree", "pipeline_parallelism_degree", "offload_reserve_space_size", + "peft_activation_reserve_space_size", + "peft_weight_reserve_space_size", ] for param in positive_int_params: __check_positive_int(configs_dict, param) @@ -200,11 +229,17 @@ def init( if configs_dict.get("offload", None) is None: configs_dict["offload"] = False if configs_dict.get("offload_reserve_space_size", None) is None: - configs_dict["offload_reserve_space_size"] = 1024**2 + configs_dict["offload_reserve_space_size"] = 8 * 1024**3 if configs_dict.get("use_4bit_quantization", None) is None: configs_dict["use_4bit_quantization"] = False if configs_dict.get("use_8bit_quantization", None) is None: configs_dict["use_8bit_quantization"] = False + if configs_dict.get("enable_peft", None) is None: + configs_dict["enable_peft"] = False + if configs_dict.get("peft_activation_reserve_space_size", None) is None: + configs_dict["peft_activation_reserve_space_size"] = 8 * 1024**3 + if configs_dict.get("peft_weight_reserve_space_size", None) is None: + configs_dict["peft_weight_reserve_space_size"] = 1024**3 if configs_dict.get("profiling", None) is None: configs_dict["profiling"] = False if configs_dict.get("benchmarking", None) is None: diff --git a/python/flexflow/serve/models/base.py b/python/flexflow/serve/models/base.py index e7f3914037..17bb894250 100644 --- a/python/flexflow/serve/models/base.py +++ b/python/flexflow/serve/models/base.py @@ -32,5 +32,8 @@ def __init__( def build_model(self): assert False, "Not implemented yet" + def convert_hf_weight_name(name): + assert False, "Not implemented yet" + def convert_hf_model(model, dst_folder): assert False, "Not implemented yet" diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index 7a55da26ef..0e8fbcbd7d 100644 --- 
a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -124,7 +124,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.falcon_config.layer_norm_epsilon, - name=f"layers_{i}_input_layernorm", + name=f"layers.{i}.input_layernorm", ) else: token, att_norm = ffmodel.residual_layer_norm( @@ -135,7 +135,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.falcon_config.layer_norm_epsilon, - name=f"layers_{i}_input_layernorm", + name=f"layers.{i}.input_layernorm", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -153,7 +153,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attention", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: mha = ffmodel.inc_multiquery_self_attention_verify( @@ -170,7 +170,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attention", ) elif self.mode == InferenceMode.INC_DECODING_MODE: mha = ffmodel.inc_multiquery_self_attention( @@ -187,7 +187,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attention", ) else: assert False @@ -197,7 +197,7 @@ def build_model(self, max_tokens_per_batch): self.falcon_config.hidden_size * 4, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_mlp_dense_h_to_4h", + name=f"layers.{i}.mlp.dense_h_to_4h", ) dense_h_to_4h = ffmodel.gelu(dense_h_to_4h) mlp_output = ffmodel.dense( @@ -205,7 +205,7 @@ def build_model(self, max_tokens_per_batch): self.falcon_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_mlp_dense_4h_to_h", + name=f"layers.{i}.mlp.dense_4h_to_h", ) _, ln_f = ffmodel.residual_layer_norm( @@ -239,10 +239,18 @@ def build_model(self, max_tokens_per_batch): output = ffmodel.sampling(softmax, self.generation_config.topp) else: # output = ffmodel.arg_top_k(lm_head, 1, False) - output = ffmodel.argmax(lm_head, False) + softmax = ffmodel.softmax(lm_head, -1) + output = ffmodel.argmax(softmax, False) self.ffmodel = ffmodel + # TODO: finish this + def convert_hf_weight_name(name): + return (name.replace("transformer.h.", "layers.") + .replace("transformer.", "") + .replace("self_attention.dense", "self_attention.o_proj") + ) + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) n_head = ( @@ -251,17 +259,12 @@ def convert_hf_model(model, dst_folder): else model.config.num_attention_heads ) for name, params in model.named_parameters(): - name = ( - name.replace(".", "_") - .replace("transformer_h_", "layers_") - .replace("transformer_", "") - .replace("self_attention_dense", "attention_wo") - ) + name = FlexFlowFalcon.convert_hf_weight_name(name) # Split Q,K,V attention weights - if "self_attention_query_key_value" in name: - name_q = name.replace("self_attention_query_key_value", "attention_wq") - name_k = name.replace("self_attention_query_key_value", "attention_wk") - name_v = name.replace("self_attention_query_key_value", "attention_wv") + if "self_attention.query_key_value" in name: + name_q = name.replace("self_attention.query_key_value", "self_attention.q_proj") + name_k = name.replace("self_attention.query_key_value", "self_attention.k_proj") + name_v = 
name.replace("self_attention.query_key_value", "self_attention.v_proj") q, k, v = torch.split( params, [ @@ -278,5 +281,5 @@ def convert_hf_model(model, dst_folder): params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) # LM head weight model.lm_head.weight.detach().cpu().numpy().tofile( - os.path.join(dst_folder, "lm_head_weight") + os.path.join(dst_folder, "lm_head.weight") ) diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index 6b33030f62..96f0258572 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -62,7 +62,7 @@ def __init__( # self.llama_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath - self.maxint = 2**31 - 1 + self.maxint = 2 ** 31 - 1 max_verify_tokens_per_batch = ( max_tokens_per_batch + self.llama_config.max_spec_tree_token_num ) @@ -106,7 +106,7 @@ def build_model(self, max_tokens_per_batch): self.data_type, None, embed_init, - name="tok_embeddings", + name="embed_tokens", ) for i in range(self.llama_config.num_hidden_layers): @@ -117,7 +117,7 @@ def build_model(self, max_tokens_per_batch): token, self.llama_config.rms_norm_eps, self.llama_config.hidden_size, - name=f"layers_{i}_attention_norm", + name=f"layers.{i}.input_layernorm", ) else: token, attn_norm = ffmodel.residual_rms_norm( @@ -125,7 +125,7 @@ def build_model(self, max_tokens_per_batch): w2, self.llama_config.rms_norm_eps, self.llama_config.hidden_size, - name=f"layers_{i}_attention_norm", + name=f"layers.{i}.input_layernorm", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -145,7 +145,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: mha = ffmodel.inc_multiquery_self_attention_verify( @@ -164,7 +164,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.INC_DECODING_MODE: mha = ffmodel.inc_multiquery_self_attention( @@ -183,7 +183,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) else: assert False @@ -193,21 +193,21 @@ def build_model(self, max_tokens_per_batch): mha, self.llama_config.rms_norm_eps, self.llama_config.hidden_size, - name=f"layers_{i}_ffn_norm", + name=f"layers.{i}.post_attention_layernorm", ) w1 = ffmodel.dense( ff_norm, self.llama_config.intermediate_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_feed_forward_w1", + name=f"layers.{i}.mlp.gate_proj", ) w3 = ffmodel.dense( ff_norm, self.llama_config.intermediate_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_feed_forward_w3", + name=f"layers.{i}.mlp.up_proj", ) multi = ffmodel.sigmoid_silu_multi(w1, w3) w2 = ffmodel.dense( @@ -215,7 +215,7 @@ def build_model(self, max_tokens_per_batch): self.llama_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_feed_forward_w2", + name=f"layers.{i}.mlp.down_proj", ) _, token = ffmodel.residual_rms_norm( @@ -230,7 +230,7 @@ def build_model(self, max_tokens_per_batch): self.llama_config.vocab_size, ActiMode.AC_MODE_NONE, False, - name="output", + 
name="lm_head", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -246,28 +246,16 @@ def build_model(self, max_tokens_per_batch): output = ffmodel.sampling(softmax, self.generation_config.topp) else: # output = ffmodel.arg_top_k(dense, 1, False) - output = ffmodel.argmax(dense, False) + softmax = ffmodel.softmax(dense, -1) + output = ffmodel.argmax(softmax, False) self.ffmodel = ffmodel + def convert_hf_weight_name(name): + return name.replace("model.", "") + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): - name = ( - name.replace(".", "_") - .replace("self_attn", "attention") - .replace("q_proj", "wq") - .replace("k_proj", "wk") - .replace("v_proj", "wv") - .replace("o_proj", "wo") - .replace("mlp", "feed_forward") - .replace("gate_proj", "w1") - .replace("down_proj", "w2") - .replace("up_proj", "w3") - .replace("input_layernorm", "attention_norm") - .replace("post_attention_layernorm", "ffn_norm") - .replace("embed_tokens", "tok_embeddings") - .replace("lm_head", "output") - .replace("model_", "") - ) + name = FlexFlowLLAMA.convert_hf_weight_name(name) params.detach().cpu().numpy().tofile(f"{dst_folder}/{name}") diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py index 92867fd498..b350ae106d 100644 --- a/python/flexflow/serve/models/mpt.py +++ b/python/flexflow/serve/models/mpt.py @@ -97,7 +97,7 @@ def build_model(self, max_tokens_per_batch): self.data_type, None, embed_init, - name="transformer_wte", + name="wte", ) axes = [ @@ -114,7 +114,7 @@ def build_model(self, max_tokens_per_batch): True, 1e-05, False, - name=f"layers_{i}_norm_1", + name=f"layers.{i}.norm_1", ) else: hidden_states, layernorm_output = ffmodel.residual_layer_norm( @@ -126,7 +126,7 @@ def build_model(self, max_tokens_per_batch): True, 1e-05, False, - name=f"layers_{i}_norm_1", + name=f"layers.{i}.norm_1", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -148,7 +148,7 @@ def build_model(self, max_tokens_per_batch): ** (-0.5), # scaling_factor False, # qk_prod_scaling True, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.attn", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: attn_outputs = ffmodel.inc_multihead_self_attention_verify( @@ -169,7 +169,7 @@ def build_model(self, max_tokens_per_batch): ** (-0.5), # scaling_factor False, # qk_prod_scaling True, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.attn", ) elif self.mode == InferenceMode.INC_DECODING_MODE: attn_outputs = ffmodel.inc_multihead_self_attention( @@ -190,7 +190,7 @@ def build_model(self, max_tokens_per_batch): ** (-0.5), # scaling_factor False, # qk_prod_scaling True, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.attn", ) else: assert False @@ -204,7 +204,7 @@ def build_model(self, max_tokens_per_batch): True, 1e-05, False, - name=f"layers_{i}_norm_2", + name=f"layers.{i}.norm_2", ) # mlp layernorm_output = ffmodel.dense( @@ -212,7 +212,7 @@ def build_model(self, max_tokens_per_batch): 4 * self.mpt_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_ffn_up_proj", + name=f"layers.{i}.ffn.up_proj", ) layernorm_output = ffmodel.gelu(layernorm_output) intermediate_output = ffmodel.dense( @@ -220,7 +220,7 @@ def build_model(self, max_tokens_per_batch): self.mpt_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_ffn_down_proj", + name=f"layers.{i}.ffn.down_proj", ) _, all_final_norm = ffmodel.residual_layer_norm( @@ -232,7 +232,7 @@ 
def build_model(self, max_tokens_per_batch): True, 1e-05, False, - name=f"transformer_norm_f", + name=f"norm_f", ) lm_head = ffmodel.dense( all_final_norm, @@ -249,18 +249,27 @@ def build_model(self, max_tokens_per_batch): softmax = ffmodel.softmax(dense, -1) output = ffmodel.sampling(softmax, self.generation_config.topp) else: - output = ffmodel.argmax(lm_head, False) + softmax = ffmodel.softmax(lm_head, -1) + output = ffmodel.argmax(softmax, False) self.ffmodel = ffmodel + # TODO: finish this + def convert_hf_weight_name(name): + return ( + name.replace("transformer.blocks.", "layers.") + .replace("transformer.", "") + .replace("attn.out_proj", "attn.o_proj") + ) + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): - name = name.replace("transformer.blocks.", "layers.").replace(".", "_") + name = FlexFlowMPT.convert_hf_weight_name(name) if "Wqkv" in name: - name_q = name.replace("attn_Wqkv", "attention_wq") - name_k = name.replace("attn_Wqkv", "attention_wk") - name_v = name.replace("attn_Wqkv", "attention_wv") + name_q = name.replace("attn.Wqkv", "attn.q_proj") + name_k = name.replace("attn.Wqkv", "attn.k_proj") + name_v = name.replace("attn.Wqkv", "attn.v_proj") q, k, v = torch.split( params, [ @@ -273,13 +282,10 @@ def convert_hf_model(model, dst_folder): q.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_q)) k.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_k)) v.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_v)) - elif "out_proj" in name: - name = name.replace("attn_out_proj", "attention_wo") - params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) else: params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) shutil.copy( - os.path.join(dst_folder, "transformer_wte_weight"), - os.path.join(dst_folder, "lm_head_weight"), + os.path.join(dst_folder, "wte.weight"), + os.path.join(dst_folder, "lm_head.weight"), ) diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index b715f5f35e..02668abf59 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -139,7 +139,7 @@ def build_model(self, max_tokens_per_batch): axes, self.opt_config.layer_norm_elementwise_affine, 1e-05, - name=f"layers_{i}_attention_layer_norm", + name=f"layers.{i}.self_attn_layer_norm", ) else: hidden_states = ffmodel.add(token, positional_embedding) @@ -163,7 +163,7 @@ def build_model(self, max_tokens_per_batch): (self.opt_config.hidden_size / self.opt_config.num_attention_heads) ** (-0.5), # scaling_factor False, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: mha = ffmodel.inc_multihead_self_attention_verify( @@ -183,7 +183,7 @@ def build_model(self, max_tokens_per_batch): (self.opt_config.hidden_size / self.opt_config.num_attention_heads) ** (-0.5), # scaling_factor False, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.INC_DECODING_MODE: mha = ffmodel.inc_multihead_self_attention( @@ -203,7 +203,7 @@ def build_model(self, max_tokens_per_batch): (self.opt_config.hidden_size / self.opt_config.num_attention_heads) ** (-0.5), # scaling_factor False, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) else: assert False @@ -215,7 +215,7 @@ def build_model(self, max_tokens_per_batch): axes, 
self.opt_config.layer_norm_elementwise_affine, 1e-05, - name=f"layers_{i}_add_bias_residual_layer_norm", + name=f"layers.{i}.add_bias_residual_layer_norm", ) if not self.opt_config.do_layer_norm_before: @@ -226,14 +226,14 @@ def build_model(self, max_tokens_per_batch): self.opt_config.ffn_dim, ActiMode.AC_MODE_RELU, True, - name=f"layers_{i}_fc1", + name=f"layers.{i}.fc1", ) fc2 = ffmodel.dense( fc1, self.opt_config.hidden_size, ActiMode.AC_MODE_NONE, True, - name=f"layers_{i}_fc2", + name=f"layers.{i}.fc2", ) if not self.opt_config.do_layer_norm_before: @@ -245,7 +245,7 @@ def build_model(self, max_tokens_per_batch): axes, self.opt_config.layer_norm_elementwise_affine, 1e-05, - name=f"layers_{i}_final_layer_norm", + name=f"layers.{i}.final_layer_norm", ) _, all_final_norm = ffmodel.residual_layer_norm( @@ -263,7 +263,7 @@ def build_model(self, max_tokens_per_batch): self.opt_config.vocab_size, ActiMode.AC_MODE_NONE, False, - name="embed_tokens_weight_lm_head", + name="lm_head", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -279,30 +279,29 @@ def build_model(self, max_tokens_per_batch): output = ffmodel.sampling(softmax, self.generation_config.topp) else: # output = ffmodel.arg_top_k(lm_head, 1, False) - output = ffmodel.argmax(lm_head, False) + softmax = ffmodel.softmax(lm_head, -1) + output = ffmodel.argmax(softmax, False) self.ffmodel = ffmodel + def convert_hf_weight_name(name): + return ( + name.replace("decoder.", "") + .replace("model.", "") + .replace("self_attn.out_proj", "self_attn.o_proj") + .replace("self_attn.o_proj.bias", "add_bias_residual_layer_norm.attn_bias") + .replace( + ".final_layer_norm", ".add_bias_residual_layer_norm" + ) # important to use the leading "_" to avoid matching the last LayerNorm + ) + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): - name = ( - name.replace(".", "_") - .replace("decoder_", "") - .replace("model_", "") - .replace("self_attn", "attention") - .replace("q_proj", "wq") - .replace("k_proj", "wk") - .replace("v_proj", "wv") - .replace("out_proj", "wo") - .replace("attention_wo_bias", "add_bias_residual_layer_norm_attn_bias") - .replace( - "_final_layer_norm", "_add_bias_residual_layer_norm" - ) # important to use the leading "_" to avoid matching the last LayerNorm - ) + name = FlexFlowOPT.convert_hf_weight_name(name) params.detach().cpu().numpy().tofile(f"{dst_folder}/{name}") # copy embedding weights shutil.copy( - os.path.join(dst_folder, "embed_tokens_weight"), - os.path.join(dst_folder, "embed_tokens_weight_lm_head"), + os.path.join(dst_folder, "embed_tokens.weight"), + os.path.join(dst_folder, "lm_head.weight"), ) diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py index 37edaa4c40..2d4471201f 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -111,7 +111,7 @@ def build_model(self, max_tokens_per_batch): self.data_type, None, embed_init, - name="transformer_wte", + name="wte", ) positional_embedding = ffmodel.embedding( position_tensor, @@ -121,7 +121,7 @@ def build_model(self, max_tokens_per_batch): self.data_type, None, embed_init, - name="transformer_wpe", + name="wpe", ) axes = [ @@ -139,7 +139,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.starcoder_config.layer_norm_epsilon, - name=f"layers_{i}_ln_1", + name=f"layers.{i}.ln_1", ) assert self.mode == InferenceMode.INC_DECODING_MODE @@ -159,7 +159,7 @@ def build_model(self, 
max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer False, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.attn.c_attn", ) residual, l2_norm = ffmodel.residual_layer_norm( @@ -171,7 +171,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.starcoder_config.layer_norm_epsilon, - name=f"layers_{i}_ln_2", + name=f"layers.{i}.ln_2", ) # mlp @@ -181,7 +181,7 @@ def build_model(self, max_tokens_per_batch): self.starcoder_config.intermediate_size, ActiMode.AC_MODE_NONE, True, - name=f"layers_{i}_mlp_c_fc", + name=f"layers.{i}.mlp.c_fc", ) activation = ffmodel.gelu(c_fc, False) c_proj = ffmodel.dense( @@ -189,7 +189,7 @@ def build_model(self, max_tokens_per_batch): self.starcoder_config.hidden_size, ActiMode.AC_MODE_NONE, True, - name=f"layers_{i}_mlp_c_proj", + name=f"layers.{i}.mlp.c_proj", ) _, ln_f = ffmodel.residual_layer_norm( @@ -200,7 +200,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.starcoder_config.layer_norm_epsilon, - name=f"transformer_ln_f", + name=f"ln_f", ) lm_head = ffmodel.dense( ln_f, @@ -217,18 +217,19 @@ def build_model(self, max_tokens_per_batch): softmax = ffmodel.softmax(dense, -1) output = ffmodel.sampling(softmax, self.generation_config.topp) else: - output = ffmodel.argmax(lm_head, False) + softmax = ffmodel.softmax(lm_head, -1) + output = ffmodel.argmax(softmax, False) self.ffmodel = ffmodel def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): - name = name.replace("transformer.h", "layers").replace(".", "_") - if "c_attn_weight" in name: - name_q = name.replace("attn_c_attn", "attention_wq") - name_k = name.replace("attn_c_attn", "attention_wk") - name_v = name.replace("attn_c_attn", "attention_wv") + name = name.replace("transformer.h", "layers").replace("transformer.", "") + if "attn.c_attn.weight" in name: + name_q = name.replace("attn.c_attn", "attn.c_attn.q_proj") + name_k = name.replace("attn.c_attn", "attn.c_attn.k_proj") + name_v = name.replace("attn.c_attn", "attn.c_attn.v_proj") q, k, v = torch.split( params, [ @@ -241,10 +242,10 @@ def convert_hf_model(model, dst_folder): q.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_q)) k.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_k)) v.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_v)) - elif "c_attn_bias" in name: - name_q = name.replace("attn_c_attn", "attention_wq") - name_k = name.replace("attn_c_attn", "attention_wk") - name_v = name.replace("attn_c_attn", "attention_wv") + elif "attn.c_attn.bias" in name: + name_q = name.replace("attn.c_attn", "attn.c_attn.q_proj") + name_k = name.replace("attn.c_attn", "attn.c_attn.k_proj") + name_v = name.replace("attn.c_attn", "attn.c_attn.v_proj") q, k, v = torch.split( params, [ @@ -257,14 +258,14 @@ def convert_hf_model(model, dst_folder): q.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_q)) k.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_k)) v.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_v)) - elif "c_proj_bias" in name: - name = name.replace("attn_c_proj", "attention_wo") + elif "attn.c_proj.bias" in name: + name = name.replace("attn.c_proj", "attn.c_attn.o_proj") params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) - elif "c_proj_weight" in name: - name = name.replace("attn_c_proj", "attention_wo") + elif "attn.c_proj.weight" in name: + name = name.replace("attn.c_proj", "attn.c_attn.o_proj") 
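The converters above repeatedly split a fused attention projection into separate q/k/v files under the new dotted names. A self-contained sketch of that step, assuming equal-sized q/k/v blocks (standard multi-head attention); multi-query models use unequal split sizes, as the surrounding code shows, and the file name and sizes below are illustrative only.

import os
import torch

def split_fused_qkv(params, name, dst_folder, hidden_size):
    # Split a fused (3*hidden, hidden) weight along dim 0 and write each
    # projection using the dotted naming scheme used by the converters above.
    os.makedirs(dst_folder, exist_ok=True)
    q, k, v = torch.split(params, [hidden_size, hidden_size, hidden_size], 0)
    for proj, tensor in (("q_proj", q), ("k_proj", k), ("v_proj", v)):
        out_name = name.replace("attn.c_attn", f"attn.c_attn.{proj}")
        tensor.detach().cpu().numpy().tofile(os.path.join(dst_folder, out_name))

# e.g. split_fused_qkv(w, "layers.0.attn.c_attn.weight", "/tmp/converted-weights", 4096)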
params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) else: params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) model.lm_head.weight.detach().cpu().numpy().tofile( - os.path.join(dst_folder, "lm_head_weight") + os.path.join(dst_folder, "lm_head.weight") ) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index ac622b3337..132c50995b 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -28,44 +28,38 @@ ) from flexflow.core import * from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer +from peft import PeftModel, PeftConfig, LoraConfig from huggingface_hub import HfApi -import sys, torch, shutil, hashlib +import torch, shutil, hashlib, json, gc from typing import Union, List -class GenerationConfig: - """A class to store the sampling configs.""" - - def __init__( - self, - do_sample: bool = False, - temperature: float = 0.9, - topp: float = 0.8, - topk: int = 1, - ): - """Initialize the sampling configs - - :param do_sample: Whether to perform sampling, or use greedy decoding, defaults to False - :type do_sample: bool, optional - :param temperature: The temperature setting, defaults to 0.9 - :type temperature: float, optional - :param topp: The top probabilities (top-p) setting, defaults to 0.8 - :type topp: float, optional - :param topk: The top-k setting, defaults to 1 - :type topk: int, optional - """ - self.do_sample = do_sample - self.temperature = temperature - self.topp = topp - self.topk = topk - - -class GenerationResult: - """A class to store the output of a generation request.""" +class _SupportedModels: + def __init__(self,): + self.supported_models = { + "LlamaForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig), + "LLaMAForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig), + "OPTForCausalLM": (ModelType.OPT, FlexFlowOPT, OPTConfig), + "RWForCausalLM": (ModelType.FALCON, FlexFlowFalcon, FalconConfig), + "FalconForCausalLM": (ModelType.FALCON, FlexFlowFalcon, FalconConfig), + "GPTBigCodeForCausalLM": ( + ModelType.STARCODER, + FlexFlowSTARCODER, + STARCODERConfig, + ), + "MPTForCausalLM": (ModelType.MPT, FlexFlowMPT, MPTConfig), + } - def __init__(self, text: str = None, tokens: list = None): - self.output_text = text - self.output_tokens = tokens + def get_ff_model_type(self, hf_config): + architectures = getattr(hf_config, "architectures", []) + ff_arch = None + if next(iter(architectures), None) is not None: + ff_arch = self.supported_models.get(architectures[0]) + if ff_arch is None: + raise ValueError( + f"Huggingface model of type {architectures} is not yet supported by FlexFlow" + ) + return ff_arch class LLM: @@ -92,68 +86,117 @@ def __init__( :param output_file: Path to the output file. 
If left blank, the output will not be written to file, defaults to "" :type output_file: str, optional """ - self.supported_models = { - "LlamaForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig), - "LLaMAForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig), - "OPTForCausalLM": (ModelType.OPT, FlexFlowOPT, OPTConfig), - "RWForCausalLM": (ModelType.FALCON, FlexFlowFalcon, FalconConfig), - "FalconForCausalLM": (ModelType.FALCON, FlexFlowFalcon, FalconConfig), - "GPTBigCodeForCausalLM": ( - ModelType.STARCODER, - FlexFlowSTARCODER, - STARCODERConfig, - ), - "MPTForCausalLM": (ModelType.MPT, FlexFlowMPT, MPTConfig), - } + self.supported_models = _SupportedModels() self.hf_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) self.model_name = self.hf_config._name_or_path ( self.model_type, self.model_class, self.config_class, - ) = self.__get_ff_model_type() + ) = self.supported_models.get_ff_model_type(self.hf_config) self.data_type = data_type assert self.data_type == DataType.DT_HALF or self.data_type == DataType.DT_FLOAT self.cache_path = cache_path if len(cache_path) > 0 else "~/.cache/flexflow" self.refresh_cache = refresh_cache self.output_file = output_file self.rm = None + self.pefts = {} def __del__(self): # Stop the background server before deleting the object if type(self) == LLM and self.rm is not None: self.rm.stop_server() - def __get_ff_model_type(self): - architectures = getattr(self.hf_config, "architectures", []) - ff_arch = None - if next(iter(architectures), None) is not None: - ff_arch = self.supported_models.get(architectures[0]) - if ff_arch is None: - print( - f"Huggingface model of type {architectures} is not yet supported by FlexFlow" + def add_peft(self, lora_config: LoraLinearConfig): + """Add a PEFT adapter to the LLM""" + if lora_config is None: + raise ValueError("lora_config cannot be None") + if len(lora_config.peft_model_id or "") == 0: + raise ValueError("PEFT model id cannot be empty") + # Inference (trainable=False): LoRA model should already exist in huggingface. 
Any changes of parameters from original model are ignored + # Training (trainable=True): Either an existing model (init_lora_weights=False) or a new one (init_lora_weights=True) + + if lora_config.trainable == False or not lora_config.init_lora_weights: + peft_config = PeftConfig.from_pretrained(lora_config.peft_model_id) + else: + peft_config = LoraConfig( + peft_type="LORA", + base_model_name_or_path=self.model_name, + r=lora_config.rank, + target_modules=lora_config.target_modules, + lora_alpha=lora_config.lora_alpha, + lora_dropout=lora_config.lora_dropout, + init_lora_weights=lora_config.init_lora_weights, ) - sys.exit(1) - return ff_arch + if peft_config.peft_type != "LORA": + raise RuntimeError( + f"PEFT type {peft_config.peft_type} not yet supported in FlexFlow" + ) + if "base_model_name_or_path" not in peft_config.to_dict(): + raise ValueError( + f"PEFT model {lora_config.peft_model_id} does not have an associated base model" + ) + if peft_config.base_model_name_or_path != self.model_name: + raise RuntimeError( + f"Attempting to add PEFT with base model name {peft_config.base_model_name_or_path} to LLM {self.model_name}" + ) + + self.pefts[lora_config] = { + "peft_config": peft_config, + "peft_type": peft_config.peft_type, + } + + def get_ff_peft_id(self, lora_config: LoraLinearConfig) -> PEFTModelID: + if lora_config is None: + raise ValueError("lora_config cannot be None") + if len(lora_config.peft_model_id or "") == 0: + raise ValueError("PEFT model id cannot be empty") + if lora_config not in self.pefts: + raise ValueError( + f"PEFT {lora_config} not registered with LLM {self.model_name}" + ) + if "ff_peft_model_id" not in self.pefts[lora_config]: + raise RuntimeError( + f"Attempting to run PEFT {lora_config} before compiling LLM {self.model_name}" + ) + + return self.pefts[lora_config]["ff_peft_model_id"] def download_hf_config(self): """Save the HuggingFace model configs to a json file. 
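The two branches above mirror the two ways a LoRA configuration can originate. A minimal sketch using the peft library directly; the adapter id is a hypothetical placeholder, so from_pretrained would only succeed if such a repository actually exists.

from peft import LoraConfig, PeftConfig

adapter_id = "someuser/llama-2-7b-lora"  # hypothetical adapter repository

# Case 1: reuse an adapter that already exists (inference, or resuming training).
existing_config = PeftConfig.from_pretrained(adapter_id)

# Case 2: describe a brand-new LoRA adapter to be trained from scratch.
new_config = LoraConfig(
    base_model_name_or_path="meta-llama/Llama-2-7b-hf",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["down_proj"],
    init_lora_weights=True,
)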
Useful mainly to run the C++ inference code.""" - self.config_dir = os.path.join( + config_dir = os.path.join( os.path.expanduser(self.cache_path), "configs", self.model_name.lower() ) - self.config_path = os.path.join(self.config_dir, "config.json") - os.makedirs(self.config_dir, exist_ok=True) - print(f"Creating directory {self.config_dir} (if it doesn't exist)...") - print(f"Saving {self.model_name} configs to file {self.config_path}...") - self.hf_config.to_json_file(self.config_path) + config_path = os.path.join(config_dir, "config.json") + os.makedirs(config_dir, exist_ok=True) + print(f"Creating directory {config_dir} (if it doesn't exist)...") + print(f"Saving {self.model_name} configs to file {config_path}...") + self.hf_config.to_json_file(config_path) + + # Save PEFT configs if the LLM has any registered PEFTs + for ff_peft_config, peft_dict in self.pefts.items(): + peft_config = peft_dict["peft_config"] + peft_model_id = ff_peft_config.peft_model_id + peft_config_dir = os.path.join( + os.path.expanduser(self.cache_path), "configs", peft_model_id.lower() + ) + os.makedirs(peft_config_dir, exist_ok=True) + peft_config_path = os.path.join(peft_config_dir, "config.json") + print(f"Saving {peft_model_id} configs to file {peft_config_path}...") + with open(peft_config_path, "w") as json_file: + + class SetEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, set): + return list(obj) + return super().default(obj) - def __get_revision_hashes(self, model_name: str, weights: bool): + json.dump(peft_config.to_dict(), json_file, indent=2, cls=SetEncoder) + + def __get_revision_hashes(self, model_name: str, folder: str): ff_revision = None - ff_revision_file = ( - os.path.join(self.weights_path, "rev_sha.txt") - if weights - else os.path.join(self.tokenizer_path, "rev_sha.txt") - ) + ff_revision_file = os.path.join(folder, "rev_sha.txt") + if os.path.exists(ff_revision_file): ff_revision = "".join(open(ff_revision_file).read().split()) @@ -173,65 +216,109 @@ def __get_revision_hashes(self, model_name: str, weights: bool): def download_hf_weights_if_needed(self): """Check in the folder specified by the cache_path whether the LLM's model weights are available and up to date. If not, or if the refresh_cache parameter is set to True, download new weights. + + If any PEFT adapter is registered, perform the same operation for PEFT. """ - if self.data_type == DataType.DT_HALF: - torch.set_default_tensor_type(torch.HalfTensor) - elif self.data_type == DataType.DT_FLOAT: - torch.set_default_tensor_type(torch.FloatTensor) - else: - assert False, "Data type not yet supported -- cannot download weights!" - # Use local cache, or download new version - self.weights_path = os.path.join( - os.path.expanduser(self.cache_path), - "weights", - self.model_name.lower(), - ( - "full-precision" - if self.data_type == DataType.DT_FLOAT - else "half-precision" - ), - ) - if self.refresh_cache: - print( - f"Refreshing weights in cache for model {self.model_name} at path {self.weights_path} ..." 
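The caching scheme in this file keys each model's weights by name and precision and tracks freshness with a rev_sha.txt marker. A self-contained sketch of that layout and staleness check; the cache path and model name in the comment are illustrative.

import os

def weights_cache_dir(cache_path, model_name, full_precision):
    # One folder per model and precision, e.g.
    # ~/.cache/flexflow/weights/meta-llama/llama-2-7b-hf/half-precision
    precision = "full-precision" if full_precision else "half-precision"
    return os.path.join(
        os.path.expanduser(cache_path), "weights", model_name.lower(), precision
    )

def cache_is_stale(folder, latest_revision):
    # The cached copy is stale when the stored hash is missing or differs
    # from the latest revision reported upstream.
    rev_file = os.path.join(folder, "rev_sha.txt")
    if not os.path.exists(rev_file):
        return True
    with open(rev_file) as f:
        return "".join(f.read().split()) != latest_revision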
+ def get_weights_path(model_name): + return os.path.join( + os.path.expanduser(self.cache_path), + "weights", + model_name.lower(), + ( + "full-precision" + if self.data_type == DataType.DT_FLOAT + else "half-precision" + ), ) - if os.path.exists(self.weights_path): - shutil.rmtree(self.weights_path) - os.makedirs(self.weights_path, exist_ok=True) - print(f"Creating directory {self.weights_path} (if it doesn't exist)...") - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - self.model_name, weights=True - ) - - # Download if needed - if ff_revision != latest_revision: - if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): - # Local model + def refresh_cache_if_needed(model_name): + weights_path = get_weights_path(model_name) + if self.refresh_cache: print( - f"'{self.model_name}' model weights not found in cache or outdated. Downloading from huggingface.co ..." + f"Refreshing weights in cache for model {model_name} at path {weights_path} ..." ) - else: - # Remote model + if os.path.exists(weights_path): + shutil.rmtree(weights_path) + os.makedirs(weights_path, exist_ok=True) + + def get_hf_llm(model_name): + return AutoModelForCausalLM.from_pretrained( + model_name, + trust_remote_code=True, + torch_dtype=( + torch.float32 + if self.data_type == DataType.DT_FLOAT + else torch.float16 + ), + ) + + def download_llm_weights(): + refresh_cache_if_needed(self.model_name) + ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( + self.model_name, self.weights_path + ) + if ff_revision != latest_revision: print( - f"'{self.model_name}' local model weights were updated! Converting new weights now..." + f"'{self.model_name}' local model weights need updating! Downloading/converting new weights now..." ) - # Download model from HuggingFace, or load it from the local folder - hf_model = AutoModelForCausalLM.from_pretrained( - self.model_name, trust_remote_code=True - ) - # Print log message to notify user download of model has finished - if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): - print("Done downloading HF weights. 
Converting them now...") - # Convert the model to FlexFlow format - self.model_class.convert_hf_model(hf_model, self.weights_path) - # Save new revision hash to file - with open(ff_revision_file, "w+") as f: - f.write(latest_revision) - print("Done converting the weights...") - else: - print(f"Loading '{self.model_name}' model weights from the cache...") + hf_model = get_hf_llm(self.model_name) + # Convert the model to FlexFlow format + self.model_class.convert_hf_model(hf_model, self.weights_path) + # Save new revision hash to file + with open(ff_revision_file, "w+") as f: + f.write(latest_revision) + print(f"Done converting the weights for model {self.model_name}") + # Deallocate hf model + del hf_model + gc.collect() + torch.cuda.empty_cache() + + def convert_peft_model(hf_peft_model, peft_type, weights_path): + for name, params in hf_peft_model.named_parameters(): + if peft_type.lower() in name: + name = name.replace("base_model.model.model.", "").replace( + ".default", "" + ) + name = self.model_class.convert_hf_weight_name(name) + params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") + + def download_peft_weights(): + for ff_peft_config, peft_dict in self.pefts.items(): + if not ff_peft_config.init_lora_weights: + peft_config = peft_dict["peft_config"] + peft_type = peft_dict["peft_type"] + peft_model_id = ff_peft_config.peft_model_id + + weights_path = get_weights_path(peft_model_id) + refresh_cache_if_needed(peft_model_id) + ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( + peft_model_id, weights_path + ) + + if ff_revision != latest_revision: + print( + f"'{peft_model_id}' local model weights need updating! Downloading/converting new weights now..." + ) + hf_model = get_hf_llm(peft_model_id) + hf_peft_model = PeftModel.from_pretrained( + hf_model, peft_model_id, config=peft_config + ) + # Convert the model to FlexFlow format + convert_peft_model(hf_peft_model, peft_type, weights_path) + # Save new revision hash to file + with open(ff_revision_file, "w+") as f: + f.write(latest_revision) + print(f"Done converting the weights for model {peft_model_id}") + # Deallocate hf model + del hf_peft_model + del hf_model + gc.collect() + torch.cuda.empty_cache() + + self.weights_path = get_weights_path(self.model_name) + download_llm_weights() + download_peft_weights() def download_hf_tokenizer_if_needed(self): """Check in the folder specified by the cache_path whether the LLM's tokenizer files are available and up to date. @@ -241,13 +328,11 @@ def download_hf_tokenizer_if_needed(self): # Use local cache, or download new version self.tokenizer_path = os.path.join( - os.path.expanduser(self.cache_path), - "tokenizers", - self.model_name.lower(), + os.path.expanduser(self.cache_path), "tokenizers", self.model_name.lower() ) if self.refresh_cache: print( - f"Discarding cached tokenizer files (if they exist) for model {self.model_name}..." + f"Refreshing cached tokenizer for model {self.model_name} at path {self.tokenizer_path} ..." 
) if os.path.exists(self.tokenizer_path): shutil.rmtree(self.tokenizer_path) @@ -257,46 +342,29 @@ def download_hf_tokenizer_if_needed(self): # Get local revision SHA, check if it matches latest one on huggingface ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - self.model_name, weights=False + self.model_name, self.tokenizer_path ) if ff_revision != latest_revision: - if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): - # Local model - print( - f"'{self.model_name}' tokenizer not found in cache or outdated. Downloading from huggingface.co ..." - ) - else: - # Remote model - print( - f"'{self.model_name}' local tokenizer was updated! Saving new tokenizer now..." - ) + print( + f"'{self.model_name}' tokenizer needs updating! Downloading tokenizer now..." + ) # Download tokenizer from HuggingFace, or load it from the local folder - if self.model_type == ModelType.LLAMA: - hf_tokenizer = LlamaTokenizer.from_pretrained( - self.model_name, use_fast=True - ) - else: - hf_tokenizer = AutoTokenizer.from_pretrained(self.model_name) - # Print log message to notify user download of tokenizer has finished - if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): - print("Done downloading tokenizer. Saving it now...") + hf_tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) # Save tokenizer hf_tokenizer.save_pretrained(self.tokenizer_path) - print("Done saving HF tokenizer.") + print("Done updating HF tokenizer.") # Save new revision hash to file with open(ff_revision_file, "w+") as f: f.write(latest_revision) - else: - print(f"Loading '{self.model_name}' tokenizer from the cache...") - def compile( self, generation_config: GenerationConfig = GenerationConfig(), max_requests_per_batch: int = 1, max_seq_length: int = 256, max_tokens_per_batch: int = 64, + enable_peft_finetuning: bool = False, model_specific_data_parallelism_degree: int = None, model_specific_tensor_parallelism_degree: int = None, model_specific_pipeline_parallelism_degree: int = None, @@ -312,6 +380,8 @@ def compile( :type max_seq_length: int, optional :param max_tokens_per_batch: The maximum number of tokens (across requests) to allow per batch, defaults to 64 :type max_tokens_per_batch: int, optional + :param enable_peft_finetuning: Whether to enable support for PEFT fine-tuning, defaults to False + :type enable_peft_finetuning: bool, optional :param model_specific_data_parallelism_degree: Use this parameter if you want to give the LLM a different data parallelism degree than the one used to initialize the runtime, defaults to None :type model_specific_data_parallelism_degree: int, optional :param model_specific_tensor_parallelism_degree: Use this parameter if you want to give the LLM a different tensor parallelism degree than the one used to initialize the runtime, defaults to None @@ -321,9 +391,6 @@ def compile( :param ssms: The SSMs to use when operating in speculative inference mode, defaults to [] :type ssms: list, optional """ - # self.max_requests_per_batch = max_requests_per_batch - # self.max_seq_length = max_seq_length - # self.max_tokens_per_batch = max_tokens_per_batch self.ssms = ssms self.generation_config = GenerationConfig() self.ffconfig = FFConfig() @@ -355,6 +422,7 @@ def compile( self.rm.set_max_requests_per_batch(max_requests_per_batch) self.rm.set_max_tokens_per_batch(max_tokens_per_batch) self.rm.set_max_sequence_length(max_seq_length) + self.rm.set_enable_peft_finetuning(enable_peft_finetuning) # Instantiate 
the relevant model self.model = self.model_class( @@ -366,16 +434,27 @@ def compile( max_tokens_per_batch, ) + # Download the config from huggingface + self.download_hf_config() + + # Download the tokenizer from huggingface (if needed) and load them + self.download_hf_tokenizer_if_needed() + # Download the weights from huggingface (if needed) self.download_hf_weights_if_needed() + # Add PEFT layer if registered + for ff_peft_config, peft_dict in self.pefts.items(): + ff_peft_config.ff_compile() + ff_peft_model_id = self.model.ffmodel.add_lora_layer(ff_peft_config) + peft_dict["ff_peft_model_id"] = ff_peft_model_id + # Create file data loader, load weights into tensors model_configs = self.config_class(self.hf_config) self.rm.set_max_spec_tree_token_num( model_configs.max_spec_tree_token_num - if "max_spec_tree_token_num" - in model_configs.__dict__ + if "max_spec_tree_token_num" in model_configs.__dict__ else 20 ) @@ -393,9 +472,6 @@ def compile( self.im = InferenceManager() self.im.register_model_weights_loader(self.model.ffmodel, self.fileloader) - # Download the tokenizer from huggingface (if needed) and load them - self.download_hf_tokenizer_if_needed() - # Create tokenizer (this must be done after we have downloaded the tokenizer bos_token_id = ( -1 if self.hf_config.bos_token_id is None else self.hf_config.bos_token_id @@ -419,22 +495,36 @@ def compile( atexit.register(self.rm.stop_server) - def generate(self, prompts: Union[str, List[str]], max_length: int = 128): + def generate( + self, + requests_or_prompts: Union[str, List[str], Request, List[Request]], + max_length: int = 128, + ): """Generate tokens based on the input prompt(s) - :param prompts: The generation prompt(s) in the form of a string, or list of strings - :type prompts: Union[str, List[str]] + :param requests_or_prompts: The generation prompt(s) in the form of a string, a list of strings, a Request, or list of Requests + :type requests_or_prompts: Union[str, List[str], Request, List[Request]] :return: the generation results :rtype: GenerationResult """ - if type(prompts) == str: - if len(prompts) == 0: + if type(requests_or_prompts) == str: + if len(requests_or_prompts) == 0: return None - return self.model.ffmodel.generate([prompts], max_length) - elif type(prompts) == list: - if len(prompts) == 0: + return self.model.ffmodel.generate_inf_only( + [requests_or_prompts], max_length + ) + elif type(requests_or_prompts) == Request: + return self.model.ffmodel.generate(requests_or_prompts) + elif type(requests_or_prompts) == list: + if len(requests_or_prompts) == 0: return [] - return self.model.ffmodel.generate(prompts, max_length) + if type(requests_or_prompts[0]) == str: + return self.model.ffmodel.generate_inf_only( + requests_or_prompts, max_length + ) + else: + print(requests_or_prompts) + return self.model.ffmodel.generate(requests_or_prompts) else: assert False, "Please pass a non-empty string or list of strings" @@ -446,17 +536,6 @@ def stop_server(self): self.rm.stop_server() print("Background server stopped.") - def __enter__(self): - # Start the server when entering the context - # self.rm.start_server(self.model.ffmodel) - return self - - def __exit__(self, exc_type, exc_value, traceback): - # Stop the server when exiting the context - # self.rm.stop_server() - if exc_type: - print(f"Exception occurred: {exc_value}") - class SSM(LLM): """This class creates a SSM (Small-Speculative Model) object based on a model from HuggingFace""" @@ -482,13 +561,7 @@ def __init__( :param output_file: Path to the output file. 
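A hedged usage sketch for the generate() dispatch above, assuming an already-compiled LLM object named llm; the prompt texts are illustrative.

# A single prompt or a list of prompts runs inference-only generation.
single = llm.generate("What is the capital of France?", max_length=64)
batch = llm.generate(
    [
        "Summarize the FlexFlow project in one sentence.",
        "Write a haiku about GPUs.",
    ],
    max_length=128,
)
# Fine-tuning work is submitted as Request objects (see RequestType above);
# their exact fields are defined elsewhere in this patch series, so they are
# not spelled out here.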
If left blank, the output will not be written to file, defaults to "" :type output_file: str, optional """ - super().__init__( - model_name, - data_type, - cache_path, - refresh_cache, - output_file, - ) + super().__init__(model_name, data_type, cache_path, refresh_cache, output_file) def compile( self, @@ -496,15 +569,13 @@ def compile( max_requests_per_batch: int = 16, max_seq_length: int = 256, max_tokens_per_batch: int = 128, + enable_peft_finetuning: bool = False, model_specific_data_parallelism_degree: int = 1, model_specific_tensor_parallelism_degree: int = 1, model_specific_pipeline_parallelism_degree: int = 1, ssms: list = [], ): """Compile the SSM for inference and load the weights into memory - - :param mode: The SSM inference mode (InferenceMode.INC_DECODING_MODE for incremental decoding, InferenceMode.BEAM_SEARCH_MODE for beam search, or InferenceMode.TREE_VERIFY_MODE for token tree verification), defaults to InferenceMode.INC_DECODING_MODE - :type mode: InferenceMode, optional :param generation_config: The GenerationConfig object with the configurations to use for sampling, defaults to GenerationConfig() :type generation_config: GenerationConfig, optional :param max_requests_per_batch: The maximum batch size to allow, defaults to 16 @@ -513,6 +584,8 @@ def compile( :type max_seq_length: int, optional :param max_tokens_per_batch: The maximum number of tokens (across requests) to allow per batch, defaults to 128 :type max_tokens_per_batch: int, optional + :param enable_peft_finetuning: Whether to enable support for PEFT fine-tuning, defaults to False + :type enable_peft_finetuning: bool, optional :param model_specific_data_parallelism_degree: Use this parameter if you want to give the SSM a different data parallelism degree than the default one, defaults to 1 :type model_specific_data_parallelism_degree: int, optional :param model_specific_tensor_parallelism_degree: Use this parameter if you want to give the SSM a different tensor parallelism degree than the default one, defaults to 1 @@ -527,6 +600,7 @@ def compile( max_requests_per_batch, max_seq_length, max_tokens_per_batch, + enable_peft_finetuning, model_specific_data_parallelism_degree, model_specific_tensor_parallelism_degree, model_specific_pipeline_parallelism_degree, diff --git a/python/flexflow/type.py b/python/flexflow/type.py index 994a85f57e..0f4726837c 100644 --- a/python/flexflow/type.py +++ b/python/flexflow/type.py @@ -46,6 +46,12 @@ class LossType(Enum): LOSS_IDENTITY = 54 +class OptimizerType(Enum): + OPTIMIZER_TYPE_NONE = 60 + OPTIMIZER_TYPE_SGD = 61 + OPTIMIZER_TYPE_ADAM = 62 + + class CompMode(Enum): TRAINING = 70 INFERENCE = 71 @@ -153,6 +159,11 @@ class OpType(Enum): RESIDUAL_LAYERNORM = 2306 +class RequestType(Enum): + REQ_INFERENCE = 4001 + REQ_FINETUNING = 4002 + + def enum_to_int(enum, enum_item): for item in enum: if enum_item == item: diff --git a/rdelacou/generate_trace.py b/rdelacou/generate_trace.py new file mode 100644 index 0000000000..986dab37df --- /dev/null +++ b/rdelacou/generate_trace.py @@ -0,0 +1,121 @@ +import pandas as pd +from math import ceil +from random import shuffle, uniform +import json, pickle, requests, os, argparse + +class TraceBuilder(object): + + # trace_type: either "conv" or "code" + def __init__(self, import_times=True, import_prompts=True): + self.req_times = None + self.imported_req_times = False + self.prompt_data = None + self.imported_prompt_data = False + if import_times: + self.import_trace_timestamps() + if import_prompts: + self.import_prompt_data() + + def 
import_trace_timestamps(self, trace_type="conv"): + if not self.imported_req_times: + # Import Microsoft LLM 1 hour trace + df_trace = pd.read_csv("https://raw.githubusercontent.com/Azure/AzurePublicDataset/master/data/AzureLLMInferenceTrace_"+trace_type+".csv", parse_dates=["TIMESTAMP"]) + req_times = (pd.to_datetime(df_trace["TIMESTAMP"]).astype(int)//1000) # Timestamps are in microseconds + req_times = req_times - req_times.min() + self.req_times = req_times.tolist() + self.imported_req_times = True + + def import_prompt_data(self, shuffle_=True): + if not self.imported_prompt_data: + sharegpt_filename = "sharegpt_opt_text_completion_length.pkl" + sharegpt_filepath = f"./{sharegpt_filename}" + if os.path.exists(sharegpt_filepath): + os.remove("sharegpt_opt_text_completion_length.pkl") + sharegpt_url = f"https://github.com/sosp-ae-39/sosp-ae-astra/raw/main/datasets/{sharegpt_filename}" + response = requests.get(sharegpt_url) + with open(sharegpt_filename, "wb") as file: + file.write(response.content) + with open(sharegpt_filepath, 'rb') as f: + data2 = pickle.load(f) + os.remove("sharegpt_opt_text_completion_length.pkl") + + prompt_lengths = [pair[0] for pair in data2 if pair[0] <= 2048 and pair[0] >= 4 and pair[1] >= 4 and pair[1] <= 2048 and pair[0]+pair[1] <= 2048] + generation_lengths = [pair[1] for pair in data2 if pair[0] <= 2048 and pair[0] >= 4 and pair[1] >= 4 and pair[1] <= 2048 and pair[0]+pair[1] <= 2048] + + for pair in data2: + assert(len(pair) == 2) + + prompt_lengths = [pair[0] for pair in data2 if pair[0] <= 2048 and pair[0] >= 4 and pair[1] >= 4 and pair[1] <= 2048 and pair[0]+pair[1] <= 2048] + generation_lengths = [pair[1] for pair in data2 if pair[0] <= 2048 and pair[0] >= 4 and pair[1] >= 4 and pair[1] <= 2048 and pair[0]+pair[1] <= 2048] + num_pairs = len(prompt_lengths) + assert(num_pairs == len(generation_lengths)) + print("Number of conversation pairs: ", num_pairs) + + print(f"Prompt lengths: min={min(prompt_lengths)}, max={max(prompt_lengths)}, avg={sum(prompt_lengths)/len(prompt_lengths)}") + print(f"Generation lengths: min={min(generation_lengths)}, max={max(generation_lengths)}, avg={sum(generation_lengths)/len(generation_lengths)}") + total_lengths = [prompt_lengths[i] + generation_lengths[i] for i in range(len(prompt_lengths))] + print(f"Total lengths: min={min(total_lengths)}, max={max(total_lengths)}, avg={sum(total_lengths)/len(total_lengths)}") + + self.prompt_data = [{"human": prompt_lengths[i], "gpt": generation_lengths[i]} for i in range(num_pairs)] + + if shuffle_: + shuffle(self.prompt_data) + self.imported_prompt_data = True + + # Delta is in seconds + # Rate is in req per second + def generate_trace(self, target_arrival_rate=10, debug_verbose=False): + self.import_trace_timestamps() + self.import_prompt_data() + + microsec = 1000000 + avg_arrival_rate = len(self.req_times) / (self.req_times[-1]/float(microsec)) # Request per second. 
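generate_trace() scales each one-second bucket by the ratio of the target to the observed arrival rate and rounds the fractional result stochastically (see the bucket loop just below), so the expected request count matches the target rate. The rounding trick in isolation:

from random import uniform

def stochastic_round(x):
    # Round down, then add 1 with probability equal to the fractional part,
    # so the expected value of the result equals x.
    frac = x - int(x)
    return int(x) + int(uniform(0, 1) <= frac)

# Averaged over many draws, stochastic_round(2.3) tends to 2.3:
# sum(stochastic_round(2.3) for _ in range(100_000)) / 100_000  # ~2.3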
Computed this way to keep the numbers at a reasonable order of magnitude + if debug_verbose: + print("Avg arrival rate of original trace (req/s): ", avg_arrival_rate) + scale_factor = float(target_arrival_rate) / avg_arrival_rate + if debug_verbose: + print("Scale factor to obtain target arrival rate: ", scale_factor) + + # Buckets are 1-second timeframes + nb_buckets = ceil(self.req_times[-1] / microsec) + buckets = [] + j = 0 + k = 0 + for i in range(nb_buckets): + bucket_size = 0 + while(j < len(self.req_times) and self.req_times[j] >= i*microsec and self.req_times[j] < (i+1)*microsec): + bucket_size += 1 + j += 1 + bucket_size = bucket_size*scale_factor + prob = bucket_size - int(bucket_size) + bucket_size = int(bucket_size) + int(uniform(0, 1) <= prob) + + # If we have used all of the prompt data, loop back to the beginning and reuse some prompts + if k+bucket_size > len(self.prompt_data): + bucket = self.prompt_data[k:] + self.prompt_data[:(k+bucket_size)%len(self.prompt_data)] + else: + bucket = self.prompt_data[k:k+bucket_size] + k = (k+bucket_size) % len(self.prompt_data) + buckets.append(bucket) + + if debug_verbose: + print("Avg arrival rate obtained (req/s): ", sum([len(b) for b in buckets])/len(buckets)) + return buckets + +def generate_and_save_trace(arrival_rate, output_file): + builder = TraceBuilder() + trace = builder.generate_trace(target_arrival_rate=arrival_rate, debug_verbose=True) + with open(output_file, 'w+') as f: + json.dump(trace, f, indent=2) + +if __name__ == '__main__': + # Set up the argument parser + parser = argparse.ArgumentParser(description='Generate and save a trace.') + parser.add_argument('--arrival-rate', type=float, default=10.0, help='The target arrival rate for the trace.') + parser.add_argument('--output-file', type=str, default='sharegpt.json', help='The path to the output file to save the trace.') + + # Parse the command-line arguments + args = parser.parse_args() + + # Call the function with the user-provided arrival rate + generate_and_save_trace(args.arrival_rate, args.output_file) diff --git a/requirements.txt b/requirements.txt index ad65622367..64f1808934 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,3 +16,11 @@ transformers>=4.31.0 sentencepiece einops pip +# peft-related +scipy +bitsandbytes +datasets +accelerate +loralib +triton +peft diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 5714c8fe3d..e39cb29037 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -67,6 +67,13 @@ class FFCObjectWrapper { FF_NEW_OPAQUE_WRAPPER(flexflow_request_manager_t, RequestManager *); FF_NEW_OPAQUE_WRAPPER(flexflow_file_data_loader_t, FileDataLoader *); FF_NEW_OPAQUE_WRAPPER(flexflow_generation_result_t, GenerationResult *); + // FF_NEW_OPAQUE_WRAPPER(flexflow_lora_optimizer_config_t, LoraOptimizerConfig + // *); FF_NEW_OPAQUE_WRAPPER(flexflow_lora_sgd_optimizer_config_t, + // LoraSGDOptimizerConfig *); + // FF_NEW_OPAQUE_WRAPPER(flexflow_lora_adam_optimizer_config_t, + // LoraAdamOptimizerConfig *); + FF_NEW_OPAQUE_WRAPPER(flexflow_lora_linear_config_t, LoraLinearConfig *); + FF_NEW_OPAQUE_WRAPPER(flexflow_peft_model_id_t, PEFTModelID *); }; Logger ffc_log("flexflow_c"); @@ -649,6 +656,7 @@ flexflow_tensor_t * bool elementwise_affine, float eps, bool use_bias, + bool inplace_residual, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); const Tensor input = FFCObjectWrapper::unwrap(input_); @@ -672,6 +680,7 @@ flexflow_tensor_t * elementwise_affine, eps, use_bias, + inplace_residual,
input->data_type, name); assert(tensor_outputs[0] != nullptr); @@ -679,7 +688,7 @@ flexflow_tensor_t * DEBUG_PRINT("[ResidualLayerNorm] input %p, residual1 %p, residual2 " "%p, output0: %p, " "output1: %p, use_two_residuals: %d, elementwise_affine %d, eps " - "%f, use_bias: %d, name %s", + "%f, use_bias: %d, inplace_residual: %d, name %s", input, residual1, residual2, @@ -689,6 +698,7 @@ flexflow_tensor_t * elementwise_affine, eps, use_bias, + inplace_residual, name); flexflow_tensor_t *tensor_outputs_wrapped = (flexflow_tensor_t *)calloc(2, sizeof(flexflow_tensor_t)); @@ -706,6 +716,7 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( bool elementwise_affine, float eps, bool use_bias, + bool inplace_residual, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); const Tensor input = FFCObjectWrapper::unwrap(input_); @@ -722,13 +733,14 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( elementwise_affine, eps, use_bias, + inplace_residual, input->data_type, name); assert(tensor_outputs[0] != nullptr); assert(tensor_outputs[1] != nullptr); DEBUG_PRINT("[AddBiasResidualLayerNorm] input %p, residual %p, output0: %p, " "output1: %p, elementwise_affine %d, eps " - "%f, use_bias %d, name %s", + "%f, use_bias %d, inplace_residual: %d, name %s", input, residual, tensor_outputs[0], @@ -736,6 +748,7 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( elementwise_affine, eps, use_bias, + inplace_residual, name); flexflow_tensor_t *tensor_outputs_wrapped = (flexflow_tensor_t *)calloc(2, sizeof(flexflow_tensor_t)); @@ -1469,13 +1482,20 @@ flexflow_tensor_t * const flexflow_tensor_t input2_, float eps, int dim, + bool inplace_residual, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input1 = FFCObjectWrapper::unwrap(input1_); Tensor input2 = FFCObjectWrapper::unwrap(input2_); Tensor tensor_outputs[2]; - handle->residual_rms_norm( - input1, input2, tensor_outputs, eps, dim, input1->data_type, name); + handle->residual_rms_norm(input1, + input2, + tensor_outputs, + eps, + dim, + inplace_residual, + input1->data_type, + name); assert(tensor_outputs[0] != nullptr); assert(tensor_outputs[1] != nullptr); flexflow_tensor_t *tensor_outputs_wrapped = @@ -1529,6 +1549,21 @@ flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, return FFCObjectWrapper::wrap(tensor); } +flexflow_peft_model_id_t flexflow_model_add_lora_layer( + flexflow_model_t handle_, + const flexflow_lora_linear_config_t peft_config_) { + FFModel *handle = FFCObjectWrapper::unwrap(handle_); + LoraLinearConfig const *peft_config = FFCObjectWrapper::unwrap(peft_config_); + PEFTModelID *peft_model_id = handle->add_lora_layer(*peft_config); + + DEBUG_PRINT("[Add Lora Layer] model handle: %p, peft_config handle %p, " + "peft_model_id: %p", + handle, + peft_config, + peft_model_id); + return FFCObjectWrapper::wrap(peft_model_id); +} + void flexflow_model_set_sgd_optimizer(flexflow_model_t handle_, flexflow_sgd_optimizer_t optimizer_) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); @@ -1584,39 +1619,83 @@ void flexflow_model_set_transformer_layer_id(flexflow_model_t handle_, int id) { void flexflow_model_generate(flexflow_model_t handle_, int num_requests, + enum RequestType *request_types, char const **input_texts, - int max_num_chars, char **output_texts, - int max_seq_length, - int **output_length_and_tokens) { + int *max_seq_lengths, + flexflow_peft_model_id_t *peft_model_ids, + char const **dataset_filepaths, + int 
*training_steps, + int **output_length_and_tokens, + int *num_finetuning_losses, + float *finetuning_losses) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - std::vector prompts; + std::vector requests; + for (int i = 0; i < num_requests; i++) { - std::string const text_str(input_texts[i]); - prompts.push_back(text_str); - DEBUG_PRINT("[Model] generate[%d] %p %s %i", - i, - handle, - text_str.c_str(), - max_seq_length); + if (request_types[i] == RequestType::REQ_INFERENCE) { + std::string const text_str(input_texts[i]); + Request inference_req; + inference_req.prompt = text_str; + inference_req.max_sequence_length = max_seq_lengths[i]; + PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_ids[i]); + if (peft_model_id != nullptr) { + inference_req.peft_model_id = *peft_model_id; + } + requests.push_back(inference_req); + DEBUG_PRINT("[Model] generate[%d] %p %s %i", + i, + handle, + text_str.c_str(), + max_seq_lengths[i]); + } else if (request_types[i] == RequestType::REQ_FINETUNING) { + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.max_sequence_length = max_seq_lengths[i]; + PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_ids[i]); + if (peft_model_id != nullptr) { + fine_tuning_req.peft_model_id = *peft_model_id; + } + std::string const dataset_fp(dataset_filepaths[i]); + fine_tuning_req.dataset_filepath = dataset_fp; + fine_tuning_req.max_training_steps = training_steps[i]; + requests.push_back(fine_tuning_req); + DEBUG_PRINT("[Model] finetune[%d] %p %s %i %i", + i, + handle, + dataset_fp.c_str(), + max_seq_lengths[i], + training_steps[i]); + } else { + assert(false && "Unknown request type"); + } } - std::vector results = - handle->generate(prompts, max_seq_length); - // If the prompt exceeds max seq len, check that we return the prompt with no - // additional token. Otherwise, check that the output does not exceed the max - // sequence length. + + std::vector results = handle->generate(requests); + for (int i = 0; i < num_requests; i++) { - assert(results[i].output_tokens.size() <= max_seq_length || - results[i].output_tokens.size() == results[i].input_tokens.size()); - output_length_and_tokens[i][0] = results[i].output_tokens.size(); - std::copy(results[i].output_tokens.begin(), - results[i].output_tokens.end(), - output_length_and_tokens[i] + 1); - std::memcpy(output_texts[i], - results[i].output_text.c_str(), - results[i].output_text.length()); + if (request_types[i] == RequestType::REQ_INFERENCE) { + // If the prompt exceeds max seq len, check that we return the prompt with + // no additional token. Otherwise, check that the output does not exceed + // the max sequence length. 
+ assert(results[i].output_tokens.size() <= max_seq_lengths[i] || + results[i].output_tokens.size() == results[i].input_tokens.size()); + output_length_and_tokens[i][0] = results[i].output_tokens.size(); + std::copy(results[i].output_tokens.begin(), + results[i].output_tokens.end(), + output_length_and_tokens[i] + 1); + std::memcpy(output_texts[i], + results[i].output_text.c_str(), + results[i].output_text.length()); + } else if (request_types[i] == RequestType::REQ_FINETUNING) { + assert(results[i].finetuning_losses.size() > 0); + *num_finetuning_losses = results[i].finetuning_losses.size(); + // *finetuning_losses = results[i].finetuning_losses.data(); + std::memcpy(finetuning_losses, + results[i].finetuning_losses.data(), + results[i].finetuning_losses.size() * sizeof(float)); + } } - // return FFCObjectWrapper::wrap(&results[0]); } void flexflow_model_set_position_offset(flexflow_model_t handle_, @@ -2597,6 +2676,14 @@ void flexflow_request_manager_set_max_sequence_length( DEBUG_PRINT("[RequestManager] set max_sequence_length %d", max_seq_length); } +void flexflow_request_manager_set_enable_peft_finetuning( + flexflow_request_manager_t handle_, bool enable_peft_finetuning_) { + RequestManager *handle = FFCObjectWrapper::unwrap(handle_); + handle->set_enable_peft_finetuning(enable_peft_finetuning_); + DEBUG_PRINT("[RequestManager] set_enable_peft_finetuning %d", + enable_peft_finetuning_); +} + void flexflow_request_manager_register_tokenizer( flexflow_request_manager_t handle_, enum ModelType model_type, @@ -2730,3 +2817,238 @@ void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_, FFModel *model = FFCObjectWrapper::unwrap(model_handle_); handle->load_weights(model); } + +// // ----------------------------------------------------------------------- +// // LoraSGDOptimizerConfig +// // ----------------------------------------------------------------------- + +// flexflow_lora_sgd_optimizer_config_t +// flexflow_lora_sgd_optimizer_config_create( +// double lr, double momentum, bool nesterov, bool weight_decay) { +// LoraSGDOptimizerConfig *handle = +// new LoraSGDOptimizerConfig(lr, momentum, nesterov, weight_decay); +// DEBUG_PRINT("[LoraSGDOptimizerConfig] new %p", handle); +// return FFCObjectWrapper::wrap(handle); +// } + +// void flexflow_lora_sgd_optimizer_config_destroy( +// flexflow_lora_sgd_optimizer_config_t handle_) { +// LoraSGDOptimizerConfig *handle = FFCObjectWrapper::unwrap(handle_); +// DEBUG_PRINT("[LoraSGDOptimizerConfig] delete %p", handle); +// delete handle; +// } + +// // ----------------------------------------------------------------------- +// // LoraAdamOptimizerConfig +// // ----------------------------------------------------------------------- + +// flexflow_lora_adam_optimizer_config_t +// flexflow_lora_adam_optimizer_config_create(double alpha, +// double beta1, +// double beta2, +// double weight_decay, +// double epsilon) { +// LoraAdamOptimizerConfig *handle = +// new LoraAdamOptimizerConfig(alpha, beta1, beta2, weight_decay, +// epsilon); +// DEBUG_PRINT("[LoraAdamOptimizerConfig] new %p", handle); +// return FFCObjectWrapper::wrap(handle); +// } + +// void flexflow_lora_adam_optimizer_config_destroy( +// flexflow_lora_adam_optimizer_config_t handle_) { +// LoraAdamOptimizerConfig *handle = FFCObjectWrapper::unwrap(handle_); +// DEBUG_PRINT("[LoraAdamOptimizerConfig] delete %p", handle); +// delete handle; +// } + +// ----------------------------------------------------------------------- +// LoraLinearConfig +// 
----------------------------------------------------------------------- + +flexflow_lora_linear_config_t + flexflow_lora_linear_config_create(char const *cache_folder_, + char const *peft_model_id_, + bool trainable, + bool init_lora_weights, + char const *base_model_name_or_path_, + char const *precision_, + int rank, + float lora_alpha, + float lora_dropout, + int num_target_modules, + char const **target_modules_, + enum OptimizerType optimizer_type, + float sgd_learning_rate, + float sgd_momentum, + bool sgd_nesterov, + float sgd_weight_decay, + float adam_alpha, + float adam_beta1, + float adam_beta2, + float adam_weight_decay, + float adam_epsilon) { + assert(cache_folder_ != nullptr && + "Cannot convert nullptr char * to std::string"); + assert(peft_model_id_ != nullptr && + "Cannot convert nullptr char * to std::string"); + assert(base_model_name_or_path_ != nullptr && + "Cannot convert nullptr char * to std::string"); + assert(precision_ != nullptr && + "Cannot convert nullptr char * to std::string"); + std::string const cache_folder(cache_folder_); + std::string const peft_model_id(peft_model_id_); + LoraOptimizerConfig *optim_config = nullptr; + if (optimizer_type == OptimizerType::OPTIMIZER_TYPE_SGD) { + optim_config = new LoraSGDOptimizerConfig( + sgd_learning_rate, sgd_momentum, sgd_nesterov, sgd_weight_decay); + } else if (optimizer_type == OptimizerType::OPTIMIZER_TYPE_ADAM) { + optim_config = new LoraAdamOptimizerConfig( + adam_alpha, adam_beta1, adam_beta2, adam_weight_decay, adam_epsilon); + } + std::vector target_modules; + for (int i = 0; i < num_target_modules; i++) { + std::string const target_module(target_modules_[i]); + target_modules.push_back(target_module); + } + std::string const base_model_name_or_path(base_model_name_or_path_); + std::string const precision(precision_); + LoraLinearConfig *handle = new LoraLinearConfig(cache_folder, + peft_model_id, + trainable, + optim_config, + init_lora_weights, + base_model_name_or_path, + precision, + rank, + lora_alpha, + lora_dropout, + target_modules); + DEBUG_PRINT("[LoraLinearConfig] new %p", handle); + return FFCObjectWrapper::wrap(handle); +} + +void flexflow_lora_linear_config_destroy( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *peft_config = FFCObjectWrapper::unwrap(handle_); + DEBUG_PRINT("[LoraLinearConfig] delete %p", peft_config); + delete peft_config; +} + +char const *flexflow_lora_linear_config_get_cache_folder( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->cache_folder.c_str(); +} + +char const *flexflow_lora_linear_config_get_peft_model_id( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->peft_model_id.c_str(); +} + +int flexflow_lora_linear_config_get_rank( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->rank; +} + +float flexflow_lora_linear_config_get_lora_alpha( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->lora_alpha; +} + +float flexflow_lora_linear_config_get_lora_dropout( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->lora_dropout; +} + +bool flexflow_lora_linear_config_get_trainable( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + return 
handle->trainable; +} + +bool flexflow_lora_linear_config_get_init_lora_weights( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->init_lora_weights; +} + +char const **flexflow_lora_linear_config_get_target_modules( + flexflow_lora_linear_config_t handle_, int *num_target_modules) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + *num_target_modules = handle->target_modules.size(); + static std::vector target_modules_; + target_modules_.clear(); + for (auto const &target_module : handle->target_modules) { + target_modules_.push_back(target_module.c_str()); + } + return target_modules_.data(); +} + +char const *flexflow_lora_linear_config_get_base_model_name_or_path( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->base_model_name_or_path.c_str(); +} + +char const *flexflow_lora_linear_config_get_precision( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->precision.c_str(); +} + +void flexflow_lora_linear_config_set_lora_alpha( + flexflow_lora_linear_config_t handle_, float value) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + handle->lora_alpha = value; +} + +void flexflow_lora_linear_config_set_lora_dropout( + flexflow_lora_linear_config_t handle_, float value) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + handle->lora_dropout = value; +} + +void flexflow_lora_linear_config_set_trainable( + flexflow_lora_linear_config_t handle_, bool value) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + handle->trainable = value; +} + +void flexflow_lora_linear_config_set_init_lora_weights( + flexflow_lora_linear_config_t handle_, bool value) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + handle->init_lora_weights = value; +} + +// ----------------------------------------------------------------------- +// PEFTModelID +// ----------------------------------------------------------------------- + +flexflow_peft_model_id_t flexflow_peft_model_id_create() { + PEFTModelID *handle = new PEFTModelID(); + DEBUG_PRINT("[PEFTModelID] new %p", handle); + return FFCObjectWrapper::wrap(handle); +} + +flexflow_peft_model_id_t flexflow_peft_model_id_create_id(size_t id) { + PEFTModelID *handle = new PEFTModelID(id); + DEBUG_PRINT("[PEFTModelID] new %p", handle); + return FFCObjectWrapper::wrap(handle); +} + +flexflow_peft_model_id_t flexflow_peft_model_id_no_id() { + PEFTModelID *handle = const_cast(&PEFTModelID::NO_ID); + DEBUG_PRINT("[PEFTModelID] new %p", handle); + return FFCObjectWrapper::wrap(handle); +} + +void flexflow_peft_model_id_destroy(flexflow_peft_model_id_t handle_) { + PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(handle_); + DEBUG_PRINT("[PEFTModelID] delete %p", peft_model_id); + delete peft_model_id; +} diff --git a/src/loss_functions/loss_functions.cpp b/src/loss_functions/loss_functions.cpp index a87aaade84..99c13f5a67 100644 --- a/src/loss_functions/loss_functions.cpp +++ b/src/loss_functions/loss_functions.cpp @@ -86,7 +86,7 @@ void Loss::sparse_categorical_crossentropy_loss_backward_kernel_wrapper( num_classes, k); // Scale logit gradients by op->scale_factor - hipLaunchKernelGGL(scale_kernel, + hipLaunchKernelGGL(scale_kernel, GET_BLOCKS(logit_grad_volume), CUDA_NUM_THREADS, 0, @@ -116,7 +116,7 @@ void Loss::categorical_crossentropy_loss_backward_kernel_wrapper( 
label_ptr, logit_volume); // Scale logit gradients by loss->scale_factor - hipLaunchKernelGGL(scale_kernel, + hipLaunchKernelGGL(scale_kernel, GET_BLOCKS(logit_grad_volume), CUDA_NUM_THREADS, 0, @@ -146,7 +146,7 @@ void Loss::mean_squared_error_avg_loss_backward_kernel_wrapper( label_ptr, logit_volume); // Scale logit gradients by loss->scale_factor - hipLaunchKernelGGL(scale_kernel, + hipLaunchKernelGGL(scale_kernel, GET_BLOCKS(logit_grad_volume), CUDA_NUM_THREADS, 0, @@ -173,7 +173,7 @@ void Loss::identity_loss_backward_kernel_wrapper(float *loss_grad_ptr, loss_ptr, loss_volume); // Scale logit gradients by loss->scale_factor - hipLaunchKernelGGL(scale_kernel, + hipLaunchKernelGGL(scale_kernel, GET_BLOCKS(loss_grad_volume), CUDA_NUM_THREADS, 0, diff --git a/src/loss_functions/loss_functions.cu b/src/loss_functions/loss_functions.cu index f78311980c..636ef9c4c3 100644 --- a/src/loss_functions/loss_functions.cu +++ b/src/loss_functions/loss_functions.cu @@ -81,7 +81,7 @@ void Loss::sparse_categorical_crossentropy_loss_backward_kernel_wrapper( logit_grad_ptr, label_ptr, num_samples, num_classes, k); // Scale logit gradients by op->scale_factor scale_kernel<<>>( - logit_grad_ptr, logit_grad_volume, 0, scale_factor * k); + logit_grad_ptr, logit_grad_volume, 0.0f, scale_factor * k); } void Loss::categorical_crossentropy_loss_backward_kernel_wrapper( @@ -100,7 +100,7 @@ void Loss::categorical_crossentropy_loss_backward_kernel_wrapper( logit_grad_ptr, logit_ptr, label_ptr, logit_volume); // Scale logit gradients by loss->scale_factor scale_kernel<<>>( - logit_grad_ptr, logit_grad_volume, 0, scale_factor); + logit_grad_ptr, logit_grad_volume, 0.0f, scale_factor); } void Loss::mean_squared_error_avg_loss_backward_kernel_wrapper( @@ -119,7 +119,7 @@ void Loss::mean_squared_error_avg_loss_backward_kernel_wrapper( logit_grad_ptr, logit_ptr, label_ptr, logit_volume); // Scale logit gradients by loss->scale_factor scale_kernel<<>>( - logit_grad_ptr, logit_grad_volume, 0, scale_factor); + logit_grad_ptr, logit_grad_volume, 0.0f, scale_factor); } void Loss::identity_loss_backward_kernel_wrapper(float *loss_grad_ptr, @@ -135,7 +135,7 @@ void Loss::identity_loss_backward_kernel_wrapper(float *loss_grad_ptr, stream>>>(loss_grad_ptr, loss_ptr, loss_volume); // Scale logit gradients by loss->scale_factor scale_kernel<<>>( - loss_grad_ptr, loss_grad_volume, 0, scale_factor); + loss_grad_ptr, loss_grad_volume, 0.0f, scale_factor); } }; // namespace FlexFlow diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index a17e156f18..7a1da2e974 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -43,7 +43,8 @@ bool operator==(AddBiasResidualLayerNormParams const &lhs, AddBiasResidualLayerNormParams const &rhs) { return lhs.layer_guid == rhs.layer_guid && lhs.axes == rhs.axes && lhs.elementwise_affine == rhs.elementwise_affine && - lhs.use_bias == rhs.use_bias; + lhs.use_bias == rhs.use_bias && + lhs.inplace_residual == rhs.inplace_residual; } bool AddBiasResidualLayerNormParams::is_valid( @@ -58,7 +59,8 @@ AddBiasResidualLayerNormParams AddBiasResidualLayerNorm::get_params() const { params.elementwise_affine = this->elementwise_affine; params.eps = this->eps; params.use_bias = this->use_bias; - if (this->name != nullptr) { + params.inplace_residual = this->inplace_residual; + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -71,6 +73,7 @@ void 
FFModel::add_bias_residual_layer_norm(const Tensor input, bool elementwise_affine, float eps, bool use_bias, + bool inplace_residual, DataType data_type, char const *name) { // In PyTorch, axes must be the sizes of the last axes.size() dimensions of @@ -171,6 +174,7 @@ void FFModel::add_bias_residual_layer_norm(const Tensor input, ln->add_int_property("use_bias", use_bias); ln->add_int_vector_property("axes", axes); ln->add_float_property("eps", eps); + ln->add_int_property("inplace_residual", inplace_residual); layers.push_back(ln); outputs[0] = ln->outputs[0]; outputs[1] = ln->outputs[1]; @@ -189,6 +193,8 @@ Op *AddBiasResidualLayerNorm::create_operator_from_layer( layer->get_int_vector_property("axes", axes); float eps; layer->get_float_property("eps", eps); + layer->get_int_property("inplace_residual", value); + bool inplace_residual = (bool)value; return new AddBiasResidualLayerNorm(model, layer->layer_guid, inputs[0], @@ -197,6 +203,7 @@ Op *AddBiasResidualLayerNorm::create_operator_from_layer( elementwise_affine, use_bias, eps, + inplace_residual, false, // allocate_weights layer->name); } @@ -215,6 +222,7 @@ AddBiasResidualLayerNorm::AddBiasResidualLayerNorm( params.elementwise_affine, params.use_bias, params.eps, + params.inplace_residual, allocate_weights, params.name) {} @@ -227,6 +235,7 @@ AddBiasResidualLayerNorm::AddBiasResidualLayerNorm( bool _elementwise_affine, bool _use_bias, float _eps, + bool _inplace_residual, bool allocate_weights, char const *name) : Op(model, @@ -239,7 +248,7 @@ AddBiasResidualLayerNorm::AddBiasResidualLayerNorm( _input, _residual), elementwise_affine(_elementwise_affine), eps(_eps), axes(_axes), - use_bias(_use_bias) { + use_bias(_use_bias), inplace_residual(_inplace_residual) { // overwrite layer_guid layer_guid = _layer_guid; outputs[0] = model.create_parallel_tensor_legion_ordering( @@ -348,48 +357,57 @@ void AddBiasResidualLayerNorm::init_inference( false /*must*/, 0 /*mapper_id*/, machine_view_hash); + if (inplace_residual) { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } // attn output - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); + // added: attn_output + attn final bias + residual + int fid = 0; + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? 
READ_WRITE : READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(fid++, FID_DATA); // residual launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, batch_inputs[1]->region)); - launcher.add_field(1, FID_DATA); - // added: attn_output + attn final bias + residual - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + } // layer norm output launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[1]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(fid++, FID_DATA); // attn final bias launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(fid++, FID_DATA); if (elementwise_affine) { launcher.add_region_requirement(RegionRequirement(weights[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[1]->region)); - launcher.add_field(5, FID_DATA); + launcher.add_field(fid++, FID_DATA); if (use_bias) { launcher.add_region_requirement(RegionRequirement(weights[2]->part, @@ -397,7 +415,7 @@ void AddBiasResidualLayerNorm::init_inference( READ_ONLY, EXCLUSIVE, weights[2]->region)); - launcher.add_field(6, FID_DATA); + launcher.add_field(fid++, FID_DATA); } } FutureMap fm = runtime->execute_index_space(ctx, launcher); @@ -420,48 +438,56 @@ void AddBiasResidualLayerNorm::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); - // attn output - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); + if (inplace_residual) { + assert(outputs[0]->part == inputs[0]->part); + assert(outputs[0]->region == inputs[0]->region); + } + // input: attn output + // added: attn_output + attn final bias + residual + int fid = 0; + launcher.add_region_requirement( + RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? 
READ_WRITE : READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(fid++, FID_DATA); // residual launcher.add_region_requirement(RegionRequirement(inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, inputs[1]->region)); - launcher.add_field(1, FID_DATA); - // added: attn_output + attn final bias + residual - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + } // layer norm output launcher.add_region_requirement(RegionRequirement(outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, outputs[1]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(fid++, FID_DATA); // attn final bias launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(fid++, FID_DATA); if (elementwise_affine) { launcher.add_region_requirement(RegionRequirement(weights[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[1]->region)); - launcher.add_field(5, FID_DATA); + launcher.add_field(fid++, FID_DATA); if (use_bias) { launcher.add_region_requirement(RegionRequirement(weights[2]->part, @@ -469,7 +495,7 @@ void AddBiasResidualLayerNorm::init(FFModel const &ff) { READ_ONLY, EXCLUSIVE, weights[2]->region)); - launcher.add_field(6, FID_DATA); + launcher.add_field(fid++, FID_DATA); } } FutureMap fm = runtime->execute_index_space(ctx, launcher); @@ -478,13 +504,11 @@ void AddBiasResidualLayerNorm::init(FFModel const &ff) { } /* - regions[0](I): attn output - regions[1](I): residual - regions[2](O): added output (attn output + final attn bias + residual) - regions[3](O): layer norm output - regions[4](I): final attn bias - regions[5](I): gamma - regions[6](I): beta + regions[0](I/O): attn output AND added output (attn output + final attn bias + + residual) regions[1](I): residual regions[2](O): layer norm output + regions[3](I): final attn bias + regions[4](I): gamma + regions[5](I): beta */ OpMeta *AddBiasResidualLayerNorm::init_task( Task const *task, @@ -517,10 +541,6 @@ void AddBiasResidualLayerNorm::forward(FFModel const &ff) { assert(false); } -void AddBiasResidualLayerNorm::backward(FFModel const &ff) { - assert(false); -} - FutureMap AddBiasResidualLayerNorm::inference( FFModel const &ff, BatchConfigFuture const &bc, @@ -546,69 +566,94 @@ FutureMap AddBiasResidualLayerNorm::inference( 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); - // attn output - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + if (inplace_residual) { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } + int fid = 0; + // input + // added_output: input + attn bias + residual + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? 
READ_WRITE : READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + // attn bias + launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); + weights[0]->region)); + launcher.add_field(fid++, FID_DATA); // residual launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, batch_inputs[1]->region)); - launcher.add_field(1, FID_DATA); - // added: attn_output + attn final bias + residual - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(2, FID_DATA); - // layer norm output + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + } + // output launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[1]->region)); - launcher.add_field(3, FID_DATA); - // attn final bias - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(fid++, FID_DATA); if (elementwise_affine) { + // gamma launcher.add_region_requirement(RegionRequirement(weights[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[1]->region)); - launcher.add_field(5, FID_DATA); - + launcher.add_field(fid++, FID_DATA); if (use_bias) { + // beta launcher.add_region_requirement(RegionRequirement(weights[2]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[2]->region)); - launcher.add_field(6, FID_DATA); + launcher.add_field(fid++, FID_DATA); } } return runtime->execute_index_space(ctx, launcher); } +void AddBiasResidualLayerNorm::map_output_tensors(FFModel &ff) { + assert(numOutputs == 2); + assert(outputs[0]->get_volume() == inputs[0]->get_volume()); + if (inplace_residual) { + outputs[0]->parallel_is = inputs[0]->parallel_is; + outputs[0]->region = inputs[0]->region; + outputs[0]->part = inputs[0]->part; + outputs[0]->region_grad = inputs[0]->region_grad; + outputs[0]->part_grad = inputs[0]->part_grad; + // map output 1 to new region + ff.map_tensor(outputs[1], this); + } else { + Op::map_output_tensors(ff); + } +} + /* - regions[0](I): attn output - regions[1](I): residual - regions[2](O): added output (attn output + final attn bias + residual) - regions[3](O): layer norm output - regions[4](I): final attn bias - regions[5](I): gamma - regions[6](I): beta + regions[0](I): input / added output + regions[1](I): attn bias + regions[2](I): residual + regions[3](O): output + regions[4](I): gamma + regions[5](I): beta */ void AddBiasResidualLayerNorm::inference_task( Task const *task, @@ -626,30 +671,72 @@ void AddBiasResidualLayerNorm::inference_task( *((AddBiasResidualLayerNormMeta **)task->local_args); assert(regions.size() == - 5 + (m->elementwise_affine ? (m->use_bias ? 
2 : 1) : 0)); - - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR residual = helperGetGenericTensorAccessorRO( - m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorW added_output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[1], regions[3], task->regions[3], FID_DATA, ctx, runtime); - GenericTensorAccessorR attn_bias = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime); + 4 + (m->elementwise_affine ? (m->use_bias ? 2 : 1) : 0)); + + int rid = 0, tid = 0, did = 0; + GenericTensorAccessorR input = + helperGetGenericTensorAccessorRO(m->input_type[0], + regions[rid++], + task->regions[tid++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR attn_bias = + helperGetGenericTensorAccessorRO(m->weight_type[0], + regions[rid++], + task->regions[tid++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR residual = + helperGetGenericTensorAccessorRO(m->input_type[1], + regions[rid++], + task->regions[tid++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW added_output; + if (m->inplace_residual) { + added_output = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + } else { + added_output = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[rid++], + task->regions[tid++], + FID_DATA, + ctx, + runtime); + } + GenericTensorAccessorW output = + helperGetGenericTensorAccessorWO(m->output_type[1], + regions[rid++], + task->regions[tid++], + FID_DATA, + ctx, + runtime); GenericTensorAccessorR gamma, beta; Domain in_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); + ctx, task->regions[did++].region.get_index_space()); + Domain attn_bias_domain = runtime->get_index_space_domain( + ctx, task->regions[did++].region.get_index_space()); Domain residual_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - Domain added_out_domain = runtime->get_index_space_domain( - ctx, task->regions[2].region.get_index_space()); + ctx, task->regions[did++].region.get_index_space()); + Domain added_out_domain; + if (m->inplace_residual) { + added_out_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + } else { + added_out_domain = runtime->get_index_space_domain( + ctx, task->regions[did++].region.get_index_space()); + } Domain out_domain = runtime->get_index_space_domain( - ctx, task->regions[3].region.get_index_space()); - Domain attn_bias_domain = runtime->get_index_space_domain( - ctx, task->regions[4].region.get_index_space()); + ctx, task->regions[did++].region.get_index_space()); + Domain gamma_domain, beta_domain; assert(in_domain.get_volume() == out_domain.get_volume()); @@ -673,23 +760,23 @@ void AddBiasResidualLayerNorm::inference_task( if (m->elementwise_affine) { gamma = helperGetGenericTensorAccessorRO(m->weight_type[1], - regions[5], - task->regions[5], + regions[rid++], + task->regions[tid++], FID_DATA, ctx, runtime); gamma_domain = runtime->get_index_space_domain( - ctx, task->regions[5].region.get_index_space()); + ctx, task->regions[did++].region.get_index_space()); if (m->use_bias) { beta = 
helperGetGenericTensorAccessorRO(m->weight_type[2], - regions[6], - task->regions[6], + regions[rid++], + task->regions[tid++], FID_DATA, ctx, runtime); beta_domain = runtime->get_index_space_domain( - ctx, task->regions[6].region.get_index_space()); + ctx, task->regions[did++].region.get_index_space()); assert(gamma_domain == beta_domain); } @@ -707,16 +794,7 @@ void AddBiasResidualLayerNorm::inference_task( } AddBiasResidualLayerNorm::inference_kernel_wrapper( - m, - (int)attn_bias_dim, - (int)residual_domain.get_volume(), - input, - added_output, - output, - residual, - attn_bias, - gamma, - beta); + m, bc, input, attn_bias, residual, added_output, output, gamma, beta); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); @@ -729,13 +807,299 @@ void AddBiasResidualLayerNorm::inference_task( weights_accessors.push_back(beta); } } + AddBiasResidualLayerNorm::save_inference_tensors_to_file( + m, shard_id, bc, {residual}, weights_accessors, {added_output, output}); + } +} + +void AddBiasResidualLayerNorm::backward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_backward(ff, argmap); + IndexLauncher launcher(ADD_BIAS_RESIDUAL_LAYERNORM_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + int field_id = 0; + // output_grad + launcher.add_region_requirement(RegionRequirement(outputs[1]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // added output + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(field_id++, FID_DATA); + // input grad + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // residual grad + launcher.add_region_requirement(RegionRequirement(inputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // attn bias + launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[0]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + if (elementwise_affine) { + // gamma + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(field_id++, FID_DATA); + // gamma_grad + launcher.add_region_requirement(RegionRequirement(weights[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + if (use_bias) { + // beta_grad + launcher.add_region_requirement( + RegionRequirement(weights[2]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[2]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + } + } + runtime->execute_index_space(ctx, launcher); +} + +void AddBiasResidualLayerNorm::backward_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == regions.size()); + AddBiasResidualLayerNormMeta *m = + *((AddBiasResidualLayerNormMeta **)task->local_args); + assert(regions.size() == + 5 + 
(m->elementwise_affine ? (m->use_bias ? 3 : 2) : 0)); + + int region_idx = 0, task_region_idx = 0; + + GenericTensorAccessorR output_grad = + helperGetGenericTensorAccessorRO(m->output_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR added_output = + helperGetGenericTensorAccessorRO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW input_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual_grad = + helperGetGenericTensorAccessorRW(m->input_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW attn_bias_grad = + helperGetGenericTensorAccessorRW(m->input_type[2], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR gamma; + GenericTensorAccessorW gamma_grad, beta_grad; + if (m->elementwise_affine) { + assert(m->use_bias == (regions.size() == 6)); + gamma = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + gamma_grad = + helperGetGenericTensorAccessorRW(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + if (m->use_bias) { + beta_grad = + helperGetGenericTensorAccessorRW(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + } + AddBiasResidualLayerNorm::backward_kernel_wrapper(m, + output_grad, + added_output, + input_grad, + residual_grad, + attn_bias_grad, + gamma, + gamma_grad, + beta_grad); +} + +Legion::FutureMap AddBiasResidualLayerNorm::peft_bwd( + FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(ADD_BIAS_RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + int field_id = 0; + // output_grad + launcher.add_region_requirement( + RegionRequirement(batch_outputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // input grad + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // residual grad + launcher.add_region_requirement( + RegionRequirement(batch_inputs[1]->part_grad, + 0 /*projection id*/, + reset_input_grads[1] ? 
WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + if (elementwise_affine) { + // gamma + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(field_id++, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} + +void AddBiasResidualLayerNorm::peft_bwd_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + assert(task->regions.size() == regions.size()); + AddBiasResidualLayerNormMeta *m = + *((AddBiasResidualLayerNormMeta **)task->local_args); + assert(regions.size() == 3 + m->elementwise_affine); + + int region_idx = 0, task_region_idx = 0; + + GenericTensorAccessorR output_grad = + helperGetGenericTensorAccessorRO(m->output_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW input_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual_grad = + helperGetGenericTensorAccessorRW(m->input_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + + GenericTensorAccessorR gamma; + if (m->elementwise_affine) { + gamma = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( + m, output_grad, input_grad, residual_grad, gamma); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + std::vector weights_accessors; + if (m->elementwise_affine) { + weights_accessors.push_back(gamma); + } AddBiasResidualLayerNorm::save_inference_tensors_to_file( m, shard_id, bc, - {input, residual}, + {input_grad, residual_grad}, weights_accessors, - {added_output, output}); + {output_grad}, + false /*fwd_pass*/); } } @@ -755,6 +1119,7 @@ void AddBiasResidualLayerNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->elementwise_affine); sez.serialize(this->eps); sez.serialize(this->use_bias); + sez.serialize(this->inplace_residual); sez.serialize(strlen(this->name)); sez.serialize(this->name, strlen(this->name)); } @@ -771,6 +1136,7 @@ Node AddBiasResidualLayerNorm::deserialize(FFModel &ff, bool elementwise_affine; bool use_bias; float eps; + bool inplace_residual; size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); dez.deserialize(transformer_layer_id); @@ -785,6 +1151,7 @@ Node AddBiasResidualLayerNorm::deserialize(FFModel &ff, dez.deserialize(elementwise_affine); dez.deserialize(eps); dez.deserialize(use_bias); + dez.deserialize(inplace_residual); size_t name_len; char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); @@ -796,6 +1163,7 @@ Node AddBiasResidualLayerNorm::deserialize(FFModel &ff, params.elementwise_affine = elementwise_affine; params.eps = eps; params.use_bias = use_bias; + params.inplace_residual = inplace_residual; strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); @@ -816,6 +1184,7 @@ size_t hash::operator()( } hash_combine(key, params.elementwise_affine); hash_combine(key, params.use_bias); + 
hash_combine(key, params.inplace_residual); return key; } }; // namespace std diff --git a/src/ops/add_bias_residual_layer_norm.cpp b/src/ops/add_bias_residual_layer_norm.cpp index 1add43ecd9..681f55c998 100644 --- a/src/ops/add_bias_residual_layer_norm.cpp +++ b/src/ops/add_bias_residual_layer_norm.cpp @@ -23,12 +23,13 @@ namespace FlexFlow { #define C10_WARP_SIZE 32 constexpr int kCUDABlockReduceNumThreads = 512; constexpr int kCUDANumThreads = 256; +constexpr int kColwiseReduceTileSize = 32; AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta( FFHandler handle, AddBiasResidualLayerNorm const *ln, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ln) { elementwise_affine = ln->elementwise_affine; use_bias = ln->use_bias; effective_batch_size = ln->effective_batch_size; @@ -45,6 +46,7 @@ AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta( data_type_size(data_type) * effective_batch_size); bias_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * effective_batch_size); + allocated_peft_buffer_size = 0; } AddBiasResidualLayerNormMeta::~AddBiasResidualLayerNormMeta(void) { @@ -75,7 +77,7 @@ __inline__ __device__ T WarpReduceSum(T val) { } template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { +__inline__ __device__ T BlockReduceSum(T val, T *shared) { int const lid = threadIdx.x % C10_WARP_SIZE; int const wid = threadIdx.x / C10_WARP_SIZE; val = WarpReduceSum(val); @@ -84,9 +86,7 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { shared[wid] = val; } __syncthreads(); - val = (threadIdx.x < min(blockDim.x, max_num_threads) / C10_WARP_SIZE) - ? shared[lid] - : 0; + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? shared[lid] : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -94,53 +94,36 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { } template -__global__ void LayerNormFusedForwardKernel(int attn_bias_dim, - int residual_volume, - int64_t effective_num_elements, - int64_t effective_batch_size, +__global__ void LayerNormFusedForwardKernel(int64_t N, + int64_t attn_bias_dim, float eps, T const *input_ptr, T const *attn_bias_ptr, T const *residual_ptr, - T *added_output_ptr, - T *output_ptr, - T const *gamma_ptr, - T const *beta_ptr, + T *X, T *mean, - T *rstd) { - // Add attention bias and residual - CUDA_KERNEL_LOOP(i, residual_volume) { - int bias_idx = i % attn_bias_dim; - added_output_ptr[i] = - input_ptr[i] + attn_bias_ptr[bias_idx] + residual_ptr[i]; - } - - __syncthreads(); - - // LayerNorm + T *rstd, + T const *gamma, + T const *beta, + T *Y) { __shared__ float m_shared[C10_WARP_SIZE]; __shared__ float v_shared[C10_WARP_SIZE]; const int64_t i = blockIdx.x; - if (i >= effective_batch_size) { - return; - } float sum1 = 0.0f; float sum2 = 0.0f; - for (int64_t j = threadIdx.x; j < effective_num_elements; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { - const int64_t index = i * effective_num_elements + j; - sum1 += static_cast(added_output_ptr[index]); - sum2 += static_cast(added_output_ptr[index]) * - static_cast(added_output_ptr[index]); - } - if (threadIdx.x < kCUDABlockReduceNumThreads) { - sum1 = BlockReduceSum( - sum1, m_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - sum2 = BlockReduceSum( - sum2, v_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + const int64_t bias_idx = index % attn_bias_dim; 
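+    // fused step: X = input + attention bias (broadcast via index % attn_bias_dim) + residual
+    // sum1/sum2 accumulate the per-row sum and sum of squares used below for mean and rstd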
+ X[index] = input_ptr[index] + attn_bias_ptr[bias_idx] + residual_ptr[index]; + sum1 += static_cast(X[index]); + sum2 += static_cast(X[index]) * static_cast(X[index]); } + + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); + if (threadIdx.x == 0) { - float const scale = float(1) / static_cast(effective_num_elements); + float const scale = float(1) / static_cast(N); sum1 *= scale; sum2 = max(sum2 * scale - sum1 * sum1, float(0)); mean[i] = static_cast(sum1); @@ -150,17 +133,15 @@ __global__ void LayerNormFusedForwardKernel(int attn_bias_dim, __syncthreads(); using T_ACC = T; - for (int64_t j = threadIdx.x; j < effective_num_elements; - j += min(blockDim.x, kCUDANumThreads)) { - const int64_t index = i * effective_num_elements + j; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; const T_ACC gamma_v = - gamma_ptr == nullptr ? T_ACC(1) : static_cast(gamma_ptr[j]); + gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); const T_ACC beta_v = - beta_ptr == nullptr ? T_ACC(0) : static_cast(beta_ptr[j]); - output_ptr[index] = (static_cast(added_output_ptr[index]) - - static_cast(mean[i])) * - static_cast(rstd[i]) * gamma_v + - beta_v; + beta == nullptr ? T_ACC(0) : static_cast(beta[j]); + Y[index] = (static_cast(X[index]) - static_cast(mean[i])) * + static_cast(rstd[i]) * gamma_v + + beta_v; } } @@ -178,57 +159,108 @@ void AddBiasResidualLayerNorm::inference_kernel( T const *gamma_ptr, T const *beta_ptr, hipStream_t stream) { - - std::pair kernel1_parallelism = std::make_pair( - GET_BLOCKS(residual_volume), std::min(residual_volume, CUDA_NUM_THREADS)); - std::pair kernel2_parallelism = - std::make_pair(m->effective_batch_size, kCUDABlockReduceNumThreads); - std::pair kernel3_parallelism = - std::make_pair(m->effective_batch_size, kCUDANumThreads); - - int num_blocks = std::max({kernel1_parallelism.first, - kernel2_parallelism.first, - kernel3_parallelism.first}); - int num_threads = std::max({kernel1_parallelism.second, - kernel2_parallelism.second, - kernel3_parallelism.second}); - hipLaunchKernelGGL(HIP_KERNEL_NAME(LayerNormFusedForwardKernel), - num_blocks, - num_threads, + m->effective_batch_size, + std::min(CUDA_NUM_THREADS, (int)m->effective_num_elements), 0, stream, - attn_bias_dim, - residual_volume, m->effective_num_elements, - m->effective_batch_size, + attn_bias_dim, m->eps, input_ptr, attn_bias_ptr, residual_ptr, added_output_ptr, - output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), gamma_ptr, beta_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr)); + output_ptr); } /*static*/ void AddBiasResidualLayerNorm::inference_kernel_wrapper( - AddBiasResidualLayerNormMeta const *m, - int attn_bias_dim, - int residual_volume, + AddBiasResidualLayerNormMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input, + GenericTensorAccessorR const &attn_bias, + GenericTensorAccessorR const &residual, GenericTensorAccessorW &added_output, GenericTensorAccessorW &output, - GenericTensorAccessorR const &residual, - GenericTensorAccessorR const &attn_bias, GenericTensorAccessorR const &gamma, GenericTensorAccessorR const &beta) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that 
requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + added_output.get_float_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + added_output.get_half_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + // inference kernel + int attn_bias_dim = attn_bias.domain.hi()[0] - attn_bias.domain.lo()[0] + 1; + int residual_volume = residual.domain.get_volume(); if (m->input_type[0] == DT_FLOAT) { AddBiasResidualLayerNorm::inference_kernel( m, @@ -239,8 +271,8 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( residual.get_float_ptr(), added_output.get_float_ptr(), output.get_float_ptr(), - gamma.get_float_ptr(), - m->use_bias ? beta.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_float_ptr() : nullptr, stream); } else if (m->input_type[0] == DT_HALF) { AddBiasResidualLayerNorm::inference_kernel( @@ -252,12 +284,566 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( residual.get_half_ptr(), added_output.get_half_ptr(), output.get_half_ptr(), - gamma.get_half_ptr(), - m->use_bias ? beta.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? 
beta.get_half_ptr() : nullptr, stream); } else { assert(false && "unsupport datatype in layernorm"); } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[AddBiasResidualLayerNorm] forward time (CF) = %.9fms\n", elapsed); + // if (m->input_type[0] == DT_FLOAT) { + // print_tensor(input.get_float_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:input]"); + // print_tensor(attn_bias.get_float_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:attn_bias]"); + // print_tensor(residual.get_float_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:residual]"); + // print_tensor(added_output.get_float_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:added_output]"); + // print_tensor(output.get_float_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:output]"); + // print_tensor(gamma.get_float_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:gamma]"); + // print_tensor( + // beta.get_float_ptr(), 32, + // "[AddBiasResidualLayerNorm:forward:beta]"); + // } else { + // print_tensor( + // input.get_half_ptr(), 32, + // "[AddBiasResidualLayerNorm:forward:input]"); + // print_tensor(attn_bias.get_half_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:attn_bias]"); + // print_tensor(residual.get_half_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:residual]"); + // print_tensor(added_output.get_half_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:added_output]"); + // print_tensor(output.get_half_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:output]"); + // print_tensor( + // gamma.get_half_ptr(), 32, + // "[AddBiasResidualLayerNorm:forward:gamma]"); + // print_tensor( + // beta.get_half_ptr(), 32, + // "[AddBiasResidualLayerNorm:forward:beta]"); + // } + // print_tensor(in_ptr, 32, "[AddBiasResidualLayerNorm:forward:input]"); + // print_tensor(out_ptr, 32, + // "[AddBiasResidualLayerNorm:forward:output]"); + } +} + +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T *ds, T *db) { + using T_ACC = T; + __shared__ T_ACC ds_shared[C10_WARP_SIZE]; + __shared__ T_ACC db_shared[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + const T_ACC gamma_v = + gamma == nullptr ? 
T_ACC(1) : static_cast(gamma[j]); + sum1 += + static_cast(dY[index]) * static_cast(X[index]) * gamma_v; + sum2 += static_cast(dY[index]) * gamma_v; + } + sum1 = BlockReduceSum(sum1, ds_shared); + sum2 = BlockReduceSum(sum2, db_shared); + if (threadIdx.x == 0) { + ds[i] = sum1; + db[i] = sum2; + } +} + +template +__global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, + int64_t N, + T const *mean, + T const *rstd, + T const *ds, + T const *db, + T *c1, + T *c2) { + using T_ACC = T; + const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < M) { + const T_ACC s = T_ACC(1) / static_cast((int)N); + const T_ACC a = (db[index] * static_cast(mean[index]) - ds[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * s; + c1[index] = a; + c2[index] = -(a * static_cast(mean[index]) + + db[index] * static_cast(rstd[index]) * s); + } +} + +template +__global__ void GammaBetaBackwardSimpleCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index]) * + (static_cast(X[index]) - + static_cast(mean[i])) * + static_cast(rstd[i]); + sum2 += db == nullptr ? T_ACC(0) : static_cast(dY[index]); + } + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } +} + +template +__global__ void GammaBetaBackwardCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + __shared__ T_ACC g_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + __shared__ T_ACC b_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + T_ACC dg_sum1 = 0; + T_ACC dg_sum2 = 0; + T_ACC db_sum1 = 0; + T_ACC db_sum2 = 0; + if (j < N) { + for (int64_t i = threadIdx.y; i < M; i += blockDim.y * 2) { + const int64_t i1 = i; + const int64_t i2 = i + blockDim.y; + const int64_t index1 = i1 * N + j; + const int64_t index2 = i2 * N + j; + dg_sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index1]) * + (static_cast(X[index1]) - + static_cast(mean[i1])) * + static_cast(rstd[i1]); + db_sum1 += db == nullptr ? T_ACC(0) : static_cast(dY[index1]); + if (i2 < M) { + dg_sum2 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index2]) * + (static_cast(X[index2]) - + static_cast(mean[i2])) * + static_cast(rstd[i2]); + db_sum2 += db == nullptr ? 
T_ACC(0) : static_cast(dY[index2]); + } + } + } + g_shared[threadIdx.y][threadIdx.x] = dg_sum1; + g_shared[threadIdx.y + blockDim.y][threadIdx.x] = dg_sum2; + b_shared[threadIdx.y][threadIdx.x] = db_sum1; + b_shared[threadIdx.y + blockDim.y][threadIdx.x] = db_sum2; + __syncthreads(); + T_ACC sum1 = g_shared[threadIdx.x][threadIdx.y]; + T_ACC sum2 = b_shared[threadIdx.x][threadIdx.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } + sum1 = g_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum2 = b_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y + blockDim.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } +} + +template +__device__ __inline__ void compute_gI(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual, + bool reset_input_grad, + bool reset_residual_grad, + int const N, + T *buf) { + auto const i1 = blockIdx.x; + const T mean_val = mean[i1]; + const T rstd_val = rstd[i1]; + T stats_x1{0}, stats_x2{0}; + constexpr int unroll = 4; + auto l = unroll * threadIdx.x; + T const *X_i = X + i1 * N; + T const *dY_i = dY + i1 * N; + T *dX_i = dX + i1 * N; + T *dX_residual_i = dX_residual + i1 * N; + // vectorized reads don't improve perf, so use regular unrolling + + for (; l + unroll - 1 < N; l += blockDim.x * unroll) { +#pragma unroll + for (int k = 0; k < unroll; k++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l + k]) : T(1); + const T c_h = static_cast(X_i[l + k]); + const T c_loss = static_cast(dY_i[l + k]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + } + for (; l < N; l++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + const T c_h = static_cast(X_i[l]); + const T c_loss = static_cast(dY_i[l]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + + stats_x1 = BlockReduceSum(stats_x1, buf); + stats_x2 = BlockReduceSum(stats_x2, buf); + if (threadIdx.x == 0) { + buf[0] = stats_x1; + buf[1] = stats_x2; + } + __syncthreads(); + stats_x1 = buf[0]; + stats_x2 = buf[1]; + T fH = N; + T term1 = (T(1) / fH) * rstd_val; + + for (int l = threadIdx.x; l < N; l += blockDim.x) { + const T x = X_i[l]; + const T dy = dY_i[l]; + T gamma_val = (gamma != nullptr) ? 
static_cast(gamma[l]) : T(1); + T f_grad_input = fH * gamma_val * dy; + f_grad_input -= (x - mean_val) * rstd_val * stats_x2; + f_grad_input -= stats_x1; + f_grad_input *= term1; + if (reset_input_grad) { + dX_i[l] = f_grad_input; + } else { + dX_i[l] += f_grad_input; + } + if (reset_residual_grad) { + dX_residual_i[l] = f_grad_input; + } else { + dX_residual_i[l] += f_grad_input; + } + } +} + +template +__global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual, + bool reset_input_grad, + bool reset_residual_grad, + int const N) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T *buf = reinterpret_cast(&s_data1); + + compute_gI(dY, + X, + mean, + rstd, + gamma, + dX, + dX_residual, + reset_input_grad, + reset_residual_grad, + N, + buf); +} + +/*static*/ +template +void AddBiasResidualLayerNorm::backward_kernel( + AddBiasResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T const *added_output_ptr, + T *input_grad_ptr, + T *residual_grad_ptr, + T *attn_bias_grad_ptr, + T const *gamma_ptr, + T *gamma_grad_ptr, + T *beta_grad_ptr, + hipStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeInternalGradientsCUDAKernel), + M, + kCUDABlockReduceNumThreads, + 0, + stream, + N, + output_grad_ptr, + added_output_ptr, + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); + const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; + hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeGradientFusedParamsCUDAKernel), + B, + kCUDANumThreads, + 0, + stream, + M, + N, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + hipLaunchKernelGGL(HIP_KERNEL_NAME(layer_norm_grad_input_kernel), + blocks, + num_threads, + nshared, + stream, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + N); + + if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { + if (M < 512) { + // For small batch size, do colwise reduce directly + const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; + hipLaunchKernelGGL(HIP_KERNEL_NAME(GammaBetaBackwardSimpleCUDAKernel), + B, + kCUDANumThreads, + 0, + stream, + M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } else { + const int64_t B = + (N + kColwiseReduceTileSize - 1) / kColwiseReduceTileSize; + constexpr int kThreadX = kColwiseReduceTileSize; + constexpr int kThreadY = kColwiseReduceTileSize / 2; + hipLaunchKernelGGL(HIP_KERNEL_NAME(GammaBetaBackwardCUDAKernel), + B, + dim3(kThreadX, kThreadY), + 0, + stream, + M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } + } +} + +/*static*/ +void AddBiasResidualLayerNorm::backward_kernel_wrapper( + AddBiasResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR &added_output, + GenericTensorAccessorW &input_grad, + 
GenericTensorAccessorW const &residual_grad, + GenericTensorAccessorW const &attn_bias_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + if (m->output_type[0] == DT_FLOAT) { + AddBiasResidualLayerNorm::backward_kernel( + m, + output_grad.get_float_ptr(), + added_output.get_float_ptr(), + input_grad.get_float_ptr(), + residual_grad.get_float_ptr(), + attn_bias_grad.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_float_ptr() + : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + AddBiasResidualLayerNorm::backward_kernel( + m, + output_grad.get_half_ptr(), + added_output.get_half_ptr(), + input_grad.get_half_ptr(), + residual_grad.get_half_ptr(), + attn_bias_grad.get_half_ptr(), + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_half_ptr() + : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[AddBiasResidualLayerNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +/*static*/ +template +void AddBiasResidualLayerNorm::peft_bwd_kernel( + AddBiasResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T *residual_grad_ptr, + T const *gamma_ptr, + hipStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + hipLaunchKernelGGL(HIP_KERNEL_NAME(layer_norm_grad_input_kernel), + blocks, + num_threads, + nshared, + stream, + output_grad_ptr, + static_cast(m->input_activation), + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + N); +} + +/*static*/ +void AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( + AddBiasResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW &input_grad, + GenericTensorAccessorW const &residual_grad, + GenericTensorAccessorR const &gamma) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + if (m->output_type[0] == DT_FLOAT) { + peft_bwd_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + residual_grad.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + peft_bwd_kernel(m, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + residual_grad.get_half_ptr(), + m->elementwise_affine ? 
gamma.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[AddBiasResidualLayerNorm] peft_bwd time (CF) = %.2fms\n", elapsed); + } } }; // namespace FlexFlow diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu index ceb1a6514e..bcca1ba2c6 100644 --- a/src/ops/add_bias_residual_layer_norm.cu +++ b/src/ops/add_bias_residual_layer_norm.cu @@ -22,12 +22,13 @@ namespace FlexFlow { #define C10_WARP_SIZE 32 constexpr int kCUDABlockReduceNumThreads = 512; constexpr int kCUDANumThreads = 256; +constexpr int kColwiseReduceTileSize = 32; AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta( FFHandler handle, AddBiasResidualLayerNorm const *ln, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ln) { elementwise_affine = ln->elementwise_affine; use_bias = ln->use_bias; effective_batch_size = ln->effective_batch_size; @@ -44,6 +45,7 @@ AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta( data_type_size(data_type) * effective_batch_size); bias_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * effective_batch_size); + allocated_peft_buffer_size = 0; } AddBiasResidualLayerNormMeta::~AddBiasResidualLayerNormMeta(void) { @@ -74,7 +76,7 @@ __inline__ __device__ T WarpReduceSum(T val) { } template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { +__inline__ __device__ T BlockReduceSum(T val, T *shared) { int const lid = threadIdx.x % C10_WARP_SIZE; int const wid = threadIdx.x / C10_WARP_SIZE; val = WarpReduceSum(val); @@ -83,9 +85,7 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { shared[wid] = val; } __syncthreads(); - val = (threadIdx.x < min(blockDim.x, max_num_threads) / C10_WARP_SIZE) - ? shared[lid] - : 0; + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? shared[lid] : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -110,20 +110,17 @@ __global__ void LayerNormFusedForwardKernel(int64_t N, const int64_t i = blockIdx.x; float sum1 = 0.0f; float sum2 = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const int64_t bias_idx = index % attn_bias_dim; X[index] = input_ptr[index] + attn_bias_ptr[bias_idx] + residual_ptr[index]; sum1 += static_cast(X[index]); sum2 += static_cast(X[index]) * static_cast(X[index]); } - if (threadIdx.x < kCUDABlockReduceNumThreads) { - sum1 = BlockReduceSum( - sum1, m_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - sum2 = BlockReduceSum( - sum2, v_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - } + + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); + if (threadIdx.x == 0) { float const scale = float(1) / static_cast(N); sum1 *= scale; @@ -135,7 +132,7 @@ __global__ void LayerNormFusedForwardKernel(int64_t N, __syncthreads(); using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const T_ACC gamma_v = gamma == nullptr ? 
T_ACC(1) : static_cast(gamma[j]); @@ -161,42 +158,33 @@ void AddBiasResidualLayerNorm::inference_kernel( T const *gamma_ptr, T const *beta_ptr, cudaStream_t stream) { - - std::pair kernel1_parallelism = - std::make_pair(m->effective_batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->effective_batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); - LayerNormFusedForwardKernel - <<>>(m->effective_num_elements, - attn_bias_dim, - m->eps, - input_ptr, - attn_bias_ptr, - residual_ptr, - added_output_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_ptr, - beta_ptr, - output_ptr); + <<effective_batch_size, + std::min(CUDA_NUM_THREADS, (int)m->effective_num_elements), + 0, + stream>>>(m->effective_num_elements, + attn_bias_dim, + m->eps, + input_ptr, + attn_bias_ptr, + residual_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + beta_ptr, + output_ptr); } /*static*/ void AddBiasResidualLayerNorm::inference_kernel_wrapper( - AddBiasResidualLayerNormMeta const *m, - int attn_bias_dim, - int residual_volume, + AddBiasResidualLayerNormMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input, + GenericTensorAccessorR const &attn_bias, + GenericTensorAccessorR const &residual, GenericTensorAccessorW &added_output, GenericTensorAccessorW &output, - GenericTensorAccessorR const &residual, - GenericTensorAccessorR const &attn_bias, GenericTensorAccessorR const &gamma, GenericTensorAccessorR const &beta) { cudaStream_t stream; @@ -208,6 +196,69 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + added_output.get_float_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { 
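// [Editor's note] Illustrative CPU reference for what the fused forward kernel in
// this hunk computes; it is not part of the patch and all names here (rows, cols,
// eps, ...) are hypothetical. Each row is X = input + attn_bias + residual (the
// attention bias is broadcast along the token dimension, assuming attn_bias_dim
// equals the hidden size), normalized by its own mean and reciprocal standard
// deviation, then scaled and shifted by gamma/beta.
#include <algorithm>
#include <cmath>
#include <vector>

static void add_bias_residual_layer_norm_ref(std::vector<float> const &input,
                                             std::vector<float> const &attn_bias,
                                             std::vector<float> const &residual,
                                             std::vector<float> const &gamma,
                                             std::vector<float> const &beta,
                                             std::vector<float> &added_output,
                                             std::vector<float> &output,
                                             int rows, int cols, float eps) {
  for (int i = 0; i < rows; i++) {
    float sum1 = 0.0f, sum2 = 0.0f;
    for (int j = 0; j < cols; j++) {
      // fused add: input + broadcast attention bias + residual
      float x = input[i * cols + j] + attn_bias[j] + residual[i * cols + j];
      added_output[i * cols + j] = x;
      sum1 += x;
      sum2 += x * x;
    }
    float mean = sum1 / cols;
    float var = std::max(sum2 / cols - mean * mean, 0.0f);
    float rstd = 1.0f / std::sqrt(var + eps);
    for (int j = 0; j < cols; j++) {
      output[i * cols + j] =
          (added_output[i * cols + j] - mean) * rstd * gamma[j] + beta[j];
    }
  }
}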
+ checkCUDA(cudaMemcpyAsync( + m->input_activation, + added_output.get_half_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + // inference kernel + int attn_bias_dim = attn_bias.domain.hi()[0] - attn_bias.domain.lo()[0] + 1; + int residual_volume = residual.domain.get_volume(); if (m->input_type[0] == DT_FLOAT) { AddBiasResidualLayerNorm::inference_kernel( m, @@ -297,4 +348,478 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( } } +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T *ds, T *db) { + using T_ACC = T; + __shared__ T_ACC ds_shared[C10_WARP_SIZE]; + __shared__ T_ACC db_shared[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + const T_ACC gamma_v = + gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); + sum1 += + static_cast(dY[index]) * static_cast(X[index]) * gamma_v; + sum2 += static_cast(dY[index]) * gamma_v; + } + sum1 = BlockReduceSum(sum1, ds_shared); + sum2 = BlockReduceSum(sum2, db_shared); + if (threadIdx.x == 0) { + ds[i] = sum1; + db[i] = sum2; + } +} + +template +__global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, + int64_t N, + T const *mean, + T const *rstd, + T const *ds, + T const *db, + T *c1, + T *c2) { + using T_ACC = T; + const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < M) { + const T_ACC s = T_ACC(1) / static_cast((int)N); + const T_ACC a = (db[index] * static_cast(mean[index]) - ds[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * s; + c1[index] = a; + c2[index] = -(a * static_cast(mean[index]) + + db[index] * static_cast(rstd[index]) * s); + } +} + +template +__global__ void GammaBetaBackwardSimpleCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index]) * + (static_cast(X[index]) - + static_cast(mean[i])) * + static_cast(rstd[i]); + sum2 += db == nullptr ? T_ACC(0) : static_cast(dY[index]); + } + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } +} + +template +__global__ void GammaBetaBackwardCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + __shared__ T_ACC g_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + __shared__ T_ACC b_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + T_ACC dg_sum1 = 0; + T_ACC dg_sum2 = 0; + T_ACC db_sum1 = 0; + T_ACC db_sum2 = 0; + if (j < N) { + for (int64_t i = threadIdx.y; i < M; i += blockDim.y * 2) { + const int64_t i1 = i; + const int64_t i2 = i + blockDim.y; + const int64_t index1 = i1 * N + j; + const int64_t index2 = i2 * N + j; + dg_sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index1]) * + (static_cast(X[index1]) - + static_cast(mean[i1])) * + static_cast(rstd[i1]); + db_sum1 += db == nullptr ? 
T_ACC(0) : static_cast(dY[index1]); + if (i2 < M) { + dg_sum2 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index2]) * + (static_cast(X[index2]) - + static_cast(mean[i2])) * + static_cast(rstd[i2]); + db_sum2 += db == nullptr ? T_ACC(0) : static_cast(dY[index2]); + } + } + } + g_shared[threadIdx.y][threadIdx.x] = dg_sum1; + g_shared[threadIdx.y + blockDim.y][threadIdx.x] = dg_sum2; + b_shared[threadIdx.y][threadIdx.x] = db_sum1; + b_shared[threadIdx.y + blockDim.y][threadIdx.x] = db_sum2; + __syncthreads(); + T_ACC sum1 = g_shared[threadIdx.x][threadIdx.y]; + T_ACC sum2 = b_shared[threadIdx.x][threadIdx.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } + sum1 = g_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum2 = b_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y + blockDim.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } +} + +template +__device__ __inline__ void compute_gI(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual, + bool reset_input_grad, + bool reset_residual_grad, + int const N, + T *buf) { + auto const i1 = blockIdx.x; + const T mean_val = mean[i1]; + const T rstd_val = rstd[i1]; + T stats_x1{0}, stats_x2{0}; + constexpr int unroll = 4; + auto l = unroll * threadIdx.x; + T const *X_i = X + i1 * N; + T const *dY_i = dY + i1 * N; + T *dX_i = dX + i1 * N; + T *dX_residual_i = dX_residual + i1 * N; + // vectorized reads don't improve perf, so use regular unrolling + + for (; l + unroll - 1 < N; l += blockDim.x * unroll) { +#pragma unroll + for (int k = 0; k < unroll; k++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l + k]) : T(1); + const T c_h = static_cast(X_i[l + k]); + const T c_loss = static_cast(dY_i[l + k]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + } + for (; l < N; l++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + const T c_h = static_cast(X_i[l]); + const T c_loss = static_cast(dY_i[l]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + + stats_x1 = BlockReduceSum(stats_x1, buf); + stats_x2 = BlockReduceSum(stats_x2, buf); + if (threadIdx.x == 0) { + buf[0] = stats_x1; + buf[1] = stats_x2; + } + __syncthreads(); + stats_x1 = buf[0]; + stats_x2 = buf[1]; + T fH = N; + T term1 = (T(1) / fH) * rstd_val; + + for (int l = threadIdx.x; l < N; l += blockDim.x) { + const T x = X_i[l]; + const T dy = dY_i[l]; + T gamma_val = (gamma != nullptr) ? 
static_cast(gamma[l]) : T(1); + T f_grad_input = fH * gamma_val * dy; + f_grad_input -= (x - mean_val) * rstd_val * stats_x2; + f_grad_input -= stats_x1; + f_grad_input *= term1; + if (reset_input_grad) { + dX_i[l] = f_grad_input; + } else { + dX_i[l] += f_grad_input; + } + if (reset_residual_grad) { + dX_residual_i[l] = f_grad_input; + } else { + dX_residual_i[l] += f_grad_input; + } + } +} + +template +__global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual, + bool reset_input_grad, + bool reset_residual_grad, + int const N) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T *buf = reinterpret_cast(&s_data1); + + compute_gI(dY, + X, + mean, + rstd, + gamma, + dX, + dX_residual, + reset_input_grad, + reset_residual_grad, + N, + buf); +} + +/*static*/ +template +void AddBiasResidualLayerNorm::backward_kernel( + AddBiasResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T const *added_output_ptr, + T *input_grad_ptr, + T *residual_grad_ptr, + T *attn_bias_grad_ptr, + T const *gamma_ptr, + T *gamma_grad_ptr, + T *beta_grad_ptr, + cudaStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + added_output_ptr, + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); + const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; + ComputeGradientFusedParamsCUDAKernel + <<>>(M, + N, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + N); + + if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { + if (M < 512) { + // For small batch size, do colwise reduce directly + const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; + GammaBetaBackwardSimpleCUDAKernel + <<>>(M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } else { + const int64_t B = + (N + kColwiseReduceTileSize - 1) / kColwiseReduceTileSize; + constexpr int kThreadX = kColwiseReduceTileSize; + constexpr int kThreadY = kColwiseReduceTileSize / 2; + GammaBetaBackwardCUDAKernel + <<>>( + M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } + } +} + +/*static*/ +void AddBiasResidualLayerNorm::backward_kernel_wrapper( + AddBiasResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR &added_output, + GenericTensorAccessorW &input_grad, + GenericTensorAccessorW const &residual_grad, + GenericTensorAccessorW const &attn_bias_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + 
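// [Editor's note] Host-side sketch of the per-row input gradient produced by
// layer_norm_grad_input_kernel / compute_gI above; illustrative only, not part
// of the patch. With xhat = (x - mean) * rstd, the gradient is
//   dX = (rstd / N) * (N * gamma * dY - sum(gamma * dY) - xhat * sum(gamma * dY * xhat))
// and the same value is either assigned to or accumulated into the residual
// gradient, mirroring reset_input_grad / reset_residual_grad in the kernel.
#include <vector>

static void layer_norm_grad_input_ref(std::vector<float> const &dY,
                                      std::vector<float> const &X,
                                      std::vector<float> const &gamma,
                                      float mean, float rstd,
                                      std::vector<float> &dX, int N) {
  float sum_gdy = 0.0f;       // stats_x1 in the kernel
  float sum_gdy_xhat = 0.0f;  // stats_x2 in the kernel
  for (int l = 0; l < N; l++) {
    float xhat = (X[l] - mean) * rstd;
    sum_gdy += gamma[l] * dY[l];
    sum_gdy_xhat += gamma[l] * dY[l] * xhat;
  }
  for (int l = 0; l < N; l++) {
    float xhat = (X[l] - mean) * rstd;
    dX[l] = (rstd / N) * (N * gamma[l] * dY[l] - sum_gdy - xhat * sum_gdy_xhat);
  }
}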
cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + if (m->output_type[0] == DT_FLOAT) { + AddBiasResidualLayerNorm::backward_kernel( + m, + output_grad.get_float_ptr(), + added_output.get_float_ptr(), + input_grad.get_float_ptr(), + residual_grad.get_float_ptr(), + attn_bias_grad.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_float_ptr() + : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + AddBiasResidualLayerNorm::backward_kernel( + m, + output_grad.get_half_ptr(), + added_output.get_half_ptr(), + input_grad.get_half_ptr(), + residual_grad.get_half_ptr(), + attn_bias_grad.get_half_ptr(), + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_half_ptr() + : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[AddBiasResidualLayerNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +/*static*/ +template +void AddBiasResidualLayerNorm::peft_bwd_kernel( + AddBiasResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T *residual_grad_ptr, + T const *gamma_ptr, + cudaStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + static_cast(m->input_activation), + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + N); +} + +/*static*/ +void AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( + AddBiasResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW &input_grad, + GenericTensorAccessorW const &residual_grad, + GenericTensorAccessorR const &gamma) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + if (m->output_type[0] == DT_FLOAT) { + peft_bwd_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + residual_grad.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + peft_bwd_kernel(m, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + residual_grad.get_half_ptr(), + m->elementwise_affine ? 
gamma.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[AddBiasResidualLayerNorm] peft_bwd time (CF) = %.2fms\n", elapsed); + } +} + }; // namespace FlexFlow diff --git a/src/ops/aggregate.cc b/src/ops/aggregate.cc index 5f05458e34..c83b738a0e 100644 --- a/src/ops/aggregate.cc +++ b/src/ops/aggregate.cc @@ -85,7 +85,7 @@ AggregateParams Aggregate::get_params() const { AggregateParams params; params.n = this->n; params.lambda_bal = this->lambda_bal; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -242,7 +242,7 @@ OpMeta *Aggregate::init_task(Task const *task, Runtime *runtime) { Aggregate *agg = (Aggregate *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - AggregateMeta *m = new AggregateMeta(handle, agg->n); + AggregateMeta *m = new AggregateMeta(handle, agg); m->profiling = agg->profiling; m->inference_debugging = agg->inference_debugging; std::strcpy(m->op_name, agg->name); @@ -603,7 +603,7 @@ bool Aggregate::measure_operator_cost(Simulator *sim, return false; } - AggregateMeta *m = new AggregateMeta(sim->handler, n); + AggregateMeta *m = new AggregateMeta(sim->handler, this); // allocate sim->free_all(); diff --git a/src/ops/aggregate.cpp b/src/ops/aggregate.cpp index d5ebdb0c22..5a508cfac4 100644 --- a/src/ops/aggregate.cpp +++ b/src/ops/aggregate.cpp @@ -281,13 +281,14 @@ void Aggregate::backward_kernel_wrapper(AggregateMeta const *m, out_dim); } -AggregateMeta::AggregateMeta(FFHandler handler, int n) : OpMeta(handler) { - checkCUDA(hipMalloc(&dev_exp_preds, n * sizeof(float *))); - checkCUDA(hipMalloc(&dev_exp_grads, n * sizeof(float *))); +AggregateMeta::AggregateMeta(FFHandler handler, Aggregate const *aggr) + : OpMeta(handler, aggr) { + checkCUDA(hipMalloc(&dev_exp_preds, aggr->n * sizeof(float *))); + checkCUDA(hipMalloc(&dev_exp_grads, aggr->n * sizeof(float *))); } AggregateMeta::~AggregateMeta(void) { checkCUDA(hipFree(&dev_exp_preds)); checkCUDA(hipFree(&dev_exp_grads)); } -}; // namespace FlexFlow \ No newline at end of file +}; // namespace FlexFlow diff --git a/src/ops/aggregate.cu b/src/ops/aggregate.cu index 38e141b252..9704302092 100644 --- a/src/ops/aggregate.cu +++ b/src/ops/aggregate.cu @@ -307,9 +307,10 @@ void Aggregate::backward_kernel_wrapper(AggregateMeta const *m, } } -AggregateMeta::AggregateMeta(FFHandler handler, int n) : OpMeta(handler) { - checkCUDA(cudaMalloc(&dev_exp_preds, n * sizeof(float *))); - checkCUDA(cudaMalloc(&dev_exp_grads, n * sizeof(float *))); +AggregateMeta::AggregateMeta(FFHandler handler, Aggregate const *aggr) + : OpMeta(handler, aggr) { + checkCUDA(cudaMalloc(&dev_exp_preds, aggr->n * sizeof(float *))); + checkCUDA(cudaMalloc(&dev_exp_grads, aggr->n * sizeof(float *))); } AggregateMeta::~AggregateMeta(void) { checkCUDA(cudaFree(&dev_exp_preds)); diff --git a/src/ops/aggregate_spec.cc b/src/ops/aggregate_spec.cc index 1edd430881..6ea3ff3747 100644 --- a/src/ops/aggregate_spec.cc +++ b/src/ops/aggregate_spec.cc @@ -84,7 +84,7 @@ AggregateSpecParams AggregateSpec::get_params() const { AggregateSpecParams params; params.n = this->n; params.lambda_bal = this->lambda_bal; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { 
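// [Editor's note] Minimal sketch of the name-copy guard this patch applies to
// each operator's get_params(): the fixed-size params.name buffer is written
// only when the operator name fits within MAX_OPNAME, replacing the old
// `this->name != nullptr` check on an array member that can never be null.
// Not part of the patch; the struct and the MAX_OPNAME value are schematic.
#include <cstring>

struct OpParamsSketch {
  static constexpr int MAX_OPNAME = 128; // assumed buffer size, for illustration
  char name[MAX_OPNAME];
};

static void copy_op_name(OpParamsSketch &params, char const *op_name) {
  if (std::strlen(op_name) < OpParamsSketch::MAX_OPNAME) {
    std::strcpy(params.name, op_name); // fits, including the terminating '\0'
  } else {
    params.name[0] = '\0'; // too long: leave the params name empty
  }
}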
strcpy(params.name, this->name); } return params; @@ -210,7 +210,7 @@ OpMeta *AggregateSpec::init_task(Task const *task, Runtime *runtime) { AggregateSpec *agg = (AggregateSpec *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - AggregateSpecMeta *m = new AggregateSpecMeta(handle, agg->n); + AggregateSpecMeta *m = new AggregateSpecMeta(handle, agg); m->profiling = agg->profiling; m->inference_debugging = agg->inference_debugging; std::strcpy(m->op_name, agg->name); @@ -543,7 +543,7 @@ bool AggregateSpec::measure_operator_cost(Simulator *sim, return false; } - AggregateSpecMeta *m = new AggregateSpecMeta(sim->handler, n); + AggregateSpecMeta *m = new AggregateSpecMeta(sim->handler, this); // allocate sim->free_all(); diff --git a/src/ops/aggregate_spec.cpp b/src/ops/aggregate_spec.cpp index 314e20a59c..a676fa81c3 100644 --- a/src/ops/aggregate_spec.cpp +++ b/src/ops/aggregate_spec.cpp @@ -290,9 +290,10 @@ void AggregateSpec::backward_kernel_wrapper(AggregateSpecMeta const *m, out_dim); } -AggregateSpecMeta::AggregateSpecMeta(FFHandler handler, int n) - : OpMeta(handler) { - checkCUDA(hipMalloc(&dev_region_ptrs, n * sizeof(float *))); +AggregateSpecMeta::AggregateSpecMeta(FFHandler handler, + AggregateSpec const *aggr) + : OpMeta(handler, aggr) { + checkCUDA(hipMalloc(&dev_region_ptrs, aggr->n * sizeof(float *))); } AggregateSpecMeta::~AggregateSpecMeta(void) { checkCUDA(hipFree(&dev_region_ptrs)); diff --git a/src/ops/aggregate_spec.cu b/src/ops/aggregate_spec.cu index 8d50d45d21..ac5a372efc 100644 --- a/src/ops/aggregate_spec.cu +++ b/src/ops/aggregate_spec.cu @@ -287,9 +287,10 @@ void AggregateSpec::backward_kernel_wrapper(AggregateSpecMeta const *m, out_dim); } -AggregateSpecMeta::AggregateSpecMeta(FFHandler handler, int n) - : OpMeta(handler) { - checkCUDA(cudaMalloc(&dev_region_ptrs, n * sizeof(float *))); +AggregateSpecMeta::AggregateSpecMeta(FFHandler handler, + AggregateSpec const *aggr) + : OpMeta(handler, aggr) { + checkCUDA(cudaMalloc(&dev_region_ptrs, aggr->n * sizeof(float *))); } AggregateSpecMeta::~AggregateSpecMeta(void) { checkCUDA(cudaFree(&dev_region_ptrs)); diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc index 780a77450e..534bac2419 100644 --- a/src/ops/arg_topk.cc +++ b/src/ops/arg_topk.cc @@ -112,7 +112,7 @@ ArgTopKParams ArgTopK::get_params() const { params.k = this->k; params.sorted = this->sorted; params.speculative_decoding = this->speculative_decoding; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -387,7 +387,7 @@ InferenceResult DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorW probs; - int batch_size = bc->num_active_tokens(); + int batch_size = bc->num_active_infr_tokens(); ArgTopK::forward_kernel_wrapper( m, input, probs, indices, batch_size, nullptr); @@ -399,7 +399,7 @@ InferenceResult } InferenceResult ir; - download_tensor( + copy_tensor_dev_to_host( indices.get_int32_ptr(), ir.token_ids, batch_size); return ir; } @@ -431,9 +431,10 @@ BeamInferenceResult ArgTopK::inference_speculative_task( ArgTopK::forward_kernel_wrapper(m, input, probs, indices, batch_size, &bc); BeamInferenceResult ir; - download_tensor( + copy_tensor_dev_to_host( indices.get_int32_ptr(), ir.token_ids, batch_size * m->k); - download_tensor(probs.get_float_ptr(), ir.probs, batch_size * m->k); + copy_tensor_dev_to_host( + probs.get_float_ptr(), ir.probs, batch_size * m->k); return ir; } diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index 
1892ac2353..4123e50e7e 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -91,7 +91,7 @@ Op *ArgMax::create_operator_from_layer( ArgMaxParams ArgMax::get_params() const { ArgMaxParams params; params.beam_search = this->beam_search; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -314,7 +314,7 @@ FutureMap ArgMax::inference(FFModel const &ff, launcher.add_future(bc); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, - READ_WRITE, + READ_ONLY, EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(0, FID_DATA); @@ -348,15 +348,18 @@ BeamInferenceResult m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); - int batch_size = bc->num_active_tokens(); + int batch_size = bc->num_active_infr_tokens(); GenericTensorAccessorW parent = helperGetGenericTensorAccessorWO( DT_INT32, regions[2], task->regions[2], FID_DATA, ctx, runtime); - ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size); + float loss = 0.0f; + ArgMax::forward_kernel_wrapper( + m, bc, input, indices, parent, batch_size, &loss); BeamInferenceResult ir; - download_tensor( + copy_tensor_dev_to_host( indices.get_int32_ptr(), ir.token_ids, batch_size); - download_tensor(m->probs, ir.probs, batch_size); - download_tensor(parent.get_int32_ptr(), ir.parent_id, batch_size); + copy_tensor_dev_to_host(m->probs, ir.probs, batch_size); + copy_tensor_dev_to_host( + parent.get_int32_ptr(), ir.parent_id, batch_size); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); @@ -383,23 +386,36 @@ InferenceResult return ir; } - GenericTensorAccessorW input = helperGetGenericTensorAccessorRW( + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorW parent; - int batch_size = bc->num_active_tokens(); - ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size); + int batch_size = bc->num_active_infr_tokens(); + float loss = 0.0f; + + ArgMax::forward_kernel_wrapper( + m, bc, input, indices, parent, batch_size, &loss); + InferenceResult ir; + ir.finetuning_loss = loss; + + if (bc->num_active_peft_tokens() > 0) { + printf("Loss: %.4f\n", loss); + } + if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; ArgMax::save_inference_tensors_to_file( - m, shard_id, bc, {}, {}, {input, indices}); + m, shard_id, bc, {input}, {}, {indices}); + } else { + m->decoding_step++; } - download_tensor( + copy_tensor_dev_to_host( indices.get_int32_ptr(), ir.token_ids, batch_size); + return ir; } @@ -453,4 +469,4 @@ size_t hash::operator()( hash_combine(key, params.beam_search); return key; } -}; // namespace std \ No newline at end of file +}; // namespace std diff --git a/src/ops/argmax.cpp b/src/ops/argmax.cpp index 8a1cf0b3b0..60d44cdf2b 100644 --- a/src/ops/argmax.cpp +++ b/src/ops/argmax.cpp @@ -334,6 +334,21 @@ __device__ void mergeShards(int num_shards, } } +template +__global__ void compute_sparse_categorical_crossentropy_loss( + DT const *logits, + BatchConfig::TokenId const *labels, + float *loss, + int num_tokens, + int num_classes) { + float const 
LOG_MIN_VALUE = 0.00000001f; + CUDA_KERNEL_LOOP(b, num_tokens) { + float my_logit = + max((float)logits[b * num_classes + labels[b]], LOG_MIN_VALUE); + atomicAdd(loss, -log(my_logit)); + } +} + template __global__ void argmax_forward_kernel(T const *__restrict__ input, size_t shared_memory_size, @@ -381,14 +396,16 @@ __global__ void copy_result(hipcub::KeyValuePair *d_out, /*static*/ template void ArgMax::forward_kernel(ArgMaxMeta const *m, - DT *input_ptr, + BatchConfig const *bc, + DT const *input_ptr, int *indices_ptr, float *prob_ptr, int *parent, int const length, int const batch_size, + float *loss, hipStream_t stream) { - checkCUDA(get_legion_stream(&stream)); + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); if (m->beam_search) { @@ -425,28 +442,77 @@ void ArgMax::forward_kernel(ArgMaxMeta const *m, k, prob_ptr, indices_ptr); + + // compute cross-entropy loss if there is a finetuning request + assert(loss != nullptr); + BatchConfig::TokenId token_ids[BatchConfig::MAX_NUM_TOKENS]; + int num_finetuning_requests = 0, num_bwd_tokens = 0; + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_bwd) { + assert(num_finetuning_requests == 0 && num_bwd_tokens == 0); + num_bwd_tokens = bc->requestsInfo[i].num_tokens_in_batch - 1; + // shift labels by 1 position to the left (ignore first token label) + for (int j = 0; j < num_bwd_tokens; j++) { + token_ids[j] = + bc->tokensInfo[j + tokens_previous_requests + 1].token_id; + } + num_finetuning_requests += 1; + } else { + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + } + } + assert(num_finetuning_requests <= 1); + if (num_bwd_tokens > 0) { + checkCUDA(hipMemcpyAsync(m->handle.workSpace, + token_ids, + sizeof(BatchConfig::TokenId) * num_bwd_tokens, + hipMemcpyHostToDevice, + stream)); + // copy loss to d_loss + checkCUDA(hipMemsetAsync(m->d_loss, 0, sizeof(float), stream)); + compute_sparse_categorical_crossentropy_loss<<>>( + input_ptr, + static_cast(m->handle.workSpace), + m->d_loss, + num_bwd_tokens, + length); + // copy value from d_loss to loss + checkCUDA(hipMemcpyAsync( + loss, m->d_loss, sizeof(float), hipMemcpyDeviceToHost, stream)); + *loss = *loss / (float)num_bwd_tokens; + } } /*static*/ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, - GenericTensorAccessorW const &input, + BatchConfig const *bc, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &indices, GenericTensorAccessorW const &parent, - int batch_size) { + int batch_size, + float *loss) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - hipEvent_t t_start, t_end; if (m->profiling) { checkCUDA(hipEventCreate(&t_start)); checkCUDA(hipEventCreate(&t_end)); checkCUDA(hipEventRecord(t_start, stream)); } - int length = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (input.data_type == DT_HALF) { ArgMax::forward_kernel(m, + bc, input.get_half_ptr(), indices.get_int32_ptr(), m->probs, @@ -454,10 +520,12 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, : nullptr, length, batch_size, + loss, stream); } else if (input.data_type == DT_FLOAT) { ArgMax::forward_kernel(m, + bc, input.get_float_ptr(), indices.get_int32_ptr(), m->probs, @@ -465,6 +533,7 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, : nullptr, length, batch_size, + loss, stream); } else { assert(false && "Unsupported data type"); diff --git a/src/ops/argmax.cu b/src/ops/argmax.cu index 
05c84719c1..8a2e2da2d0 100644 --- a/src/ops/argmax.cu +++ b/src/ops/argmax.cu @@ -44,19 +44,35 @@ __global__ void copy_result(cub::KeyValuePair *d_out, } } +template +__global__ void compute_sparse_categorical_crossentropy_loss( + DT const *logits, + BatchConfig::TokenId const *labels, + float *loss, + int num_tokens, + int num_classes) { + float const LOG_MIN_VALUE = 0.00000001f; + CUDA_KERNEL_LOOP(b, num_tokens) { + float my_logit = + max((float)logits[b * num_classes + labels[b]], LOG_MIN_VALUE); + atomicAdd(loss, -log(my_logit)); + } +} + /*static*/ template void ArgMax::forward_kernel(ArgMaxMeta const *m, - DT *input_ptr, + BatchConfig const *bc, + DT const *input_ptr, int *indices_ptr, float *prob_ptr, int *parent, int const length, int const batch_size, + float *loss, cudaStream_t stream) { - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - DT alpha = 1.0f, beta = 0.0f; + if (m->beam_search) { // set all parents id zero in arg top1 case. checkCUDA(cudaMemsetAsync(parent, 0, batch_size * sizeof(int), stream)); @@ -73,7 +89,7 @@ void ArgMax::forward_kernel(ArgMaxMeta const *m, m->d_offsets + 1, stream)); - // copy dout to incides + // copy dout to indices int parallelism = batch_size; copy_result<<beam_search); // print_tensor(indices_ptr, 32, "argmax op"); + + // compute cross-entropy loss if there is a finetuning request + assert(loss != nullptr); + BatchConfig::TokenId token_ids[BatchConfig::MAX_NUM_TOKENS]; + int num_finetuning_requests = 0, num_bwd_tokens = 0; + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_bwd) { + assert(num_finetuning_requests == 0 && num_bwd_tokens == 0); + num_bwd_tokens = bc->requestsInfo[i].num_tokens_in_batch - 1; + // shift labels by 1 position to the left (ignore first token label) + for (int j = 0; j < num_bwd_tokens; j++) { + token_ids[j] = + bc->tokensInfo[j + tokens_previous_requests + 1].token_id; + } + num_finetuning_requests += 1; + } else { + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + } + } + assert(num_finetuning_requests <= 1); + if (num_bwd_tokens > 0) { + checkCUDA(cudaMemcpyAsync(m->handle.workSpace, + token_ids, + sizeof(BatchConfig::TokenId) * num_bwd_tokens, + cudaMemcpyHostToDevice, + stream)); + // copy loss to d_loss + checkCUDA(cudaMemsetAsync(m->d_loss, 0, sizeof(float), stream)); + compute_sparse_categorical_crossentropy_loss<<>>( + input_ptr, + static_cast(m->handle.workSpace), + m->d_loss, + num_bwd_tokens, + length); + // copy value from d_loss to loss + checkCUDA(cudaMemcpyAsync( + loss, m->d_loss, sizeof(float), cudaMemcpyDeviceToHost, stream)); + *loss = *loss / (float)num_bwd_tokens; + } } /*static*/ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, - GenericTensorAccessorW const &input, + BatchConfig const *bc, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &indices, GenericTensorAccessorW const &parent, - int batch_size) { + int batch_size, + float *loss) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); cudaEvent_t t_start, t_end; @@ -104,6 +170,7 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, if (input.data_type == DT_HALF) { ArgMax::forward_kernel(m, + bc, input.get_half_ptr(), indices.get_int32_ptr(), m->probs, @@ -111,10 +178,12 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, : nullptr, length, batch_size, + loss, stream); } else if (input.data_type == DT_FLOAT) { 
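// [Editor's note] Host-side sketch of the finetuning loss added above:
// labels are the request's token ids shifted left by one position, each value
// is clamped to a small minimum before the log (the kernel parameter is named
// `logits` but the log is applied directly, so it is treated here as a
// probability), and the summed negative log-likelihood is averaged over the
// backward tokens, as done on the host after the kernel. Illustrative only.
#include <algorithm>
#include <cmath>
#include <vector>

static float sparse_categorical_crossentropy_ref(
    std::vector<float> const &probs, // num_tokens x num_classes, rows sum to 1
    std::vector<int> const &labels,  // already shifted left by one token
    int num_tokens, int num_classes) {
  float const kLogMinValue = 1e-8f;  // mirrors LOG_MIN_VALUE in the kernel
  float loss = 0.0f;
  for (int b = 0; b < num_tokens; b++) {
    float p = std::max(probs[b * num_classes + labels[b]], kLogMinValue);
    loss += -std::log(p);
  }
  return num_tokens > 0 ? loss / num_tokens : 0.0f;
}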
ArgMax::forward_kernel(m, + bc, input.get_float_ptr(), indices.get_int32_ptr(), m->probs, @@ -122,6 +191,7 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, : nullptr, length, batch_size, + loss, stream); } else { assert(false && "Unsupported data type"); @@ -202,6 +272,10 @@ ArgMaxMeta::ArgMaxMeta(FFHandler handler, gpu_mem_allocator.create_legion_instance(reserveInst, temp_storage_bytes); d_temp_storage = gpu_mem_allocator.allocate_instance_untyped(temp_storage_bytes); + + // allocate space for loss on device + gpu_mem_allocator.create_legion_instance(reserveInst, sizeof(float)); + d_loss = gpu_mem_allocator.allocate_instance(1); } ArgMaxMeta::~ArgMaxMeta(void) { diff --git a/src/ops/attention.cc b/src/ops/attention.cc index 203662d3ec..aef4f0a16a 100644 --- a/src/ops/attention.cc +++ b/src/ops/attention.cc @@ -1010,7 +1010,7 @@ MultiHeadAttentionParams MultiHeadAttention::get_params() const { params.bias = this->bias; params.add_bias_kv = this->add_bias_kv; params.add_zero_attn = this->add_zero_attn; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; diff --git a/src/ops/attention.cpp b/src/ops/attention.cpp index ee7f87a7fb..10655a4a1a 100644 --- a/src/ops/attention.cpp +++ b/src/ops/attention.cpp @@ -156,7 +156,7 @@ MultiHeadAttentionMeta::MultiHeadAttentionMeta(FFHandler handler, Memory gpu_mem, int num_samples, int num_heads) - : OpMeta(handler) { + : OpMeta(handler, attn) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(miopenSetStream(handler.dnn, stream)); diff --git a/src/ops/attention.cu b/src/ops/attention.cu index 18fc810aed..4c460cdbbf 100644 --- a/src/ops/attention.cu +++ b/src/ops/attention.cu @@ -194,7 +194,7 @@ MultiHeadAttentionMeta::MultiHeadAttentionMeta(FFHandler handler, Memory gpu_mem, int num_samples, int num_heads) - : OpMeta(handler) { + : OpMeta(handler, attn) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(cudnnSetStream(handler.dnn, stream)); diff --git a/src/ops/batch_matmul.cc b/src/ops/batch_matmul.cc index e13169f6c1..e5f0611fb0 100644 --- a/src/ops/batch_matmul.cc +++ b/src/ops/batch_matmul.cc @@ -279,7 +279,7 @@ OpMeta *BatchMatmul::init_task(Task const *task, Runtime *runtime) { BatchMatmul const *bmm = (BatchMatmul *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - BatchMatmulMeta *m = new BatchMatmulMeta(handle); + BatchMatmulMeta *m = new BatchMatmulMeta(handle, bmm); m->profiling = bmm->profiling; m->inference_debugging = bmm->inference_debugging; m->a_seq_length_dim = bmm->a_seq_length_dim; @@ -616,7 +616,7 @@ bool BatchMatmul::measure_operator_cost(Simulator *sim, batch *= sub_input0.dims[i].size; } - BatchMatmulMeta *meta = sim->batch_matmul_meta; + BatchMatmulMeta *meta = new BatchMatmulMeta(sim->handler, this); // allocate tensors in simulator sim->free_all(); diff --git a/src/ops/batch_norm.cpp b/src/ops/batch_norm.cpp index 7dee6fdaaf..5856f1dddf 100644 --- a/src/ops/batch_norm.cpp +++ b/src/ops/batch_norm.cpp @@ -284,7 +284,7 @@ BatchNormMeta::BatchNormMeta(FFHandler handler, int output_c, int output_h, int output_w) - : OpMeta(handler) { + : OpMeta(handler, bn) { checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&biasTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); diff --git a/src/ops/batch_norm.cu b/src/ops/batch_norm.cu index 929ebf81f8..01e993067a 100644 --- a/src/ops/batch_norm.cu +++ b/src/ops/batch_norm.cu @@ -270,7 
+270,7 @@ BatchNormMeta::BatchNormMeta(FFHandler handler, int output_c, int output_h, int output_w) - : OpMeta(handler) { + : OpMeta(handler, bn) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&biasTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc index 5f4547ace5..36cc7fd8fa 100644 --- a/src/ops/beam_topk.cc +++ b/src/ops/beam_topk.cc @@ -375,7 +375,7 @@ BeamInferenceResult // embedding size: eg. 4096 int length = input_domain.hi()[0] - input_domain.lo()[0] + 1; // total token nums - size_t batch_size = bc.num_active_tokens(); + size_t batch_size = bc.num_active_infr_tokens(); // need meta for: how many sub requests in a main request BeamTopK::forward_kernel_wrapper(m, @@ -390,9 +390,11 @@ BeamInferenceResult BeamInferenceResult ir; - download_tensor(index_ptr, ir.token_ids, batch_size * m->max_beam_width); - download_tensor(value_ptr, ir.probs, batch_size * m->max_beam_width); - download_tensor( + copy_tensor_dev_to_host( + index_ptr, ir.token_ids, batch_size * m->max_beam_width); + copy_tensor_dev_to_host( + value_ptr, ir.probs, batch_size * m->max_beam_width); + copy_tensor_dev_to_host( parent_ptr, ir.parent_id, batch_size * m->max_beam_width); if (m->inference_debugging) { diff --git a/src/ops/beam_topk.cpp b/src/ops/beam_topk.cpp index 8545bea7cb..5d80707ea7 100644 --- a/src/ops/beam_topk.cpp +++ b/src/ops/beam_topk.cpp @@ -681,7 +681,7 @@ void BeamTopK::forward_kernel_wrapper(BeamTopKMeta const *m, BeamTopKMeta::BeamTopKMeta(FFHandler handler, Op const *op, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handler) { + : OpMeta(handler, op) { DataType data_type = op->inputs[0]->data_type; int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); int max_requests_per_batch = BatchConfig::max_requests_per_batch(); diff --git a/src/ops/beam_topk.cu b/src/ops/beam_topk.cu index c24bdf7c74..bf4c23cad0 100644 --- a/src/ops/beam_topk.cu +++ b/src/ops/beam_topk.cu @@ -723,7 +723,7 @@ void BeamTopK::forward_kernel_wrapper(BeamTopKMeta const *m, BeamTopKMeta::BeamTopKMeta(FFHandler handler, Op const *op, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handler) { + : OpMeta(handler, op) { DataType data_type = op->inputs[0]->data_type; int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); int max_requests_per_batch = BatchConfig::max_requests_per_batch(); diff --git a/src/ops/cache.cc b/src/ops/cache.cc index 691e45b559..33b862ae85 100644 --- a/src/ops/cache.cc +++ b/src/ops/cache.cc @@ -165,7 +165,7 @@ OpMeta *Cache::init_task(Task const *task, Runtime *runtime) { Cache *c = (Cache *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - CacheMeta *m = new CacheMeta(handle); + CacheMeta *m = new CacheMeta(handle, c); m->cache_score = 0.0f; m->profiling = c->profiling; m->inference_debugging = c->inference_debugging; diff --git a/src/ops/cache.cpp b/src/ops/cache.cpp index 95c5995f9e..a9512c2c59 100644 --- a/src/ops/cache.cpp +++ b/src/ops/cache.cpp @@ -75,7 +75,7 @@ float Cache::cache_update(Task const *task, return cache_score; } -CacheMeta::CacheMeta(FFHandler handler) : OpMeta(handler) {} +CacheMeta::CacheMeta(FFHandler handler, Cache const *c) : OpMeta(handler, c) {} template void Cache::cache_forward(Task const *task, diff --git a/src/ops/cache.cu b/src/ops/cache.cu index a113e57a1c..2f95e59669 100644 --- a/src/ops/cache.cu +++ b/src/ops/cache.cu @@ -74,7 +74,7 @@ float Cache::cache_update(Task const *task, return cache_score; } 
-CacheMeta::CacheMeta(FFHandler handler) : OpMeta(handler) {} +CacheMeta::CacheMeta(FFHandler handler, Cache const *c) : OpMeta(handler, c) {} template void Cache::cache_forward(Task const *task, diff --git a/src/ops/cast.cc b/src/ops/cast.cc index e514236a31..4a52bf874e 100644 --- a/src/ops/cast.cc +++ b/src/ops/cast.cc @@ -190,7 +190,7 @@ OpMeta *Cast::init_task(Task const *task, Runtime *runtime) { Cast *cast = (Cast *)task->args; FFHandler handler = *((FFHandler const *)task->local_args); - CastMeta *m = new CastMeta(handler); + CastMeta *m = new CastMeta(handler, cast); m->input_data_type = cast->inputs[0]->data_type; m->output_data_type = cast->outputs[0]->data_type; std::strcpy(m->op_name, cast->name); diff --git a/src/ops/concat.cc b/src/ops/concat.cc index d4d8e525fc..0a82779b6d 100644 --- a/src/ops/concat.cc +++ b/src/ops/concat.cc @@ -197,7 +197,7 @@ OpMeta *Concat::init_task(Task const *task, Runtime *runtime) { Concat *cc = (Concat *)task->args; FFHandler handler = *((FFHandler const *)task->local_args); - ConcatMeta *m = new ConcatMeta(handler); + ConcatMeta *m = new ConcatMeta(handler, cc); // Note that our internal axis index ordering is opposite to other frameworks init_meta(m, cc->legion_axis); m->profiling = cc->profiling; @@ -365,7 +365,7 @@ bool Concat::measure_operator_cost(Simulator *sim, } } - ConcatMeta *m = sim->concat_meta; + ConcatMeta *m = new ConcatMeta(sim->handler, this); init_meta(m, this->legion_axis); sim->free_all(); diff --git a/src/ops/conv_2d.cc b/src/ops/conv_2d.cc index 94850a178d..2428c9b99a 100644 --- a/src/ops/conv_2d.cc +++ b/src/ops/conv_2d.cc @@ -588,12 +588,13 @@ OpMeta *Conv2D::init_task(Task const *task, // regions[4], task->regions[4], FID_DATA, ctx, runtime, // false/*readOutput*/); - Conv2DMeta *m = new Conv2DMeta(handle); + Conv2DMeta *m = new Conv2DMeta(handle, conv); m->relu = conv->activation == AC_MODE_RELU; m->use_bias = conv->use_bias; m->profiling = conv->profiling; m->inference_debugging = conv->inference_debugging; - m->trainableInputs[0] = conv->trainableInputs[0]; + m->trainable_inputs[0] = conv->trainable_inputs[0]; + m->reset_input_grads[0] = conv->trainable_inputs[0]; std::strcpy(m->op_name, conv->name); m->layer_guid = conv->layer_guid; @@ -753,7 +754,7 @@ void Conv2D::backward(FFModel const &ff) { inputs[0]->region)); launcher.add_field(rid++, FID_DATA); // regions[1](I/O): input_grad - if (trainableInputs[0]) { + if (trainable_inputs[0]) { launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, 0 /*projection id*/, READ_WRITE, @@ -803,7 +804,7 @@ void Conv2D::backward(FFModel const &ff) { /* region(I): input - region(I/O): input_grad (if trainableInputs[0]) + region(I/O): input_grad (if trainable_inputs[0]) region(I): output region(I/O): output_grad region(I): filter @@ -816,17 +817,17 @@ void Conv2D::backward_task(Task const *task, Runtime *runtime) { // Conv2D* conv = (Conv2D*) task->args; Conv2DMeta const *m = *((Conv2DMeta **)task->local_args); - assert(regions.size() == (5 + static_cast(m->trainableInputs[0]) + + assert(regions.size() == (5 + static_cast(m->trainable_inputs[0]) + static_cast(m->use_bias))); assert(task->regions.size() == - (5 + static_cast(m->trainableInputs[0]) + + (5 + static_cast(m->trainable_inputs[0]) + static_cast(m->use_bias))); size_t rid = 0; TensorAccessorR acc_input( regions[rid], task->regions[rid], FID_DATA, ctx, runtime); rid++; float *acc_input_grad_ptr = NULL; - if (m->trainableInputs[0]) { + if (m->trainable_inputs[0]) { TensorAccessorW acc_input_grad( regions[rid], 
task->regions[rid], @@ -1119,7 +1120,7 @@ bool Conv2D::measure_operator_cost(Simulator *sim, int pad_h = ((output_h - 1) * stride_h + kernel_h - input_h + 1) / 2; int pad_w = ((output_w - 1) * stride_w + kernel_w - input_w + 1) / 2; - Conv2DMeta *m = sim->conv2d_meta; + Conv2DMeta *m = new Conv2DMeta(sim->handler, this); m->relu = activation == AC_MODE_RELU; // require input_c is divisible by groups diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index 4352f459b9..cf8696182b 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -429,7 +429,7 @@ OpMeta *ElementBinary::init_task(Task const *task, FFHandler handle = *((FFHandler *)task->local_args); ElementBinaryMeta *m = new ElementBinaryMeta(handle, eb); for (int i = 0; i < eb->numInputs; i++) { - m->trainableInputs[i] = eb->trainableInputs[i]; + m->trainable_inputs[i] = eb->trainable_inputs[i]; } m->op_type = eb->op_type; m->profiling = eb->profiling; @@ -892,7 +892,7 @@ void ElementBinary::backward(FFModel const &ff) { inputs[0]->region)); launcher.add_field(rid++, FID_DATA); // regions[2](I/O): input0_grad - if (trainableInputs[0]) { + if (trainable_inputs[0]) { launcher.add_region_requirement( RegionRequirement(inputs[0]->part_grad, 0 /*projection id*/, @@ -910,7 +910,7 @@ void ElementBinary::backward(FFModel const &ff) { inputs[1]->region)); launcher.add_field(rid++, FID_DATA); // regions[4](I/O): input1_grad - if (trainableInputs[1]) { + if (trainable_inputs[1]) { launcher.add_region_requirement( RegionRequirement(inputs[1]->part_grad, 0 /*projection id*/, @@ -980,7 +980,7 @@ void ElementBinary::backward_task(Task const *task, in0_ptr = helperGetTensorPointerRO( regions[rid], task->regions[rid], FID_DATA, ctx, runtime); rid++; - if (m->trainableInputs[0]) { + if (m->trainable_inputs[0]) { Domain in0_grad_domain = runtime->get_index_space_domain( ctx, task->regions[rid].region.get_index_space()); assert(in0_domain == in0_grad_domain); @@ -998,7 +998,7 @@ void ElementBinary::backward_task(Task const *task, in1_ptr = helperGetTensorPointerRO( regions[rid], task->regions[rid], FID_DATA, ctx, runtime); rid++; - if (m->trainableInputs[1]) { + if (m->trainable_inputs[1]) { Domain in1_grad_domain = runtime->get_index_space_domain( ctx, task->regions[rid].region.get_index_space()); // assert(out_grad_domain == in1_domain); diff --git a/src/ops/element_unary.cc b/src/ops/element_unary.cc index 0e1d115557..09cf13c717 100644 --- a/src/ops/element_unary.cc +++ b/src/ops/element_unary.cc @@ -354,7 +354,7 @@ OpMeta *ElementUnary::init_task(Task const *task, Runtime *runtime) { ElementUnary *eu = (ElementUnary *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - ElementUnaryMeta *m = new ElementUnaryMeta(handle); + ElementUnaryMeta *m = new ElementUnaryMeta(handle, eu); m->op_type = eu->op_type; m->data_type = eu->outputs[0]->data_type; // Input and output should have the same data type @@ -737,7 +737,7 @@ bool ElementUnary::measure_operator_cost(Simulator *sim, if (!inputs[0]->get_sub_tensor(mv, sub_input)) { return false; } - ElementUnaryMeta *m = sim->ele_unary_meta; + ElementUnaryMeta *m = new ElementUnaryMeta(sim->handler, this); m->op_type = op_type; if (use_cudnn(m->op_type)) { Domain input_domain, output_domain; diff --git a/src/ops/element_unary.cpp b/src/ops/element_unary.cpp index e20200420f..435abdfe11 100644 --- a/src/ops/element_unary.cpp +++ b/src/ops/element_unary.cpp @@ -282,7 +282,8 @@ void ElementUnary::backward_kernel_wrapper(ElementUnaryMeta const *m, stream); } 
-ElementUnaryMeta::ElementUnaryMeta(FFHandler handler) : OpMeta(handler) { +ElementUnaryMeta::ElementUnaryMeta(FFHandler handler, ElementUnary const *unary) + : OpMeta(handler, unary) { checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); checkCUDNN(miopenCreateActivationDescriptor(&actiDesc)); diff --git a/src/ops/element_unary.cu b/src/ops/element_unary.cu index c7f5e90f4c..15e6852388 100644 --- a/src/ops/element_unary.cu +++ b/src/ops/element_unary.cu @@ -291,7 +291,8 @@ void ElementUnary::backward_kernel_wrapper(ElementUnaryMeta const *m, stream); } -ElementUnaryMeta::ElementUnaryMeta(FFHandler handler) : OpMeta(handler) { +ElementUnaryMeta::ElementUnaryMeta(FFHandler handler, ElementUnary const *unary) + : OpMeta(handler, unary) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); checkCUDNN(cudnnCreateActivationDescriptor(&actiDesc)); diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc index e630563b63..95b538bdb6 100644 --- a/src/ops/embedding.cc +++ b/src/ops/embedding.cc @@ -469,7 +469,7 @@ FutureMap Embedding::inference(FFModel const &ff, set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); - IndexLauncher launcher(EMBED_FWD_TASK_ID, + IndexLauncher launcher(EMBED_INF_TASK_ID, parallel_is, TaskArgument(NULL, 0), argmap, @@ -559,12 +559,6 @@ void Embedding::forward_task(Task const *task, } forward_kernel_wrapper( m, input, output, kernel, in_dim, out_dim, effective_batch_size); - if (m->inference_debugging) { - assert(task->index_point.get_dim() == 1); - int shard_id = task->index_point.point_data[0]; - Embedding::save_inference_tensors_to_file( - m, shard_id, nullptr, {input}, {kernel}, {output}); - } } /* @@ -672,6 +666,16 @@ void Embedding::backward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } +Legion::FutureMap + Embedding::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + // nothing to do (backward function only updates weights) + return FutureMap(); +} + void Embedding::backward_task(Task const *task, std::vector const ®ions, Context ctx, diff --git a/src/ops/experts.cc b/src/ops/experts.cc index 8c66f9c7bc..3acc68ed9b 100644 --- a/src/ops/experts.cc +++ b/src/ops/experts.cc @@ -589,18 +589,7 @@ OpMeta *Experts::init_task(Task const *task, Runtime *runtime) { Experts const *exp = (Experts *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - ExpertsMeta *m = new ExpertsMeta(handle, - exp->num_experts, - exp->experts_start_idx, - exp->data_dim, - exp->out_dim, - exp->experts_num_layers, - exp->experts_internal_dim_size, - exp->effective_batch_size, - exp->num_chosen_experts, - exp->alpha, - exp->use_bias, - exp->activation); + ExpertsMeta *m = new ExpertsMeta(handle, exp); m->profiling = exp->profiling; m->inference_debugging = exp->inference_debugging; std::strcpy(m->op_name, exp->name); @@ -682,7 +671,7 @@ FutureMap Experts::inference(FFModel const &ff, size_t machine_view_hash = view->hash(); /* std::cout << "Experts op machine_view: " << *(MachineView const *)mv << std::endl; */ - // int num_active_tokens = bc->num_active_tokens(); + // int num_active_infr_tokens = bc->num_active_infr_tokens(); IndexLauncher launcher(EXPERTS_INF_TASK_ID, parallel_is, TaskArgument(nullptr, 0), @@ -1075,7 +1064,7 @@ void Experts::inference_task(Task const *task, 
output_ptr, weights_ptr, bias_ptr, - bc->num_active_tokens(), + bc->num_active_infr_tokens(), chosen_experts, batch_size, out_dim); diff --git a/src/ops/experts.cpp b/src/ops/experts.cpp index c06f02a647..502be878a9 100644 --- a/src/ops/experts.cpp +++ b/src/ops/experts.cpp @@ -27,7 +27,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, float *output, float const *weights, float const *biases, - int num_active_tokens, + int num_active_infr_tokens, int chosen_experts, int batch_size, int out_dim) { @@ -35,25 +35,15 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, handle_unimplemented_hip_kernel(OP_EXPERTS); } -ExpertsMeta::ExpertsMeta(FFHandler handler, - int _num_experts, - int _experts_start_idx, - int _data_dim, - int _out_dim, - int _experts_num_layers, - int _experts_internal_dim_size, - int _effective_batch_size, - int _num_chosen_experts, - float _alpha, - bool _use_bias, - ActiMode _activation) - : OpMeta(handler), num_experts(_num_experts), - experts_start_idx(_experts_start_idx), data_dim(_data_dim), - out_dim(_out_dim), experts_num_layers(_experts_num_layers), - experts_internal_dim_size(_experts_internal_dim_size), - effective_batch_size(_effective_batch_size), - num_chosen_experts(_num_chosen_experts), alpha(_alpha), - use_bias(_use_bias), activation(_activation) {} +ExpertsMeta::ExpertsMeta(FFHandler handler, Experts const *e) + : OpMeta(handler, e), num_experts(e->num_experts), + experts_start_idx(e->experts_start_idx), data_dim(e->data_dim), + out_dim(e->out_dim), experts_num_layers(e->experts_num_layers), + experts_internal_dim_size(e->experts_internal_dim_size), + effective_batch_size(e->effective_batch_size), + num_chosen_experts(e->num_chosen_experts), alpha(e->alpha), + use_bias(e->use_bias), activation(e->activation) {} + ExpertsMeta::~ExpertsMeta(void) {} }; // namespace FlexFlow diff --git a/src/ops/experts.cu b/src/ops/experts.cu index ce15cdff55..f6f555d1ad 100644 --- a/src/ops/experts.cu +++ b/src/ops/experts.cu @@ -515,7 +515,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, float *output, float const *weights, float const *biases, - int num_active_tokens, + int num_active_infr_tokens, int chosen_experts, int batch_size, int out_dim) { @@ -529,8 +529,8 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, cudaEventRecord(t_start, stream); } - assert(num_active_tokens > 0); - assert(num_active_tokens <= m->effective_batch_size); + assert(num_active_infr_tokens > 0); + assert(num_active_infr_tokens <= m->effective_batch_size); assert(m->effective_batch_size == batch_size); int num_experts_per_block = m->num_experts; @@ -540,7 +540,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, int data_dim = m->data_dim; int num_chosen_experts = m->num_chosen_experts; // int num_tokens = m->effective_batch_size; - int num_tokens = num_active_tokens; + int num_tokens = num_active_infr_tokens; int expert_capacity = m->expert_capacity; assert(chosen_experts == num_chosen_experts); @@ -579,14 +579,14 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, #ifdef INFERENCE_TESTS // Checking // 1. 
check that m->sorted_indices contains indices sorted - int *indices_cpu = download_tensor(indices, num_indices); + int *indices_cpu = copy_tensor_dev_to_host(indices, num_indices); // assert(indices_cpu != nullptr); std::vector indices_vec(indices_cpu, indices_cpu + num_indices); std::vector indices_vec_sorted(indices_vec.size()); std::copy(indices_vec.begin(), indices_vec.end(), indices_vec_sorted.begin()); std::stable_sort(indices_vec_sorted.begin(), indices_vec_sorted.end()); - int *thrust_sorted_indices_cpu = download_tensor( + int *thrust_sorted_indices_cpu = copy_tensor_dev_to_host( m->sorted_indices, m->num_chosen_experts * m->effective_batch_size); // assert(thrust_sorted_indices_cpu != nullptr); std::vector thrust_sorted_indices_vec( @@ -613,7 +613,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, assert(indices_vec_sorted[i] == thrust_sorted_indices_vec[i]); } // 2. check that indices[m->original_indices[i]] = i - int *thrust_original_indices_cpu = download_tensor( + int *thrust_original_indices_cpu = copy_tensor_dev_to_host( m->original_indices, m->num_chosen_experts * m->effective_batch_size); // assert(thrust_original_indices_cpu != nullptr); std::vector thrust_original_indices_vec( @@ -668,8 +668,8 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, } assert(non_zero_experts_count == non_zero_experts_check.size()); // 7. check exp_local_label_to_index - int *non_zero_expert_labels_cpu = - download_tensor(m->non_zero_expert_labels, non_zero_experts_count); + int *non_zero_expert_labels_cpu = copy_tensor_dev_to_host( + m->non_zero_expert_labels, non_zero_experts_count); // assert(non_zero_expert_labels_cpu != nullptr); std::vector non_zero_expert_labels_vec(non_zero_expert_labels_cpu, non_zero_expert_labels_cpu + @@ -684,8 +684,8 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, non_zero_experts_check_vec.end())); assert(non_zero_expert_labels_vec == non_zero_experts_check_vec); - int *exp_local_label_to_index = - download_tensor(m->exp_local_label_to_index, non_zero_experts_count); + int *exp_local_label_to_index = copy_tensor_dev_to_host( + m->exp_local_label_to_index, non_zero_experts_count); // assert(exp_local_label_to_index != nullptr); std::vector exp_local_label_to_index_vec(exp_local_label_to_index, exp_local_label_to_index + @@ -699,8 +699,8 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, } // 8. 
Check expert_start_indexes - int *expert_start_indices_thrust = - download_tensor(m->expert_start_indexes, non_zero_experts_count + 1); + int *expert_start_indices_thrust = copy_tensor_dev_to_host( + m->expert_start_indexes, non_zero_experts_count + 1); // assert(expert_start_indices_thrust != nullptr); std::vector expert_start_indices_thrust_vec( expert_start_indices_thrust, @@ -746,9 +746,9 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, int *num_assignments_per_expert_thrust = (int *)calloc(non_zero_experts_count, sizeof(int)); assert(num_assignments_per_expert_thrust != nullptr); - assert(download_tensor(m->num_assignments_per_expert, - num_assignments_per_expert_thrust, - non_zero_experts_count)); + assert(copy_tensor_dev_to_host(m->num_assignments_per_expert, + num_assignments_per_expert_thrust, + non_zero_experts_count)); assert(num_assignments_per_expert_thrust != nullptr); std::vector num_assignments_per_expert_thrust_vec( num_assignments_per_expert_thrust, @@ -759,9 +759,9 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, int *destination_start_indices_thrust = (int *)calloc(non_zero_experts_count, sizeof(int)); assert(destination_start_indices_thrust != nullptr); - assert(download_tensor(m->destination_start_indices, - destination_start_indices_thrust, - non_zero_experts_count)); + assert(copy_tensor_dev_to_host(m->destination_start_indices, + destination_start_indices_thrust, + non_zero_experts_count)); assert(destination_start_indices_thrust != nullptr); std::vector destination_start_indices_thrust_vec( destination_start_indices_thrust, @@ -1233,25 +1233,14 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, } } -ExpertsMeta::ExpertsMeta(FFHandler handler, - int _num_experts, - int _experts_start_idx, - int _data_dim, - int _out_dim, - int _experts_num_layers, - int _experts_internal_dim_size, - int _effective_batch_size, - int _num_chosen_experts, - float _alpha, - bool _use_bias, - ActiMode _activation) - : OpMeta(handler), num_experts(_num_experts), - experts_start_idx(_experts_start_idx), data_dim(_data_dim), - out_dim(_out_dim), experts_num_layers(_experts_num_layers), - experts_internal_dim_size(_experts_internal_dim_size), - effective_batch_size(_effective_batch_size), - num_chosen_experts(_num_chosen_experts), alpha(_alpha), - use_bias(_use_bias), activation(_activation) { +ExpertsMeta::ExpertsMeta(FFHandler handler, Experts const *e) + : OpMeta(handler, e), num_experts(e->num_experts), + experts_start_idx(e->experts_start_idx), data_dim(e->data_dim), + out_dim(e->out_dim), experts_num_layers(e->experts_num_layers), + experts_internal_dim_size(e->experts_internal_dim_size), + effective_batch_size(e->effective_batch_size), + num_chosen_experts(e->num_chosen_experts), alpha(e->alpha), + use_bias(e->use_bias), activation(e->activation) { expert_capacity = ceil(alpha * num_chosen_experts / num_experts * effective_batch_size); diff --git a/src/ops/flat.cc b/src/ops/flat.cc index 80aedbbb31..e9f637294a 100644 --- a/src/ops/flat.cc +++ b/src/ops/flat.cc @@ -187,7 +187,8 @@ OpMeta *Flat::init_task(Task const *task, Context ctx, Runtime *runtime) { FFHandler handler = *((FFHandler const *)task->local_args); - FlatMeta *m = new FlatMeta(handler); + Flat *flat = (Flat *)task->args; + FlatMeta *m = new FlatMeta(handler, flat); return m; } diff --git a/src/ops/fused.cc b/src/ops/fused.cc index 9ad5c4dc9c..121139beb1 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -14,6 +14,7 @@ */ #include "flexflow/ops/fused.h" +#include 
"flexflow/ffconst_utils.h" #include "flexflow/model.h" #include "flexflow/ops/batch_matmul.h" #include "flexflow/ops/batch_norm.h" @@ -87,12 +88,32 @@ FusedOp::FusedOp(FFModel &model, Op *op) // weights[i]->owner_idx = i; weight_data_types[i] = op->weights[i]->data_type; } - numOutputs = op->numOutputs; - for (int i = 0; i < numOutputs; i++) { - outputs[i] = op->outputs[i]; - outputs[i]->owner_op = this; - outputs[i]->owner_idx = i; - output_data_types[i] = op->outputs[i]->data_type; + numOutputs = 0; + for (int i = 0; i < op->numOutputs; i++) { + bool found = false; + // Handle in-place outputs + for (int j = 0; j < numInputs; j++) { + if (inputs[j]->region == op->outputs[i]->region) { + // This output is one of the inputs + assert(!found); + assert(inputs[j]->region != LogicalRegion::NO_REGION); + op_output_source[i] = SOURCE_INPUT; + op_input_idx[i] = j; + found = true; + break; + } + } + if (found) { + // do nothing + } else { + outputs[numOutputs] = op->outputs[i]; + output_data_types[numOutputs] = op->outputs[i]->data_type; + op_output_source[i] = SOURCE_OUTPUT; + op_output_idx[i] = numOutputs; + outputs[numOutputs]->owner_op = this; + outputs[numOutputs]->owner_idx = numOutputs; + numOutputs++; + } } numOperators = 1; op_num_inputs[0] = op->numInputs; @@ -109,10 +130,53 @@ FusedOp::FusedOp(FFModel &model, Op *op) op_weight_source[i] = SOURCE_WEIGHT; op_weight_idx[i] = i; } - for (int i = 0; i < numOutputs; i++) { - op_output_source[i] = SOURCE_OUTPUT; - op_output_idx[i] = i; - } + // for (int i = 0; i < numOutputs; i++) { + // op_output_source[i] = SOURCE_OUTPUT; + // op_output_idx[i] = i; + // } +#if 0 + int input_offset = 0, weight_offset = 0, output_offset = 0; + printf("\nNew fused op: %s (%s), #input:%i, #output:%i, #weights:%i. Fused: " + "#inputs=%i, #outputs=%i, #weights=%i\n", + op->name, + get_operator_type_name(op->op_type).c_str(), + op->numInputs, + op->numOutputs, + op->numWeights, + numInputs, + numOutputs, + numWeights); + printf("op_input_idx:\t"); + for (int i = 0; i < input_offset + op->numInputs; i++) { + printf("%i\t", op_input_idx[i]); + } + printf("\n"); + printf("op_input_source:\t"); + for (int i = 0; i < input_offset + op->numInputs; i++) { + printf("%i\t", op_input_source[i]); + } + printf("\n"); + printf("op_output_idx:\t"); + for (int i = 0; i < output_offset + op->numOutputs; i++) { + printf("%i\t", op_output_idx[i]); + } + printf("\n"); + printf("op_output_source:\t"); + for (int i = 0; i < output_offset + op->numOutputs; i++) { + printf("%i\t", op_output_source[i]); + } + printf("\n"); + printf("op_weight_idx:\t"); + for (int i = 0; i < weight_offset + op->numWeights; i++) { + printf("%i\t", op_weight_idx[i]); + } + printf("\n"); + printf("op_weight_source:\t"); + for (int i = 0; i < weight_offset + op->numWeights; i++) { + printf("%i\t", op_weight_source[i]); + } + printf("\n"); +#endif } bool FusedOp::use_same_regions( @@ -165,7 +229,8 @@ bool FusedOp::add_operator( // op->name, op_config)); // Cannot fuse parallel operators (except allreduce) since they have different // paralel_is in forward and backward - assert(!op->is_parallel_op() || op->op_type == OP_ALLREDUCE); + assert(!op->is_parallel_op() || op->op_type == OP_ALLREDUCE || + op->op_type == OP_PARALLEL_IDENTITY); // Currently don't consider nested fusion assert(op->op_type != OP_FUSED); MachineView my_view = outputs[0]->machine_view; @@ -271,6 +336,18 @@ bool FusedOp::add_operator( found = true; op_output_source[output_offset + i] = SOURCE_OUTPUT; op_output_idx[output_offset + i] = j; + 
break; + } + } + for (int j = 0; j < numInputs; j++) { + if (inputs[j]->region == op->outputs[i]->region) { + // This input is one of my inputs + assert(!found); + assert(inputs[j]->region != LogicalRegion::NO_REGION); + op_output_source[output_offset + i] = SOURCE_INPUT; + op_output_idx[output_offset + i] = j; + found = true; + break; } } if (found) { @@ -311,6 +388,50 @@ bool FusedOp::add_operator( "Reach to the #outputs limit during fusion.\n" "Consider increase MAX_NUM_OUTPUTS to allow more fusions.\n"); } + +#if 0 + printf("\nAdd op: %s (%s), #input:%i, #output:%i, #weights:%i. Fused: " + "#inputs=%i, #outputs=%i, #weights=%i\n", + op->name, + get_operator_type_name(op->op_type).c_str(), + op->numInputs, + op->numOutputs, + op->numWeights, + numInputs, + numOutputs, + numWeights); + printf("op_input_idx:\t"); + for (int i = 0; i < input_offset + op->numInputs; i++) { + printf("%i\t", op_input_idx[i]); + } + printf("\n"); + printf("op_input_source:\t"); + for (int i = 0; i < input_offset + op->numInputs; i++) { + printf("%i\t", op_input_source[i]); + } + printf("\n"); + printf("op_output_idx:\t"); + for (int i = 0; i < output_offset + op->numOutputs; i++) { + printf("%i\t", op_output_idx[i]); + } + printf("\n"); + printf("op_output_source:\t"); + for (int i = 0; i < output_offset + op->numOutputs; i++) { + printf("%i\t", op_output_source[i]); + } + printf("\n"); + printf("op_weight_idx:\t"); + for (int i = 0; i < weight_offset + op->numWeights; i++) { + printf("%i\t", op_weight_idx[i]); + } + printf("\n"); + printf("op_weight_source:\t"); + for (int i = 0; i < weight_offset + op->numWeights; i++) { + printf("%i\t", op_weight_source[i]); + } + printf("\n"); +#endif + return true; } @@ -404,9 +525,13 @@ void FusedOp::init_inference(FFModel const &ff, } for (int i = 0; i < op_num_outputs[op]; i++) { int my_off = op_output_idx[i + ooff]; - assert(op_output_source[i + ooff] == SOURCE_OUTPUT); - assert(my_off < batch_outputs.size()); - my_batch_outputs.push_back(batch_outputs[my_off]); + if (op_output_source[i + ooff] == SOURCE_OUTPUT) { + my_batch_outputs.push_back(batch_outputs[my_off]); + } else if (op_output_source[i + ooff] == SOURCE_INPUT) { + my_batch_outputs.push_back(batch_inputs[my_off]); + } else { + assert(false); + } } ioff += op_num_inputs[op]; ooff += op_num_outputs[op]; @@ -526,10 +651,6 @@ FutureMap FusedOp::inference(FFModel const &ff, set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; size_t machine_view_hash = view->hash(); - // bc is one of BatchConfig, TreeVerifyBatchConfig, and BeamSearchBatchConfig - // so we transfer the maximum of them - // size_t batch_config_size = - // std::max(sizeof(TreeVerifyBatchConfig), sizeof(BeamSearchBatchConfig)); IndexLauncher launcher(FUSEDOP_INF_TASK_ID, parallel_is, TaskArgument(nullptr, 0), @@ -571,6 +692,83 @@ FutureMap FusedOp::inference(FFModel const &ff, batch_outputs[i]->region)); launcher.add_field(offset + i, FID_DATA); } + offset += numOutputs; + // add softmax output grad + if (operators[numOperators - 1]->op_type == OP_SOFTMAX) { + // printf("operator %i is last SOFTMAX! 
adding grad for output %i\n", + // numOperators - 1, + // numOutputs - 1); + assert(outputs[numOutputs - 1]->region != LogicalRegion::NO_REGION); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[numOutputs - 1]->part_grad, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[numOutputs - 1]->region_grad)); + launcher.add_field(offset, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} + +FutureMap FusedOp::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + // Set iter_config + iter_config = ff.iter_config; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + // bc is one of BatchConfig, TreeVerifyBatchConfig, and BeamSearchBatchConfig + // so we transfer the maximum of them + // size_t batch_config_size = + // std::max(sizeof(TreeVerifyBatchConfig), sizeof(BeamSearchBatchConfig)); + IndexLauncher launcher(FUSEDOP_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + int offset = 0; + for (int i = 0; i < numInputs; i++) { + assert(inputs[i]->part != LogicalPartition::NO_PART); + assert(inputs[i]->region != LogicalRegion::NO_REGION); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[i]->part_grad, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_inputs[i]->region_grad)); + launcher.add_field(offset + i, FID_DATA); + } + offset += numInputs; + for (int i = 0; i < numWeights; i++) { + assert(weights[i]->region != LogicalRegion::NO_REGION); + launcher.add_region_requirement(RegionRequirement(weights[i]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[i]->region)); + launcher.add_field(offset + i, FID_DATA); + } + offset += numWeights; + for (int i = 0; i < numOutputs; i++) { + assert(outputs[i]->region != LogicalRegion::NO_REGION); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[i]->part_grad, + 0 /*projection id*/, + i == numOutputs - 1 ? 
READ_WRITE : WRITE_ONLY, + EXCLUSIVE, + batch_outputs[i]->region_grad)); + launcher.add_field(offset + i, FID_DATA); + } return runtime->execute_index_space(ctx, launcher); } diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index 3282bc57d9..9f826cd611 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -15,6 +15,7 @@ #include "flexflow/ops/fused.h" #include "flexflow/accessor.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/model.h" #include "flexflow/ops/add_bias_residual_layer_norm.h" #include "flexflow/ops/batch_norm.h" @@ -30,6 +31,7 @@ #include "flexflow/ops/kernels/embedding_kernels.h" #include "flexflow/ops/kernels/flat_kernels.h" #include "flexflow/ops/kernels/linear_kernels.h" +#include "flexflow/ops/kernels/lora_linear_kernels.h" #include "flexflow/ops/kernels/pool_2d_kernels.h" #include "flexflow/ops/kernels/reshape_kernels.h" #include "flexflow/ops/kernels/residual_rms_norm_kernels.h" @@ -42,6 +44,7 @@ #include "flexflow/ops/spec_inc_multihead_self_attention.h" #include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/parallel_ops/kernels/allreduce_kernels.h" +#include "flexflow/parallel_ops/kernels/parallel_identity_kernels.h" #include "flexflow/utils/hip_helper.h" #include @@ -78,17 +81,27 @@ OpMeta *FusedOp::init_task(Task const *task, regions[...](I): weights regions[...](O): outputs */ -__host__ void FusedOp::forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { +__host__ void + FusedOp::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { // const FusedOp* fused = (FusedOp*) task->args; FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); FusedOp const *fused = metas->fused_op; + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + // Return if no active tokens + if (bc->num_tokens == 0) { + return; + } + assert(metas->numOperators == fused->numOperators); assert(regions.size() == task->regions.size()); - assert((int)regions.size() == - fused->numInputs + fused->numWeights + fused->numOutputs); + bool softmax_grad_additional_region = + (fused->op_op_type[fused->numOperators - 1] == OP_SOFTMAX); + assert((int)regions.size() == fused->numInputs + fused->numWeights + + fused->numOutputs + + softmax_grad_additional_region); GenericTensorAccessorR input_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; GenericTensorAccessorW output_accessor[MAX_NUM_OUTPUTS]; @@ -124,6 +137,7 @@ __host__ void FusedOp::forward_task(Task const *task, ctx, runtime); } + roff += fused->numOutputs; // Assert that all meta share the same dnn/blas handler int start = 0; for (start = 0; start < fused->numOperators; start++) { @@ -138,11 +152,6 @@ __host__ void FusedOp::forward_task(Task const *task, } } - hipStream_t stream; - if (start < fused->numOperators) { - checkCUDA(get_legion_stream(&stream)); - } - int ioff = 0, woff = 0, ooff = 0; for (int op = 0; op < fused->numOperators; op++) { GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; @@ -163,8 +172,9 @@ __host__ void FusedOp::forward_task(Task const *task, my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; } for (int i = 0; i < fused->op_num_outputs[op]; i++) { + int my_off = fused->op_output_idx[i + ooff]; assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); - my_output_accessor[i] = output_accessor[i + ooff]; + my_output_accessor[i] = output_accessor[my_off]; } switch (fused->op_op_type[op]) { case OP_CONCAT: { @@ -179,21 
+189,6 @@ __host__ void FusedOp::forward_task(Task const *task, m->legion_axis); break; } - case OP_CONV2D: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_dim() == 5); - assert(my_weight_accessor[0].domain.get_dim() == 5); - assert(my_output_accessor[0].domain.get_dim() == 5); - Conv2DMeta *m = (Conv2DMeta *)metas->meta[op]; - Kernels::Conv2D::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_weight_accessor[0].get_float_ptr(), - my_weight_accessor[1].get_float_ptr()); - break; - } case OP_BATCHNORM: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -209,16 +204,6 @@ __host__ void FusedOp::forward_task(Task const *task, my_weight_accessor[1].get_float_ptr()); break; } - case OP_DROPOUT: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - DropoutMeta *m = (DropoutMeta *)metas->meta[op]; - Kernels::Dropout::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); - break; - } case OP_LINEAR: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -229,25 +214,48 @@ __host__ void FusedOp::forward_task(Task const *task, assert(my_output_accessor[0].domain.get_volume() == out_dim * batch_size); assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); - float const *bias_ptr = nullptr; + void const *bias_ptr = nullptr; LinearMeta *m = (LinearMeta *)metas->meta[op]; if (fused->op_num_weights[op] == 2) { assert(my_weight_accessor[1].domain.get_volume() == out_dim); if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) { - bias_ptr = my_weight_accessor[1].get_float_ptr(); + bias_ptr = my_weight_accessor[1].ptr; } } else { assert(fused->op_num_weights[op] == 1); } - Kernels::Linear::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_weight_accessor[0].get_float_ptr(), - bias_ptr, - in_dim, - out_dim, - batch_size); + assert(m->input_type[0] == my_input_accessor[0].data_type); + assert(m->input_type[0] == my_output_accessor[0].data_type); + batch_size = bc->num_active_infr_tokens(); + Kernels::Linear::forward_kernel_wrapper(m, + my_input_accessor[0].ptr, + my_output_accessor[0].ptr, + my_weight_accessor[0].ptr, + bias_ptr, + in_dim, + out_dim, + batch_size); + break; + } + case OP_LORA: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + Domain input_domain = my_input_accessor[0].domain; + Domain output_domain = my_output_accessor[0].domain; + int in_dim = input_domain.hi()[0] - input_domain.lo()[0] + 1; + int out_dim = output_domain.hi()[0] - output_domain.lo()[0] + 1; + int batch_size = my_input_accessor[0].domain.get_volume() / in_dim; + assert(my_output_accessor[0].domain.get_volume() == + out_dim * batch_size); + assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); + LoraLinearMeta *m = (LoraLinearMeta *)metas->meta[op]; + assert(m->input_type[0] == my_input_accessor[0].data_type); + assert(m->output_type[0] == my_output_accessor[0].data_type); + // Assert that the output and the second input are at the same place + // since we ``inplace'' the output for LoRA + assert(my_input_accessor[1].ptr == my_output_accessor[0].ptr); + Kernels::LoraLinear::inference_kernel_wrapper( + m, bc, my_input_accessor[0], my_output_accessor[0]); break; } case OP_BATCHMATMUL: { @@ 
-375,87 +383,127 @@ __host__ void FusedOp::forward_task(Task const *task, case OP_RELU: case OP_SIGMOID: case OP_TANH: - case OP_ELU: { + case OP_ELU: + case OP_SCALAR_TRUE_DIV: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); assert(my_input_accessor[0].domain == my_output_accessor[0].domain); ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + if (m->data_type == DT_HALF) { + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_half_ptr(), + my_output_accessor[0].get_half_ptr(), + my_input_accessor[0].domain.get_volume()); + } else if (m->data_type == DT_FLOAT) { + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + } else { + assert(false && "Unsupported data type in ElementUnary forward"); + } break; } - case OP_POOL2D: { + case OP_RMS_NORM: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_weights[op] == 1); assert(fused->op_num_outputs[op] == 1); - Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; - Kernels::Pool2D::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); + RMSNormMeta *m = (RMSNormMeta *)metas->meta[op]; + Kernels::RMSNorm::inference_kernel_wrapper(m, + bc, + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0]); break; } - case OP_FLAT: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - Kernels::Flat::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + case OP_RESIDUAL_RMS_NORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 2); + ResidualRMSNormMeta *m = (ResidualRMSNormMeta *)metas->meta[op]; + Kernels::ResidualRMSNorm::inference_kernel_wrapper( + m, + bc, + my_input_accessor[0], + my_input_accessor[1], + my_weight_accessor[0], + my_output_accessor[0], + my_output_accessor[1]); break; } - case OP_SOFTMAX: { + case OP_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; - if (m->input_type == DT_HALF) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_half_ptr(), - my_output_accessor[0].get_half_ptr()); - } else if (m->input_type == DT_FLOAT) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); + IncMultiHeadSelfAttentionMeta *m = + (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; } + 
IncMultiHeadSelfAttention::inference_kernel_wrapper( + m, + bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); break; } - case OP_RESHAPE: { + case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - Kernels::Reshape::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + TreeIncMultiHeadSelfAttentionMeta *m = + (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op]; + TreeVerifyBatchConfig const &tree_bc = + Future(task->futures[0]).get_result(); + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( + m, + &tree_bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); break; } - case OP_TRANSPOSE: { + case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - TransposeMeta *m = (TransposeMeta *)metas->meta[op]; - Kernels::Transpose::forward_kernel_wrapper( + SpecIncMultiHeadSelfAttentionMeta const *m = + (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op]; + // BeamSearchBatchConfig const *beam_bc = + // (BeamSearchBatchConfig *)task->args; + BeamSearchBatchConfig const &beam_bc = + Future(task->futures[0]).get_result(); + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain, - my_output_accessor[0].domain); + &beam_bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); break; } case OP_LAYERNORM: { @@ -477,23 +525,127 @@ __host__ void FusedOp::forward_task(Task const *task, break; } case OP_RESIDUAL_LAYERNORM: { - assert(false && "Operator ResidualLayerNorm does not support " - "the forward() task"); + assert(fused->op_num_outputs[op] == 2); + ResidualLayerNormMeta *m = (ResidualLayerNormMeta *)metas->meta[op]; + if (m->use_two_residuals) { + assert(fused->op_num_inputs[op] == 3); + } else { + assert(fused->op_num_inputs[op] == 2); + } + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 0); + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 1); // weight + } else { + assert(fused->op_num_weights[op] == 2); // weight + bias + } + } + GenericTensorAccessorR residual2; + if (m->use_two_residuals) { + residual2 = my_input_accessor[2]; + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + if (m->use_bias) { + beta = my_weight_accessor[1]; + } + } + ResidualLayerNorm::inference_kernel_wrapper(m, + bc, + 
my_input_accessor[0], + my_input_accessor[1], + residual2, + my_output_accessor[0], + my_output_accessor[1], + gamma, + beta); break; } case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { - assert(false && "Operator AddBiasResidualLayerNorm does not support " - "the forward() task"); - break; - } - case OP_RESIDUAL_RMS_NORM: { - assert(false && "Operator ResidualRMSNorm does not support " - "the forward() task"); + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 2); + AddBiasResidualLayerNormMeta *m = + (AddBiasResidualLayerNormMeta *)metas->meta[op]; + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1); // attn bias + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 2); // attn bias + weight + } else { + assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias + } + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[1]; + if (m->use_bias) { + beta = my_weight_accessor[2]; + } + } + AddBiasResidualLayerNorm::inference_kernel_wrapper( + m, + bc, + my_input_accessor[0], + my_weight_accessor[0], + my_input_accessor[1], + my_output_accessor[0], + my_output_accessor[1], + gamma, + beta); break; } case OP_SIGMOID_SILU_MULTI: { - assert(false && "Operator SigmoidSiluMulti does not support " - "the forward() task"); + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + SigmoidSiluMultiMeta *m = (SigmoidSiluMultiMeta *)metas->meta[op]; + SigmoidSiluMulti::inference_kernel_wrapper(m, + bc, + my_input_accessor[0], + my_input_accessor[1], + my_output_accessor[0]); + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + if (op == fused->numOperators - 1) { // if this is the final operator + output_accessor[fused->numOutputs] = helperGetGenericTensorAccessorWO( + fused->output_data_types[fused->numOutputs - 1], + regions[roff], + task->regions[roff], + FID_DATA, + ctx, + runtime); + } + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + Kernels::Softmax::inference_kernel_wrapper( + m, + bc, + (op == fused->numOperators - 1), + my_input_accessor[0], + my_output_accessor[0], + output_accessor[fused->numOutputs]); + break; + } + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::inference_kernel_wrapper( + m, bc, my_input_accessor[0], my_output_accessor[0]); + break; + } + case OP_PARALLEL_IDENTITY: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + ParallelIdentityMeta const *m = (ParallelIdentityMeta *)metas->meta[op]; + Kernels::ParallelIdentity::inference_kernel_wrapper( + m, bc, my_input_accessor[0], my_output_accessor[0]); break; } default: { @@ -503,6 +655,33 @@ __host__ void FusedOp::forward_task(Task const *task, assert(false && "Fusion currently does not support type"); } } + if (metas->meta[op]->inference_debugging && + !(fused->op_op_type[op] == OP_ALLREDUCE || + fused->op_op_type[op] == OP_PARALLEL_IDENTITY || + fused->op_op_type[op] == OP_REPLICATE || + fused->op_op_type[op] == OP_REPARTITION || + fused->op_op_type[op] == OP_COMBINE)) { + std::vector input_accessors_to_save; + std::vector weight_accessors_to_save; + std::vector output_accessors_to_save; + for (int i = 0; i < 
fused->op_num_inputs[op]; i++) { + input_accessors_to_save.push_back(my_input_accessor[i]); + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + weight_accessors_to_save.push_back(my_weight_accessor[i]); + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + output_accessors_to_save.push_back(my_output_accessor[i]); + } + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + FusedOp::save_inference_tensors_to_file(metas->meta[op], + shard_id, + bc, + input_accessors_to_save, + weight_accessors_to_save, + output_accessors_to_save); + } ioff += fused->op_num_inputs[op]; woff += fused->op_num_weights[op]; ooff += fused->op_num_outputs[op]; @@ -517,18 +696,525 @@ __host__ void FusedOp::forward_task(Task const *task, regions[...](I): weights regions[...](O): outputs */ -__host__ void - FusedOp::inference_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { +__host__ void FusedOp::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { // const FusedOp* fused = (FusedOp*) task->args; - FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); + FusedOpMeta *metas = *((FusedOpMeta **)task->local_args); FusedOp const *fused = metas->fused_op; + // BatchConfig const *bc = (BatchConfig *)task->args; BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - if (bc->num_tokens == 0) { + // Return if no active PEFT bwd tokens + if (bc->num_active_peft_tokens() == 0) { return; } + + assert(metas->numOperators == fused->numOperators); + assert(regions.size() == task->regions.size()); + assert((int)regions.size() == + fused->numInputs + fused->numWeights + fused->numOutputs); + // Domain input_domain[MAX_NUM_INPUTS]; + // Domain weight_domain[MAX_NUM_WEIGHTS]; + // Domain output_domain[MAX_NUM_OUTPUTS]; + GenericTensorAccessorW input_grad_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorW output_grad_accessor[MAX_NUM_OUTPUTS]; + assert(fused->numInputs <= MAX_NUM_INPUTS); + for (int i = 0; i < fused->numInputs; i++) { + // input_domain[i] = runtime->get_index_space_domain( + // ctx, task->regions[i].region.get_index_space()); + input_grad_accessor[i] = + helperGetGenericTensorAccessorRW(fused->input_data_types[i], + regions[i], + task->regions[i], + FID_DATA, + ctx, + runtime); + } + int roff = fused->numInputs; + assert(fused->numWeights <= MAX_NUM_WEIGHTS); + for (int i = 0; i < fused->numWeights; i++) { + // weight_domain[i] = runtime->get_index_space_domain( + // ctx, task->regions[i + roff].region.get_index_space()); + weight_accessor[i] = + helperGetGenericTensorAccessorRO(fused->weight_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + } + roff += fused->numWeights; + assert(fused->numOutputs <= MAX_NUM_OUTPUTS); + for (int i = 0; i < fused->numOutputs; i++) { + // output_domain[i] = runtime->get_index_space_domain( + // ctx, task->regions[i + roff].region.get_index_space()); + output_grad_accessor[i] = + helperGetGenericTensorAccessorRW(fused->output_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + } + // Assert that all meta share the same dnn/blas handler + int start = 0; + for (start = 0; start < fused->numOperators; start++) { + if (metas->meta[start] != NULL) { + break; + } + } + for (int op = start + 1; op < fused->numOperators; op++) { + if (metas->meta[op] != NULL) { + 
assert(metas->meta[start]->handle.blas == metas->meta[op]->handle.blas); + assert(metas->meta[start]->handle.dnn == metas->meta[op]->handle.dnn); + } + } + + int ioff = 0, woff = 0, ooff = 0; + // Domain my_id[MAX_NUM_INPUTS]; + // Domain my_wd[MAX_NUM_WEIGHTS]; + // Domain my_od[MAX_NUM_OUTPUTS]; + GenericTensorAccessorW my_input_grad_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorW my_output_grad_accessor[MAX_NUM_OUTPUTS]; + + // Do backpropagation in the reverse ordering + for (int op = 0; op < fused->numOperators; op++) { + ioff += fused->op_num_inputs[op]; + woff += fused->op_num_weights[op]; + ooff += fused->op_num_outputs[op]; + } + + for (int op = fused->numOperators - 1; op >= 0; op--) { + ioff -= fused->op_num_inputs[op]; + woff -= fused->op_num_weights[op]; + ooff -= fused->op_num_outputs[op]; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + int my_off = fused->op_input_idx[i + ioff]; + if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { + // my_id[i] = input_domain[my_off]; + my_input_grad_accessor[i] = input_grad_accessor[my_off]; + } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { + // my_id[i] = output_domain[my_off]; + my_input_grad_accessor[i] = output_grad_accessor[my_off]; + } else { + assert(false); + } + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); + // my_wd[i] = weight_domain[fused->op_weight_idx[i + woff]]; + // my_wp[i] = weight_ptr[fused->op_weight_idx[i + woff]]; + my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + int my_off = fused->op_output_idx[i + ooff]; + assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); + // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; + // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; + my_output_grad_accessor[i] = output_grad_accessor[my_off]; + } + switch (fused->op_op_type[op]) { + case OP_CONCAT: { + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + // TODO: implement this + assert(false); + // ConcatMeta *m = (ConcatMeta *)metas->meta[op]; + // int num_inputs = fused->op_num_inputs[op]; + // Kernels::Concat::peft_bwd_kernel_wrapper(m, + // my_output_accessor[0], + // my_input_accessor, + // num_inputs, + // m->legion_axis); + break; + } + case OP_BATCHNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain.get_dim() == 5); + assert(my_output_grad_accessor[0].domain.get_dim() == 5); + assert(my_weight_accessor[0].domain.get_dim() == 2); + assert(my_weight_accessor[1].domain.get_dim() == 2); + // TODO: implement this + assert(false); + // BatchNormMeta *m = (BatchNormMeta *)metas->meta[op]; + // BatchNorm::peft_bwd_kernel_kernel( + // m, + // my_input_accessor[0].get_float_ptr(), + // my_output_accessor[0].get_float_ptr(), + // my_weight_accessor[0].get_float_ptr(), + // my_weight_accessor[1].get_float_ptr()); + break; + } + case OP_LINEAR: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + Domain kernel_domain = my_weight_accessor[0].domain; + int in_dim = kernel_domain.hi()[0] - kernel_domain.lo()[0] + 1; + int out_dim = kernel_domain.hi()[1] - kernel_domain.lo()[1] + 1; + int batch_size = my_input_grad_accessor[0].domain.get_volume() / in_dim; + assert(my_output_grad_accessor[0].domain.get_volume() == + out_dim * 
batch_size); + assert(my_input_grad_accessor[0].domain.get_volume() == + in_dim * batch_size); + LinearMeta *m = (LinearMeta *)metas->meta[op]; + assert(m->input_type[0] == my_input_grad_accessor[0].data_type); + assert(m->input_type[0] == my_output_grad_accessor[0].data_type); + int num_infr_tokens = bc->num_active_infr_tokens(); + int num_peft_tokens = bc->num_active_peft_tokens(); + Kernels::Linear::peft_bwd_kernel_wrapper(m, + my_input_grad_accessor[0].ptr, + my_output_grad_accessor[0].ptr, + my_weight_accessor[0].ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens); + break; + } + case OP_LORA: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + Domain input_domain = my_input_grad_accessor[0].domain; + Domain output_domain = my_output_grad_accessor[0].domain; + int in_dim = input_domain.hi()[0] - input_domain.lo()[0] + 1; + int out_dim = output_domain.hi()[0] - output_domain.lo()[0] + 1; + int batch_size = my_input_grad_accessor[0].domain.get_volume() / in_dim; + assert(my_output_grad_accessor[0].domain.get_volume() == + out_dim * batch_size); + assert(my_input_grad_accessor[0].domain.get_volume() == + in_dim * batch_size); + LoraLinearMeta *m = (LoraLinearMeta *)metas->meta[op]; + assert(m->input_type[0] == my_input_grad_accessor[0].data_type); + assert(m->output_type[0] == my_output_grad_accessor[0].data_type); + // Assert that the output and the second input are at the same place + // since we ``inplace'' the output for LoRA + assert(my_input_grad_accessor[1].ptr == my_output_grad_accessor[0].ptr); + Kernels::LoraLinear::peft_bwd_kernel_wrapper( + m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + break; + } + case OP_BATCHMATMUL: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + Domain out_domain = my_output_grad_accessor[0].domain; + Domain a_domain = my_input_grad_accessor[0].domain; + Domain b_domain = my_input_grad_accessor[1].domain; + int m = b_domain.hi()[0] - b_domain.lo()[0] + 1; + assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1); + int n = a_domain.hi()[1] - a_domain.lo()[1] + 1; + assert(n == out_domain.hi()[1] - out_domain.lo()[1] + 1); + int k = a_domain.hi()[0] - a_domain.lo()[0] + 1; + assert(k == b_domain.hi()[1] - b_domain.lo()[1] + 1); + assert(a_domain.get_dim() == b_domain.get_dim()); + assert(a_domain.get_dim() == out_domain.get_dim()); + int batch = 1; + for (int i = 2; i < a_domain.get_dim(); i++) { + int dim_size = a_domain.hi()[i] - a_domain.lo()[i] + 1; + assert(dim_size == b_domain.hi()[i] - b_domain.lo()[i] + 1); + assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1); + batch *= dim_size; + } + // TODO: implement me + assert(false); + // BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op]; + // Kernels::BatchMatmul::backward_kernel_wrapper( + // meta, + // my_output_accessor[0].get_float_ptr(), + // my_input_accessor[0].get_float_ptr(), + // my_input_accessor[1].get_float_ptr(), + // (float const *)nullptr, + // m, + // n, + // k, + // batch, + // meta->a_seq_length_dim, + // meta->b_seq_length_dim, + // fused->iter_config.seq_length); + break; + } + case OP_EW_ADD: + case OP_EW_SUB: + case OP_EW_MUL: + case OP_EW_DIV: + case OP_EW_MAX: + case OP_EW_MIN: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain == + my_input_grad_accessor[1].domain); + 
assert(my_input_grad_accessor[0].domain == + my_output_grad_accessor[0].domain); + // ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; + // Kernels::ElementBinary::forward_kernel_wrapper(m, + // my_input_accessor[0], + // my_input_accessor[1], + // my_output_accessor[0]); + break; + } + case OP_EMBEDDING: { + // Currently assume the Embedding layer cannot be finetuned + // so we do nothing for embedding + break; + } + case OP_GELU: + case OP_RELU: + case OP_SIGMOID: + case OP_TANH: + case OP_ELU: + case OP_SCALAR_TRUE_DIV: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain == + my_output_grad_accessor[0].domain); + // TODO: implement me + assert(false); + // ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; + // if (m->data_type == DT_HALF) { + // ElementUnary::forward_kernel_wrapper( + // m, + // my_input_accessor[0].get_half_ptr(), + // my_output_accessor[0].get_half_ptr(), + // my_input_accessor[0].domain.get_volume()); + // } else if (m->data_type == DT_FLOAT) { + // ElementUnary::forward_kernel_wrapper( + // m, + // my_input_accessor[0].get_float_ptr(), + // my_output_accessor[0].get_float_ptr(), + // my_input_accessor[0].domain.get_volume()); + // } else { + // assert(false && "Unsupported data type in ElementUnary forward"); + // } + break; + } + case OP_RMS_NORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 1); + RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op]; + Kernels::RMSNorm::peft_bwd_kernel_wrapper(m, + bc, + my_output_grad_accessor[0], + my_input_grad_accessor[0], + my_weight_accessor[0]); + break; + } + case OP_RESIDUAL_RMS_NORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 2); + ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op]; + Kernels::ResidualRMSNorm::peft_bwd_kernel_wrapper( + m, + bc, + my_input_grad_accessor[0], + my_input_grad_accessor[1], + my_output_grad_accessor[0], + my_output_grad_accessor[1], + my_weight_accessor[0]); + break; + } + case OP_INC_MULTIHEAD_SELF_ATTENTION: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + IncMultiHeadSelfAttentionMeta *m = + (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( + m, + bc, + task->index_point.point_data[0], + my_input_grad_accessor[0], + my_weight_accessor[0], + my_output_grad_accessor[0], + biases); + break; + } + case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: + case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { + // TODO: implement me + assert(false); + break; + } + case OP_LAYERNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + if (m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1 + (int)(m->use_bias)); + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + if (m->use_bias) { + beta = my_weight_accessor[1]; + } + } + LayerNorm::peft_bwd_kernel_wrapper( + m, my_output_grad_accessor[0], my_input_grad_accessor[0], 
gamma); + break; + } + case OP_RESIDUAL_LAYERNORM: { + assert(fused->op_num_outputs[op] == 2); + ResidualLayerNormMeta const *m = + (ResidualLayerNormMeta *)metas->meta[op]; + if (m->use_two_residuals) { + assert(fused->op_num_inputs[op] == 3); + } else { + assert(fused->op_num_inputs[op] == 2); + } + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 0); + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 1); // weight + } else { + assert(fused->op_num_weights[op] == 2); // weight + bias + } + } + GenericTensorAccessorW residual2; + if (m->use_two_residuals) { + residual2 = my_input_grad_accessor[2]; + } + GenericTensorAccessorR gamma; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + } + ResidualLayerNorm::peft_bwd_kernel_wrapper(m, + my_output_grad_accessor[1], + my_input_grad_accessor[0], + my_input_grad_accessor[1], + residual2, + gamma); + break; + } + case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 2); + AddBiasResidualLayerNormMeta const *m = + (AddBiasResidualLayerNormMeta *)metas->meta[op]; + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1); // attn bias + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 2); // attn bias + weight + } else { + assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias + } + } + GenericTensorAccessorR gamma; + if (m->elementwise_affine) { + gamma = my_weight_accessor[1]; + } + + AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( + m, + my_output_grad_accessor[1], + my_input_grad_accessor[0], + my_input_grad_accessor[1], + gamma); + break; + } + case OP_SIGMOID_SILU_MULTI: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta *)metas->meta[op]; + SigmoidSiluMulti::peft_bwd_kernel_wrapper(m, + bc, + my_output_grad_accessor[0], + my_input_grad_accessor[0], + my_input_grad_accessor[1]); + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain.get_volume() == + my_output_grad_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + Kernels::Softmax::peft_bwd_kernel_wrapper( + m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + break; + } + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::peft_bwd_kernel_wrapper( + m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + break; + } + case OP_PARALLEL_IDENTITY: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + ParallelIdentityMeta const *m = (ParallelIdentityMeta *)metas->meta[op]; + Kernels::ParallelIdentity::peft_bwd_kernel_wrapper( + m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + break; + } + default: { + fprintf(stderr, + "Fusion currently does not support type = %d\n", + fused->op_op_type[op]); + assert(false && "Fusion currently does not support type"); + } + } + if (metas->meta[op]->inference_debugging && + !(fused->op_op_type[op] == OP_ALLREDUCE || + fused->op_op_type[op] == OP_PARALLEL_IDENTITY || + fused->op_op_type[op] == OP_REPLICATE || + fused->op_op_type[op] == OP_REPARTITION || + fused->op_op_type[op] == OP_COMBINE)) { + std::vector 
input_accessors_to_save; + std::vector weight_accessors_to_save; + std::vector output_accessors_to_save; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + input_accessors_to_save.push_back(my_input_grad_accessor[i]); + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + weight_accessors_to_save.push_back(my_weight_accessor[i]); + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + output_accessors_to_save.push_back(my_output_grad_accessor[i]); + } + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + FusedOp::save_inference_tensors_to_file(metas->meta[op], + shard_id, + bc, + input_accessors_to_save, + weight_accessors_to_save, + output_accessors_to_save, + false); + } + } +} + +/* + regions[...](I): inputs + regions[...](I): weights + regions[...](O): outputs +*/ +__host__ void FusedOp::forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + // const FusedOp* fused = (FusedOp*) task->args; + FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); + FusedOp const *fused = metas->fused_op; assert(metas->numOperators == fused->numOperators); assert(regions.size() == task->regions.size()); assert((int)regions.size() == @@ -582,11 +1268,6 @@ __host__ void } } - hipStream_t stream; - if (start < fused->numOperators) { - checkCUDA(get_legion_stream(&stream)); - } - int ioff = 0, woff = 0, ooff = 0; for (int op = 0; op < fused->numOperators; op++) { GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; @@ -595,8 +1276,10 @@ __host__ void for (int i = 0; i < fused->op_num_inputs[op]; i++) { int my_off = fused->op_input_idx[i + ioff]; if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { + assert(my_off < fused->numInputs); my_input_accessor[i] = input_accessor[my_off]; } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { + assert(my_off < fused->numOutputs); my_input_accessor[i] = output_accessor[my_off]; } else { assert(false); @@ -604,11 +1287,14 @@ __host__ void } for (int i = 0; i < fused->op_num_weights[op]; i++) { assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); + assert(fused->op_weight_idx[i + woff] < fused->numWeights); my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; } for (int i = 0; i < fused->op_num_outputs[op]; i++) { + int my_off = fused->op_output_idx[i + ooff]; assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); - my_output_accessor[i] = output_accessor[i + ooff]; + assert(my_off < fused->numOutputs); + my_output_accessor[i] = output_accessor[my_off]; } switch (fused->op_op_type[op]) { case OP_CONCAT: { @@ -623,6 +1309,21 @@ __host__ void m->legion_axis); break; } + case OP_CONV2D: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_dim() == 5); + assert(my_weight_accessor[0].domain.get_dim() == 5); + assert(my_output_accessor[0].domain.get_dim() == 5); + Conv2DMeta *m = (Conv2DMeta *)metas->meta[op]; + Kernels::Conv2D::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_weight_accessor[0].get_float_ptr(), + my_weight_accessor[1].get_float_ptr()); + break; + } case OP_BATCHNORM: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -638,6 +1339,16 @@ __host__ void my_weight_accessor[1].get_float_ptr()); break; } + case OP_DROPOUT: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + DropoutMeta *m = (DropoutMeta 
*)metas->meta[op]; + Kernels::Dropout::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); + break; + } case OP_LINEAR: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -648,27 +1359,25 @@ __host__ void assert(my_output_accessor[0].domain.get_volume() == out_dim * batch_size); assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); - void const *bias_ptr = nullptr; + float const *bias_ptr = nullptr; LinearMeta *m = (LinearMeta *)metas->meta[op]; if (fused->op_num_weights[op] == 2) { assert(my_weight_accessor[1].domain.get_volume() == out_dim); if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) { - bias_ptr = my_weight_accessor[1].ptr; + bias_ptr = my_weight_accessor[1].get_float_ptr(); } } else { assert(fused->op_num_weights[op] == 1); } - assert(m->input_type[0] == my_input_accessor[0].data_type); - assert(m->input_type[0] == my_output_accessor[0].data_type); - batch_size = bc->num_active_tokens(); - Kernels::Linear::forward_kernel_wrapper(m, - my_input_accessor[0].ptr, - my_output_accessor[0].ptr, - my_weight_accessor[0].ptr, - bias_ptr, - in_dim, - out_dim, - batch_size); + Kernels::Linear::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_weight_accessor[0].get_float_ptr(), + bias_ptr, + in_dim, + out_dim, + batch_size); break; } case OP_BATCHMATMUL: { @@ -796,124 +1505,78 @@ __host__ void case OP_RELU: case OP_SIGMOID: case OP_TANH: - case OP_ELU: - case OP_SCALAR_TRUE_DIV: { + case OP_ELU: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); assert(my_input_accessor[0].domain == my_output_accessor[0].domain); ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; - if (m->data_type == DT_HALF) { - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_half_ptr(), - my_output_accessor[0].get_half_ptr(), - my_input_accessor[0].domain.get_volume()); - } else if (m->data_type == DT_FLOAT) { - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); - } else { - assert(false && "Unsupported data type in ElementUnary forward"); - } + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); break; } - case OP_RMS_NORM: { + case OP_POOL2D: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op]; - Kernels::RMSNorm::forward_kernel_wrapper(m, - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0]); + Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; + Kernels::Pool2D::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); break; } - case OP_RESIDUAL_RMS_NORM: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_weights[op] == 1); - assert(fused->op_num_outputs[op] == 2); - ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op]; - Kernels::ResidualRMSNorm::forward_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - my_weight_accessor[0], - my_output_accessor[0], - my_output_accessor[1]); + case OP_FLAT: { + 
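The OP_LINEAR branch above picks the bias pointer per shard: when add_bias_only_once is set, only the shard at index point 0 passes a non-null bias to the kernel, presumably so a replicated bias is applied a single time across shards. A small sketch of that selection follows; select_bias and the shard loop are illustrative names, not FlexFlow API.

    #include <cstdio>

    // Return the bias pointer only on the shard that should actually add it.
    float const *select_bias(float const *bias, bool add_bias_only_once, int shard_id) {
      if (!add_bias_only_once || shard_id == 0) {
        return bias;
      }
      return nullptr;
    }

    int main() {
      float bias[4] = {0.1f, 0.2f, 0.3f, 0.4f};
      for (int shard = 0; shard < 2; shard++) {
        std::printf("shard %d adds bias: %s\n", shard,
                    select_bias(bias, /*add_bias_only_once=*/true, shard) ? "yes" : "no");
      }
      return 0;
    }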
assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + Kernels::Flat::forward_kernel_wrapper( + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); break; } - case OP_INC_MULTIHEAD_SELF_ATTENTION: { + case OP_SOFTMAX: { assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - IncMultiHeadSelfAttentionMeta const *m = - (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } - IncMultiHeadSelfAttention::inference_kernel_wrapper( - m, - bc, - task->index_point.point_data[0], - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + Kernels::Softmax::forward_kernel_wrapper( + m, my_input_accessor[0], my_output_accessor[0]); break; } - case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { + case OP_RESHAPE: { assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - TreeIncMultiHeadSelfAttentionMeta *m = - (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op]; - TreeVerifyBatchConfig const &tree_bc = - Future(task->futures[0]).get_result(); - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } - TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( - m, - &tree_bc, - task->index_point.point_data[0], - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); break; } - case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { + case OP_TRANSPOSE: { assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - SpecIncMultiHeadSelfAttentionMeta const *m = - (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op]; - // BeamSearchBatchConfig const *beam_bc = - // (BeamSearchBatchConfig *)task->args; - BeamSearchBatchConfig const &beam_bc = - Future(task->futures[0]).get_result(); - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } - SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + TransposeMeta *m = (TransposeMeta *)metas->meta[op]; + Kernels::Transpose::forward_kernel_wrapper( m, - &beam_bc, - task->index_point.point_data[0], - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + 
my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain, + my_output_accessor[0].domain); break; } case OP_LAYERNORM: { @@ -935,119 +1598,23 @@ __host__ void break; } case OP_RESIDUAL_LAYERNORM: { - assert(fused->op_num_outputs[op] == 2); - ResidualLayerNormMeta const *m = - (ResidualLayerNormMeta *)metas->meta[op]; - if (m->use_two_residuals) { - assert(fused->op_num_inputs[op] == 3); - } else { - assert(fused->op_num_inputs[op] == 2); - } - if (!m->elementwise_affine) { - assert(fused->op_num_weights[op] == 0); - } else { - if (!m->use_bias) { - assert(fused->op_num_weights[op] == 1); // weight - } else { - assert(fused->op_num_weights[op] == 2); // weight + bias - } - } - GenericTensorAccessorR residual2; - if (m->use_two_residuals) { - residual2 = my_input_accessor[2]; - } - GenericTensorAccessorR gamma, beta; - if (m->elementwise_affine) { - gamma = my_weight_accessor[0]; - if (m->use_bias) { - beta = my_weight_accessor[1]; - } - } - ResidualLayerNorm::inference_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - residual2, - my_output_accessor[0], - my_output_accessor[1], - gamma, - beta); + assert(false && "Operator ResidualLayerNorm does not support " + "the forward() task"); break; } case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_outputs[op] == 2); - AddBiasResidualLayerNormMeta const *m = - (AddBiasResidualLayerNormMeta *)metas->meta[op]; - if (!m->elementwise_affine) { - assert(fused->op_num_weights[op] == 1); // attn bias - } else { - if (!m->use_bias) { - assert(fused->op_num_weights[op] == 2); // attn bias + weight - } else { - assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias - } - } - GenericTensorAccessorR gamma, beta; - if (m->elementwise_affine) { - gamma = my_weight_accessor[1]; - if (m->use_bias) { - beta = my_weight_accessor[2]; - } - } - Domain attn_bias_domain = my_weight_accessor[0].domain; - Domain residual_domain = my_input_accessor[1].domain; - int attn_bias_dim = - attn_bias_domain.hi()[0] - attn_bias_domain.lo()[0] + 1; - int residual_volume = residual_domain.get_volume(); - AddBiasResidualLayerNorm::inference_kernel_wrapper( - m, - attn_bias_dim, - residual_volume, - my_input_accessor[0], - my_output_accessor[0], - my_output_accessor[1], - my_input_accessor[1], - my_weight_accessor[0], - gamma, - beta); + assert(false && "Operator AddBiasResidualLayerNorm does not support " + "the forward() task"); break; } case OP_SIGMOID_SILU_MULTI: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_outputs[op] == 1); - SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta *)metas->meta[op]; - SigmoidSiluMulti::inference_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - my_output_accessor[0]); - break; - } - case OP_SOFTMAX: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; - if (m->input_type == DT_HALF) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_half_ptr(), - my_output_accessor[0].get_half_ptr()); - } else if (m->input_type == DT_FLOAT) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); - } + assert(false && "Operator SigmoidSiluMulti does not support " + "the 
forward() task"); break; } - case OP_ALLREDUCE: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; - Kernels::AllReduce::inference_kernel_wrapper( - m, bc, my_input_accessor[0], my_output_accessor[0]); + case OP_RESIDUAL_RMS_NORM: { + assert(false && "Operator ResidualRMSNorm does not support " + "the forward() task"); break; } default: { @@ -1176,9 +1743,6 @@ __host__ void FusedOp::backward_task(Task const *task, } } - hipStream_t stream; - checkCUDA(get_legion_stream(&stream)); - int ioff = 0, woff = 0, ooff = 0; GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; @@ -1202,6 +1766,7 @@ __host__ void FusedOp::backward_task(Task const *task, if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { my_input_accessor[i] = input_accessor[my_off]; my_input_grad_accessor[i] = input_grad_accessor[my_off]; + assert(my_input_grad_accessor[i].domain == my_input_accessor[i].domain); } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { my_input_accessor[i] = output_accessor[my_off]; my_input_grad_accessor[i] = output_grad_accessor[my_off]; @@ -1220,9 +1785,9 @@ __host__ void FusedOp::backward_task(Task const *task, } for (int i = 0; i < fused->op_num_outputs[op]; i++) { assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); - my_output_accessor[i] = output_accessor[fused->op_output_idx[i + ooff]]; - my_output_grad_accessor[i] = - output_grad_accessor[fused->op_output_idx[i + ooff]]; + int my_off = fused->op_output_idx[i + ooff]; + my_output_accessor[i] = output_accessor[my_off]; + my_output_grad_accessor[i] = output_grad_accessor[my_off]; assert(my_output_grad_accessor[i].domain == my_output_accessor[i].domain); } switch (fused->op_op_type[op]) { diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 483028599e..cab28181da 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -14,6 +14,7 @@ */ #include "flexflow/accessor.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/model.h" #include "flexflow/ops/add_bias_residual_layer_norm.h" #include "flexflow/ops/batch_norm.h" @@ -30,6 +31,7 @@ #include "flexflow/ops/kernels/embedding_kernels.h" #include "flexflow/ops/kernels/flat_kernels.h" #include "flexflow/ops/kernels/linear_kernels.h" +#include "flexflow/ops/kernels/lora_linear_kernels.h" #include "flexflow/ops/kernels/pool_2d_kernels.h" #include "flexflow/ops/kernels/reshape_kernels.h" #include "flexflow/ops/kernels/residual_rms_norm_kernels.h" @@ -42,6 +44,7 @@ #include "flexflow/ops/spec_inc_multihead_self_attention.h" #include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/parallel_ops/kernels/allreduce_kernels.h" +#include "flexflow/parallel_ops/kernels/parallel_identity_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { @@ -77,27 +80,32 @@ OpMeta *FusedOp::init_task(Task const *task, regions[...](I): weights regions[...](O): outputs */ -__host__ void FusedOp::forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { +__host__ void + FusedOp::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { // const FusedOp* fused = (FusedOp*) task->args; FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); FusedOp const *fused = metas->fused_op; + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + // Return if no active tokens + if (bc->num_tokens == 0) { + return; + } + 
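The inference_task defined here returns early when the batch has no active tokens; immediately below, the expected region count includes one extra region when the final fused operator is a Softmax, since that operator also maps a gradient output. A compact sketch of that counting rule, using a placeholder helper name:

    #include <cassert>

    // One region per input, weight and output; plus one for the softmax
    // gradient when the fused block ends with a Softmax operator.
    int expected_region_count(int num_inputs, int num_weights, int num_outputs,
                              bool last_op_is_softmax) {
      return num_inputs + num_weights + num_outputs + (last_op_is_softmax ? 1 : 0);
    }

    int main() {
      assert(expected_region_count(3, 2, 2, false) == 7);
      assert(expected_region_count(3, 2, 2, true) == 8);
      return 0;
    }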
assert(metas->numOperators == fused->numOperators); assert(regions.size() == task->regions.size()); - assert((int)regions.size() == - fused->numInputs + fused->numWeights + fused->numOutputs); - // Domain input_domain[MAX_NUM_INPUTS]; - // Domain weight_domain[MAX_NUM_WEIGHTS]; - // Domain output_domain[MAX_NUM_OUTPUTS]; + bool softmax_grad_additional_region = + (fused->op_op_type[fused->numOperators - 1] == OP_SOFTMAX); + assert((int)regions.size() == fused->numInputs + fused->numWeights + + fused->numOutputs + + softmax_grad_additional_region); GenericTensorAccessorR input_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; GenericTensorAccessorW output_accessor[MAX_NUM_OUTPUTS]; assert(fused->numInputs <= MAX_NUM_INPUTS); for (int i = 0; i < fused->numInputs; i++) { - // input_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i].region.get_index_space()); input_accessor[i] = helperGetGenericTensorAccessorRO(fused->input_data_types[i], regions[i], @@ -109,8 +117,6 @@ __host__ void FusedOp::forward_task(Task const *task, int roff = fused->numInputs; assert(fused->numWeights <= MAX_NUM_WEIGHTS); for (int i = 0; i < fused->numWeights; i++) { - // weight_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i + roff].region.get_index_space()); weight_accessor[i] = helperGetGenericTensorAccessorRO(fused->weight_data_types[i], regions[i + roff], @@ -122,8 +128,6 @@ __host__ void FusedOp::forward_task(Task const *task, roff += fused->numWeights; assert(fused->numOutputs <= MAX_NUM_OUTPUTS); for (int i = 0; i < fused->numOutputs; i++) { - // output_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i + roff].region.get_index_space()); output_accessor[i] = helperGetGenericTensorAccessorWO(fused->output_data_types[i], regions[i + roff], @@ -132,6 +136,7 @@ __host__ void FusedOp::forward_task(Task const *task, ctx, runtime); } + roff += fused->numOutputs; // Assert that all meta share the same dnn/blas handler int start = 0; for (start = 0; start < fused->numOperators; start++) { @@ -148,36 +153,39 @@ __host__ void FusedOp::forward_task(Task const *task, int ioff = 0, woff = 0, ooff = 0; for (int op = 0; op < fused->numOperators; op++) { - // Domain my_id[MAX_NUM_INPUTS]; - // Domain my_wd[MAX_NUM_WEIGHTS]; - // Domain my_od[MAX_NUM_OUTPUTS]; +#if 0 + std::cout << get_operator_type_name(fused->op_op_type[op]) << std::endl; +#endif GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; GenericTensorAccessorW my_output_accessor[MAX_NUM_OUTPUTS]; for (int i = 0; i < fused->op_num_inputs[op]; i++) { int my_off = fused->op_input_idx[i + ioff]; if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { - // my_id[i] = input_domain[my_off]; my_input_accessor[i] = input_accessor[my_off]; +#if 0 + printf("\tmy_input_accessor[%i] = input_accessor[%i]\n", i, my_off); +#endif } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { - // my_id[i] = output_domain[my_off]; my_input_accessor[i] = output_accessor[my_off]; +#if 0 + printf("\tmy_input_accessor[%i] = output_accessor[%i]\n", i, my_off); +#endif } else { assert(false); } } for (int i = 0; i < fused->op_num_weights[op]; i++) { assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); - // my_wd[i] = weight_domain[fused->op_weight_idx[i + woff]]; - // my_wp[i] = weight_ptr[fused->op_weight_idx[i + woff]]; my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; } for (int i = 0; i < 
fused->op_num_outputs[op]; i++) { int my_off = fused->op_output_idx[i + ooff]; assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); - // my_od[i] = output_domain[my_off]; - // my_op[i] = output_ptr[my_off]; my_output_accessor[i] = output_accessor[my_off]; +#if 0 + printf("\tmy_output_accessor[%i] = output_accessor[%i]\n", i, my_off); +#endif } switch (fused->op_op_type[op]) { case OP_CONCAT: { @@ -192,21 +200,6 @@ __host__ void FusedOp::forward_task(Task const *task, m->legion_axis); break; } - case OP_CONV2D: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_dim() == 5); - assert(my_weight_accessor[0].domain.get_dim() == 5); - assert(my_output_accessor[0].domain.get_dim() == 5); - Conv2DMeta *m = (Conv2DMeta *)metas->meta[op]; - Kernels::Conv2D::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_weight_accessor[0].get_float_ptr(), - my_weight_accessor[1].get_float_ptr()); - break; - } case OP_BATCHNORM: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -222,16 +215,6 @@ __host__ void FusedOp::forward_task(Task const *task, my_weight_accessor[1].get_float_ptr()); break; } - case OP_DROPOUT: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - DropoutMeta *m = (DropoutMeta *)metas->meta[op]; - Kernels::Dropout::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); - break; - } case OP_LINEAR: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -242,25 +225,48 @@ __host__ void FusedOp::forward_task(Task const *task, assert(my_output_accessor[0].domain.get_volume() == out_dim * batch_size); assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); - float const *bias_ptr = nullptr; + void const *bias_ptr = nullptr; LinearMeta *m = (LinearMeta *)metas->meta[op]; if (fused->op_num_weights[op] == 2) { assert(my_weight_accessor[1].domain.get_volume() == out_dim); if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) { - bias_ptr = my_weight_accessor[1].get_float_ptr(); + bias_ptr = my_weight_accessor[1].ptr; } } else { assert(fused->op_num_weights[op] == 1); } - Kernels::Linear::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_weight_accessor[0].get_float_ptr(), - bias_ptr, - in_dim, - out_dim, - batch_size); + assert(m->input_type[0] == my_input_accessor[0].data_type); + assert(m->input_type[0] == my_output_accessor[0].data_type); + batch_size = bc->num_active_infr_tokens(); + Kernels::Linear::forward_kernel_wrapper(m, + my_input_accessor[0].ptr, + my_output_accessor[0].ptr, + my_weight_accessor[0].ptr, + bias_ptr, + in_dim, + out_dim, + batch_size); + break; + } + case OP_LORA: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + Domain input_domain = my_input_accessor[0].domain; + Domain output_domain = my_output_accessor[0].domain; + int in_dim = input_domain.hi()[0] - input_domain.lo()[0] + 1; + int out_dim = output_domain.hi()[0] - output_domain.lo()[0] + 1; + int batch_size = my_input_accessor[0].domain.get_volume() / in_dim; + assert(my_output_accessor[0].domain.get_volume() == + out_dim * batch_size); + assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); + LoraLinearMeta *m = (LoraLinearMeta *)metas->meta[op]; + assert(m->input_type[0] == 
my_input_accessor[0].data_type); + assert(m->output_type[0] == my_output_accessor[0].data_type); + // Assert that the output and the second input are at the same place + // since we ``inplace'' the output for LoRA + assert(my_input_accessor[1].ptr == my_output_accessor[0].ptr); + Kernels::LoraLinear::inference_kernel_wrapper( + m, bc, my_input_accessor[0], my_output_accessor[0]); break; } case OP_BATCHMATMUL: { @@ -388,88 +394,127 @@ __host__ void FusedOp::forward_task(Task const *task, case OP_RELU: case OP_SIGMOID: case OP_TANH: - case OP_ELU: { + case OP_ELU: + case OP_SCALAR_TRUE_DIV: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); assert(my_input_accessor[0].domain == my_output_accessor[0].domain); ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + if (m->data_type == DT_HALF) { + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_half_ptr(), + my_output_accessor[0].get_half_ptr(), + my_input_accessor[0].domain.get_volume()); + } else if (m->data_type == DT_FLOAT) { + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + } else { + assert(false && "Unsupported data type in ElementUnary forward"); + } break; } - case OP_POOL2D: { + case OP_RMS_NORM: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_weights[op] == 1); assert(fused->op_num_outputs[op] == 1); - // assert(my_input_accessor[0].domain == my_output_accessor[0].domain); - Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; - Kernels::Pool2D::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); + RMSNormMeta *m = (RMSNormMeta *)metas->meta[op]; + Kernels::RMSNorm::inference_kernel_wrapper(m, + bc, + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0]); break; } - case OP_FLAT: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - Kernels::Flat::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + case OP_RESIDUAL_RMS_NORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 2); + ResidualRMSNormMeta *m = (ResidualRMSNormMeta *)metas->meta[op]; + Kernels::ResidualRMSNorm::inference_kernel_wrapper( + m, + bc, + my_input_accessor[0], + my_input_accessor[1], + my_weight_accessor[0], + my_output_accessor[0], + my_output_accessor[1]); break; } - case OP_SOFTMAX: { + case OP_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; - if (m->input_type == DT_HALF) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_half_ptr(), - my_output_accessor[0].get_half_ptr()); - } else if (m->input_type == DT_FLOAT) { - 
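The element-unary branch above dispatches on m->data_type so the same fused path can run half- or float-precision activations. A self-contained sketch of that dispatch pattern follows; the DataType enum, copy_kernel and dispatch are placeholders, and the half branch is omitted since it would need cuda_fp16.h.

    #include <cassert>
    #include <cstddef>

    enum DataType { DT_HALF, DT_FLOAT };   // placeholder, not FlexFlow's enum

    template <typename T>
    void copy_kernel(T const *in, T *out, std::size_t n) {
      // Identity stands in for the real elementwise kernel (relu, gelu, ...).
      for (std::size_t i = 0; i < n; i++) {
        out[i] = in[i];
      }
    }

    void dispatch(DataType dt, void const *in, void *out, std::size_t n) {
      if (dt == DT_FLOAT) {
        copy_kernel(static_cast<float const *>(in), static_cast<float *>(out), n);
      } else {
        assert(false && "data type not handled in this sketch");
      }
    }

    int main() {
      float in[3] = {1.f, 2.f, 3.f}, out[3] = {};
      dispatch(DT_FLOAT, in, out, 3);
      assert(out[2] == 3.f);
      return 0;
    }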
Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); + IncMultiHeadSelfAttentionMeta *m = + (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; } + IncMultiHeadSelfAttention::inference_kernel_wrapper( + m, + bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); break; } - case OP_RESHAPE: { + case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - Kernels::Reshape::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + TreeIncMultiHeadSelfAttentionMeta *m = + (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op]; + TreeVerifyBatchConfig const &tree_bc = + Future(task->futures[0]).get_result(); + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( + m, + &tree_bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); break; } - case OP_TRANSPOSE: { + case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - TransposeMeta *m = (TransposeMeta *)metas->meta[op]; - Kernels::Transpose::forward_kernel_wrapper( + SpecIncMultiHeadSelfAttentionMeta const *m = + (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op]; + // BeamSearchBatchConfig const *beam_bc = + // (BeamSearchBatchConfig *)task->args; + BeamSearchBatchConfig const &beam_bc = + Future(task->futures[0]).get_result(); + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain, - my_output_accessor[0].domain); + &beam_bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); break; } case OP_LAYERNORM: { @@ -491,39 +536,694 @@ __host__ void FusedOp::forward_task(Task const *task, break; } case OP_RESIDUAL_LAYERNORM: { - assert(false && "Operator ResidualLayerNorm does not support " - "the forward() task"); - break; - } - case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { - assert(false && "Operator AddBiasResidualLayerNorm does not support " - "the forward() task"); - break; - } - case OP_SIGMOID_SILU_MULTI: { - assert(false && "Operator SigmoidSiluMulti does not support " - "the forward() task"); - break; - } - case OP_RESIDUAL_RMS_NORM: { - assert(false && 
"Operator ResidualRMSNorm does not support " - "the forward() task"); - break; - } - default: { - fprintf(stderr, - "Fusion currently does not support type = %d\n", - fused->op_op_type[op]); - assert(false && "Fusion currently does not support type"); - } - } - ioff += fused->op_num_inputs[op]; + assert(fused->op_num_outputs[op] == 2); + ResidualLayerNormMeta *m = (ResidualLayerNormMeta *)metas->meta[op]; + if (m->use_two_residuals) { + assert(fused->op_num_inputs[op] == 3); + } else { + assert(fused->op_num_inputs[op] == 2); + } + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 0); + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 1); // weight + } else { + assert(fused->op_num_weights[op] == 2); // weight + bias + } + } + GenericTensorAccessorR residual2; + if (m->use_two_residuals) { + residual2 = my_input_accessor[2]; + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + if (m->use_bias) { + beta = my_weight_accessor[1]; + } + } + ResidualLayerNorm::inference_kernel_wrapper(m, + bc, + my_input_accessor[0], + my_input_accessor[1], + residual2, + my_output_accessor[0], + my_output_accessor[1], + gamma, + beta); + break; + } + case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 2); + AddBiasResidualLayerNormMeta *m = + (AddBiasResidualLayerNormMeta *)metas->meta[op]; + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1); // attn bias + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 2); // attn bias + weight + } else { + assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias + } + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[1]; + if (m->use_bias) { + beta = my_weight_accessor[2]; + } + } + AddBiasResidualLayerNorm::inference_kernel_wrapper( + m, + bc, + my_input_accessor[0], + my_weight_accessor[0], + my_input_accessor[1], + my_output_accessor[0], + my_output_accessor[1], + gamma, + beta); + break; + } + case OP_SIGMOID_SILU_MULTI: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + SigmoidSiluMultiMeta *m = (SigmoidSiluMultiMeta *)metas->meta[op]; + SigmoidSiluMulti::inference_kernel_wrapper(m, + bc, + my_input_accessor[0], + my_input_accessor[1], + my_output_accessor[0]); + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + if (op == fused->numOperators - 1) { // if this is the final operator + output_accessor[fused->numOutputs] = helperGetGenericTensorAccessorWO( + fused->output_data_types[fused->numOutputs - 1], + regions[roff], + task->regions[roff], + FID_DATA, + ctx, + runtime); + } + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + Kernels::Softmax::inference_kernel_wrapper( + m, + bc, + (op == fused->numOperators - 1), + my_input_accessor[0], + my_output_accessor[0], + output_accessor[fused->numOutputs]); + break; + } + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::inference_kernel_wrapper( + m, bc, my_input_accessor[0], my_output_accessor[0]); + break; + } + case OP_PARALLEL_IDENTITY: { + assert(fused->op_num_inputs[op] == 1); + 
assert(fused->op_num_outputs[op] == 1); + ParallelIdentityMeta const *m = (ParallelIdentityMeta *)metas->meta[op]; + Kernels::ParallelIdentity::inference_kernel_wrapper( + m, bc, my_input_accessor[0], my_output_accessor[0]); + break; + } + default: { + fprintf(stderr, + "Fusion currently does not support type = %d\n", + fused->op_op_type[op]); + assert(false && "Fusion currently does not support type"); + } + } + if (metas->meta[op]->inference_debugging && + !(fused->op_op_type[op] == OP_ALLREDUCE || + fused->op_op_type[op] == OP_PARALLEL_IDENTITY || + fused->op_op_type[op] == OP_REPLICATE || + fused->op_op_type[op] == OP_REPARTITION || + fused->op_op_type[op] == OP_COMBINE)) { + std::vector input_accessors_to_save; + std::vector weight_accessors_to_save; + std::vector output_accessors_to_save; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + input_accessors_to_save.push_back(my_input_accessor[i]); + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + weight_accessors_to_save.push_back(my_weight_accessor[i]); + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + output_accessors_to_save.push_back(my_output_accessor[i]); + } + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + FusedOp::save_inference_tensors_to_file(metas->meta[op], + shard_id, + bc, + input_accessors_to_save, + weight_accessors_to_save, + output_accessors_to_save); + } + ioff += fused->op_num_inputs[op]; woff += fused->op_num_weights[op]; ooff += fused->op_num_outputs[op]; } - // for (int i = 0; i < fused->numOutputs; i++) - // print_tensor(output_ptr[i], output_domain[i].get_volume(), - // "[Fused:forward:output]"); + // for (int i = 0; i < fused->numOutputs; i++) + // print_tensor(output_ptr[i], output_domain[i].get_volume(), + // "[Fused:forward:output]"); +} + +/* + regions[...](I): inputs + regions[...](I): weights + regions[...](O): outputs +*/ +__host__ void FusedOp::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + // const FusedOp* fused = (FusedOp*) task->args; + FusedOpMeta *metas = *((FusedOpMeta **)task->local_args); + FusedOp const *fused = metas->fused_op; + // BatchConfig const *bc = (BatchConfig *)task->args; + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + // Return if no active PEFT bwd tokens + if (bc->num_active_peft_tokens() == 0) { + return; + } + + assert(metas->numOperators == fused->numOperators); + assert(regions.size() == task->regions.size()); + assert((int)regions.size() == + fused->numInputs + fused->numWeights + fused->numOutputs); + // Domain input_domain[MAX_NUM_INPUTS]; + // Domain weight_domain[MAX_NUM_WEIGHTS]; + // Domain output_domain[MAX_NUM_OUTPUTS]; + GenericTensorAccessorW input_grad_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorW output_grad_accessor[MAX_NUM_OUTPUTS]; + assert(fused->numInputs <= MAX_NUM_INPUTS); + for (int i = 0; i < fused->numInputs; i++) { + // input_domain[i] = runtime->get_index_space_domain( + // ctx, task->regions[i].region.get_index_space()); + input_grad_accessor[i] = + helperGetGenericTensorAccessorRW(fused->input_data_types[i], + regions[i], + task->regions[i], + FID_DATA, + ctx, + runtime); + } + int roff = fused->numInputs; + assert(fused->numWeights <= MAX_NUM_WEIGHTS); + for (int i = 0; i < fused->numWeights; i++) { + // weight_domain[i] = runtime->get_index_space_domain( + // ctx, task->regions[i + roff].region.get_index_space()); + weight_accessor[i] 
= + helperGetGenericTensorAccessorRO(fused->weight_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + } + roff += fused->numWeights; + assert(fused->numOutputs <= MAX_NUM_OUTPUTS); + for (int i = 0; i < fused->numOutputs; i++) { + // output_domain[i] = runtime->get_index_space_domain( + // ctx, task->regions[i + roff].region.get_index_space()); + output_grad_accessor[i] = + helperGetGenericTensorAccessorRW(fused->output_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + } + // Assert that all meta share the same dnn/blas handler + int start = 0; + for (start = 0; start < fused->numOperators; start++) { + if (metas->meta[start] != NULL) { + break; + } + } + for (int op = start + 1; op < fused->numOperators; op++) { + if (metas->meta[op] != NULL) { + assert(metas->meta[start]->handle.blas == metas->meta[op]->handle.blas); + assert(metas->meta[start]->handle.dnn == metas->meta[op]->handle.dnn); + } + } + + int ioff = 0, woff = 0, ooff = 0; + // Domain my_id[MAX_NUM_INPUTS]; + // Domain my_wd[MAX_NUM_WEIGHTS]; + // Domain my_od[MAX_NUM_OUTPUTS]; + GenericTensorAccessorW my_input_grad_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorW my_output_grad_accessor[MAX_NUM_OUTPUTS]; + + // Do backpropagation in the reverse ordering + for (int op = 0; op < fused->numOperators; op++) { + ioff += fused->op_num_inputs[op]; + woff += fused->op_num_weights[op]; + ooff += fused->op_num_outputs[op]; + } + + for (int op = fused->numOperators - 1; op >= 0; op--) { +#if 0 + std::cout << get_operator_type_name(fused->op_op_type[op]) << std::endl; +#endif + ioff -= fused->op_num_inputs[op]; + woff -= fused->op_num_weights[op]; + ooff -= fused->op_num_outputs[op]; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + int my_off = fused->op_input_idx[i + ioff]; + if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { + // my_id[i] = input_domain[my_off]; + my_input_grad_accessor[i] = input_grad_accessor[my_off]; +#if 0 + printf("\tmy_input_grad_accessor[%i] = input_grad_accessor[%i]\n", i, my_off); +#endif + } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { + // my_id[i] = output_domain[my_off]; + my_input_grad_accessor[i] = output_grad_accessor[my_off]; +#if 0 + printf("\tmy_input_grad_accessor[%i] = output_grad_accessor[%i]\n", i, my_off); +#endif + } else { + assert(false); + } + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); + // my_wd[i] = weight_domain[fused->op_weight_idx[i + woff]]; + // my_wp[i] = weight_ptr[fused->op_weight_idx[i + woff]]; + my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + int my_off = fused->op_output_idx[i + ooff]; + assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); + // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; + // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; + my_output_grad_accessor[i] = output_grad_accessor[my_off]; +#if 0 + printf("\tmy_output_grad_accessor[%i] = output_grad_accessor[%i]\n", i, my_off); +#endif + } + switch (fused->op_op_type[op]) { + case OP_CONCAT: { + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + // TODO: implement this + assert(false); + // ConcatMeta *m = (ConcatMeta *)metas->meta[op]; + // int num_inputs = fused->op_num_inputs[op]; + // 
Kernels::Concat::peft_bwd_kernel_wrapper(m, + // my_output_accessor[0], + // my_input_accessor, + // num_inputs, + // m->legion_axis); + break; + } + case OP_BATCHNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain.get_dim() == 5); + assert(my_output_grad_accessor[0].domain.get_dim() == 5); + assert(my_weight_accessor[0].domain.get_dim() == 2); + assert(my_weight_accessor[1].domain.get_dim() == 2); + // TODO: implement this + assert(false); + // BatchNormMeta *m = (BatchNormMeta *)metas->meta[op]; + // BatchNorm::peft_bwd_kernel_kernel( + // m, + // my_input_accessor[0].get_float_ptr(), + // my_output_accessor[0].get_float_ptr(), + // my_weight_accessor[0].get_float_ptr(), + // my_weight_accessor[1].get_float_ptr()); + break; + } + case OP_LINEAR: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + Domain kernel_domain = my_weight_accessor[0].domain; + int in_dim = kernel_domain.hi()[0] - kernel_domain.lo()[0] + 1; + int out_dim = kernel_domain.hi()[1] - kernel_domain.lo()[1] + 1; + int batch_size = my_input_grad_accessor[0].domain.get_volume() / in_dim; + assert(my_output_grad_accessor[0].domain.get_volume() == + out_dim * batch_size); + assert(my_input_grad_accessor[0].domain.get_volume() == + in_dim * batch_size); + LinearMeta *m = (LinearMeta *)metas->meta[op]; + assert(m->input_type[0] == my_input_grad_accessor[0].data_type); + assert(m->input_type[0] == my_output_grad_accessor[0].data_type); + int num_infr_tokens = bc->num_active_infr_tokens(); + int num_peft_tokens = bc->num_active_peft_tokens(); + Kernels::Linear::peft_bwd_kernel_wrapper(m, + my_input_grad_accessor[0].ptr, + my_output_grad_accessor[0].ptr, + my_weight_accessor[0].ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens); + break; + } + case OP_LORA: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + Domain input_domain = my_input_grad_accessor[0].domain; + Domain output_domain = my_output_grad_accessor[0].domain; + int in_dim = input_domain.hi()[0] - input_domain.lo()[0] + 1; + int out_dim = output_domain.hi()[0] - output_domain.lo()[0] + 1; + int batch_size = my_input_grad_accessor[0].domain.get_volume() / in_dim; + assert(my_output_grad_accessor[0].domain.get_volume() == + out_dim * batch_size); + assert(my_input_grad_accessor[0].domain.get_volume() == + in_dim * batch_size); + LoraLinearMeta *m = (LoraLinearMeta *)metas->meta[op]; + assert(m->input_type[0] == my_input_grad_accessor[0].data_type); + assert(m->output_type[0] == my_output_grad_accessor[0].data_type); + // Assert that the output and the second input are at the same place + // since we ``inplace'' the output for LoRA + assert(my_input_grad_accessor[1].ptr == my_output_grad_accessor[0].ptr); + Kernels::LoraLinear::peft_bwd_kernel_wrapper( + m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + break; + } + case OP_BATCHMATMUL: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + Domain out_domain = my_output_grad_accessor[0].domain; + Domain a_domain = my_input_grad_accessor[0].domain; + Domain b_domain = my_input_grad_accessor[1].domain; + int m = b_domain.hi()[0] - b_domain.lo()[0] + 1; + assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1); + int n = a_domain.hi()[1] - a_domain.lo()[1] + 1; + assert(n == out_domain.hi()[1] - out_domain.lo()[1] + 1); + int k = a_domain.hi()[0] - a_domain.lo()[0] + 1; + 
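peft_bwd_task walks the fused operators in reverse: ioff, woff and ooff are first advanced over every operator and then decremented before each case, so each operator indexes the same flat input/weight/output slots it used in the forward pass. A standalone sketch of that offset bookkeeping, with a made-up three-operator block:

    #include <cstdio>
    #include <vector>

    int main() {
      // Hypothetical per-operator input counts for a fused block of three ops.
      std::vector<int> op_num_inputs = {1, 2, 1};

      // Accumulate the total offset first ...
      int ioff = 0;
      for (int n : op_num_inputs) {
        ioff += n;
      }
      // ... then peel each operator's slice off the end while walking backwards.
      for (int op = (int)op_num_inputs.size() - 1; op >= 0; op--) {
        ioff -= op_num_inputs[op];
        std::printf("op %d reads flat inputs [%d, %d)\n",
                    op, ioff, ioff + op_num_inputs[op]);
      }
      return 0;
    }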
assert(k == b_domain.hi()[1] - b_domain.lo()[1] + 1); + assert(a_domain.get_dim() == b_domain.get_dim()); + assert(a_domain.get_dim() == out_domain.get_dim()); + int batch = 1; + for (int i = 2; i < a_domain.get_dim(); i++) { + int dim_size = a_domain.hi()[i] - a_domain.lo()[i] + 1; + assert(dim_size == b_domain.hi()[i] - b_domain.lo()[i] + 1); + assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1); + batch *= dim_size; + } + // TODO: implement me + assert(false); + // BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op]; + // Kernels::BatchMatmul::backward_kernel_wrapper( + // meta, + // my_output_accessor[0].get_float_ptr(), + // my_input_accessor[0].get_float_ptr(), + // my_input_accessor[1].get_float_ptr(), + // (float const *)nullptr, + // m, + // n, + // k, + // batch, + // meta->a_seq_length_dim, + // meta->b_seq_length_dim, + // fused->iter_config.seq_length); + break; + } + case OP_EW_ADD: + case OP_EW_SUB: + case OP_EW_MUL: + case OP_EW_DIV: + case OP_EW_MAX: + case OP_EW_MIN: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain == + my_input_grad_accessor[1].domain); + assert(my_input_grad_accessor[0].domain == + my_output_grad_accessor[0].domain); + // ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; + // Kernels::ElementBinary::forward_kernel_wrapper(m, + // my_input_accessor[0], + // my_input_accessor[1], + // my_output_accessor[0]); + break; + } + case OP_EMBEDDING: { + // Currently assume the Embedding layer cannot be finetuned + // so we do nothing for embedding + break; + } + case OP_GELU: + case OP_RELU: + case OP_SIGMOID: + case OP_TANH: + case OP_ELU: + case OP_SCALAR_TRUE_DIV: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain == + my_output_grad_accessor[0].domain); + // TODO: implement me + assert(false); + // ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; + // if (m->data_type == DT_HALF) { + // ElementUnary::forward_kernel_wrapper( + // m, + // my_input_accessor[0].get_half_ptr(), + // my_output_accessor[0].get_half_ptr(), + // my_input_accessor[0].domain.get_volume()); + // } else if (m->data_type == DT_FLOAT) { + // ElementUnary::forward_kernel_wrapper( + // m, + // my_input_accessor[0].get_float_ptr(), + // my_output_accessor[0].get_float_ptr(), + // my_input_accessor[0].domain.get_volume()); + // } else { + // assert(false && "Unsupported data type in ElementUnary forward"); + // } + break; + } + case OP_RMS_NORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 1); + RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op]; + Kernels::RMSNorm::peft_bwd_kernel_wrapper(m, + bc, + my_output_grad_accessor[0], + my_input_grad_accessor[0], + my_weight_accessor[0]); + break; + } + case OP_RESIDUAL_RMS_NORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 2); + ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op]; + Kernels::ResidualRMSNorm::peft_bwd_kernel_wrapper( + m, + bc, + my_input_grad_accessor[0], + my_input_grad_accessor[1], + my_output_grad_accessor[0], + my_output_grad_accessor[1], + my_weight_accessor[0]); + break; + } + case OP_INC_MULTIHEAD_SELF_ATTENTION: { + assert(fused->op_num_inputs[op] == 1); + 
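The attention branches below (like the earlier ones in this file) rely on a fixed weight layout: one fused projection-weight tensor is always present, and a second tensor carrying the biases is mapped only when qkv_bias or final_bias is enabled, which is what the 1 + (int)(*m->qkv_bias || *m->final_bias) assertion encodes. A minimal sketch of that counting rule (the helper name is illustrative):

    #include <cassert>

    int attention_weight_count(bool qkv_bias, bool final_bias) {
      // Projection weights always; one extra tensor holds the biases.
      return 1 + ((qkv_bias || final_bias) ? 1 : 0);
    }

    int main() {
      assert(attention_weight_count(false, false) == 1);
      assert(attention_weight_count(true, false) == 2);
      assert(attention_weight_count(false, true) == 2);
      return 0;
    }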
assert(fused->op_num_outputs[op] == 1); + IncMultiHeadSelfAttentionMeta *m = + (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( + m, + bc, + task->index_point.point_data[0], + my_input_grad_accessor[0], + my_weight_accessor[0], + my_output_grad_accessor[0], + biases); + break; + } + case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: + case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { + // TODO: implement me + assert(false); + break; + } + case OP_LAYERNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + if (m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1 + (int)(m->use_bias)); + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + if (m->use_bias) { + beta = my_weight_accessor[1]; + } + } + LayerNorm::peft_bwd_kernel_wrapper( + m, my_output_grad_accessor[0], my_input_grad_accessor[0], gamma); + break; + } + case OP_RESIDUAL_LAYERNORM: { + assert(fused->op_num_outputs[op] == 2); + ResidualLayerNormMeta const *m = + (ResidualLayerNormMeta *)metas->meta[op]; + if (m->use_two_residuals) { + assert(fused->op_num_inputs[op] == 3); + } else { + assert(fused->op_num_inputs[op] == 2); + } + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 0); + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 1); // weight + } else { + assert(fused->op_num_weights[op] == 2); // weight + bias + } + } + GenericTensorAccessorW residual2; + if (m->use_two_residuals) { + residual2 = my_input_grad_accessor[2]; + } + GenericTensorAccessorR gamma; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + } + ResidualLayerNorm::peft_bwd_kernel_wrapper(m, + my_output_grad_accessor[1], + my_input_grad_accessor[0], + my_input_grad_accessor[1], + residual2, + gamma); + break; + } + case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 2); + AddBiasResidualLayerNormMeta const *m = + (AddBiasResidualLayerNormMeta *)metas->meta[op]; + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1); // attn bias + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 2); // attn bias + weight + } else { + assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias + } + } + GenericTensorAccessorR gamma; + if (m->elementwise_affine) { + gamma = my_weight_accessor[1]; + } + + AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( + m, + my_output_grad_accessor[1], + my_input_grad_accessor[0], + my_input_grad_accessor[1], + gamma); + break; + } + case OP_SIGMOID_SILU_MULTI: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta *)metas->meta[op]; + SigmoidSiluMulti::peft_bwd_kernel_wrapper(m, + bc, + my_output_grad_accessor[0], + my_input_grad_accessor[0], + my_input_grad_accessor[1]); + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain.get_volume() == + my_output_grad_accessor[0].domain.get_volume()); + SoftmaxMeta *m = 
(SoftmaxMeta *)metas->meta[op]; + Kernels::Softmax::peft_bwd_kernel_wrapper( + m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + break; + } + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::peft_bwd_kernel_wrapper( + m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + break; + } + case OP_PARALLEL_IDENTITY: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + ParallelIdentityMeta const *m = (ParallelIdentityMeta *)metas->meta[op]; + Kernels::ParallelIdentity::peft_bwd_kernel_wrapper( + m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + break; + } + default: { + fprintf(stderr, + "Fusion currently does not support type = %d\n", + fused->op_op_type[op]); + assert(false && "Fusion currently does not support type"); + } + } + if (metas->meta[op]->inference_debugging && + !(fused->op_op_type[op] == OP_ALLREDUCE || + fused->op_op_type[op] == OP_PARALLEL_IDENTITY || + fused->op_op_type[op] == OP_REPLICATE || + fused->op_op_type[op] == OP_REPARTITION || + fused->op_op_type[op] == OP_COMBINE)) { + std::vector input_accessors_to_save; + std::vector weight_accessors_to_save; + std::vector output_accessors_to_save; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + input_accessors_to_save.push_back(my_input_grad_accessor[i]); + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + weight_accessors_to_save.push_back(my_weight_accessor[i]); + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + output_accessors_to_save.push_back(my_output_grad_accessor[i]); + } + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + FusedOp::save_inference_tensors_to_file(metas->meta[op], + shard_id, + bc, + input_accessors_to_save, + weight_accessors_to_save, + output_accessors_to_save, + false); + } + } } /* @@ -531,35 +1231,22 @@ __host__ void FusedOp::forward_task(Task const *task, regions[...](I): weights regions[...](O): outputs */ -__host__ void - FusedOp::inference_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { +__host__ void FusedOp::forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { // const FusedOp* fused = (FusedOp*) task->args; - FusedOpMeta *metas = *((FusedOpMeta **)task->local_args); + FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); FusedOp const *fused = metas->fused_op; - // BatchConfig const *bc = (BatchConfig *)task->args; - BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - // Return if no active tokens - if (bc->num_tokens == 0) { - return; - } - assert(metas->numOperators == fused->numOperators); assert(regions.size() == task->regions.size()); assert((int)regions.size() == fused->numInputs + fused->numWeights + fused->numOutputs); - // Domain input_domain[MAX_NUM_INPUTS]; - // Domain weight_domain[MAX_NUM_WEIGHTS]; - // Domain output_domain[MAX_NUM_OUTPUTS]; GenericTensorAccessorR input_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; GenericTensorAccessorW output_accessor[MAX_NUM_OUTPUTS]; assert(fused->numInputs <= MAX_NUM_INPUTS); for (int i = 0; i < fused->numInputs; i++) { - // input_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i].region.get_index_space()); input_accessor[i] = helperGetGenericTensorAccessorRO(fused->input_data_types[i], regions[i], 
@@ -571,8 +1258,6 @@ __host__ void int roff = fused->numInputs; assert(fused->numWeights <= MAX_NUM_WEIGHTS); for (int i = 0; i < fused->numWeights; i++) { - // weight_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i + roff].region.get_index_space()); weight_accessor[i] = helperGetGenericTensorAccessorRO(fused->weight_data_types[i], regions[i + roff], @@ -584,8 +1269,6 @@ __host__ void roff += fused->numWeights; assert(fused->numOutputs <= MAX_NUM_OUTPUTS); for (int i = 0; i < fused->numOutputs; i++) { - // output_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i + roff].region.get_index_space()); output_accessor[i] = helperGetGenericTensorAccessorWO(fused->output_data_types[i], regions[i + roff], @@ -610,20 +1293,15 @@ __host__ void int ioff = 0, woff = 0, ooff = 0; for (int op = 0; op < fused->numOperators; op++) { - // Domain my_id[MAX_NUM_INPUTS]; - // Domain my_wd[MAX_NUM_WEIGHTS]; - // Domain my_od[MAX_NUM_OUTPUTS]; GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; GenericTensorAccessorW my_output_accessor[MAX_NUM_OUTPUTS]; for (int i = 0; i < fused->op_num_inputs[op]; i++) { int my_off = fused->op_input_idx[i + ioff]; if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { - // my_id[i] = input_domain[my_off]; assert(my_off < fused->numInputs); my_input_accessor[i] = input_accessor[my_off]; } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { - // my_id[i] = output_domain[my_off]; assert(my_off < fused->numOutputs); my_input_accessor[i] = output_accessor[my_off]; } else { @@ -632,8 +1310,6 @@ __host__ void } for (int i = 0; i < fused->op_num_weights[op]; i++) { assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); - // my_wd[i] = weight_domain[fused->op_weight_idx[i + woff]]; - // my_wp[i] = weight_ptr[fused->op_weight_idx[i + woff]]; assert(fused->op_weight_idx[i + woff] < fused->numWeights); my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; } @@ -641,8 +1317,6 @@ __host__ void int my_off = fused->op_output_idx[i + ooff]; assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); assert(my_off < fused->numOutputs); - // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; - // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; my_output_accessor[i] = output_accessor[my_off]; } switch (fused->op_op_type[op]) { @@ -658,6 +1332,21 @@ __host__ void m->legion_axis); break; } + case OP_CONV2D: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_dim() == 5); + assert(my_weight_accessor[0].domain.get_dim() == 5); + assert(my_output_accessor[0].domain.get_dim() == 5); + Conv2DMeta *m = (Conv2DMeta *)metas->meta[op]; + Kernels::Conv2D::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_weight_accessor[0].get_float_ptr(), + my_weight_accessor[1].get_float_ptr()); + break; + } case OP_BATCHNORM: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -673,6 +1362,16 @@ __host__ void my_weight_accessor[1].get_float_ptr()); break; } + case OP_DROPOUT: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + DropoutMeta *m = (DropoutMeta *)metas->meta[op]; + Kernels::Dropout::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); + break; + } case OP_LINEAR: { assert(fused->op_num_inputs[op] 
== 1); assert(fused->op_num_outputs[op] == 1); @@ -683,27 +1382,25 @@ __host__ void assert(my_output_accessor[0].domain.get_volume() == out_dim * batch_size); assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); - void const *bias_ptr = nullptr; + float const *bias_ptr = nullptr; LinearMeta *m = (LinearMeta *)metas->meta[op]; if (fused->op_num_weights[op] == 2) { assert(my_weight_accessor[1].domain.get_volume() == out_dim); if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) { - bias_ptr = my_weight_accessor[1].ptr; + bias_ptr = my_weight_accessor[1].get_float_ptr(); } } else { assert(fused->op_num_weights[op] == 1); } - assert(m->input_type[0] == my_input_accessor[0].data_type); - assert(m->input_type[0] == my_output_accessor[0].data_type); - batch_size = bc->num_active_tokens(); - Kernels::Linear::forward_kernel_wrapper(m, - my_input_accessor[0].ptr, - my_output_accessor[0].ptr, - my_weight_accessor[0].ptr, - bias_ptr, - in_dim, - out_dim, - batch_size); + Kernels::Linear::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_weight_accessor[0].get_float_ptr(), + bias_ptr, + in_dim, + out_dim, + batch_size); break; } case OP_BATCHMATMUL: { @@ -831,126 +1528,78 @@ __host__ void case OP_RELU: case OP_SIGMOID: case OP_TANH: - case OP_ELU: - case OP_SCALAR_TRUE_DIV: { + case OP_ELU: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + break; + } + case OP_POOL2D: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); - ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; - if (m->data_type == DT_HALF) { - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_half_ptr(), - my_output_accessor[0].get_half_ptr(), - my_input_accessor[0].domain.get_volume()); - } else if (m->data_type == DT_FLOAT) { - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); - } else { - assert(false && "Unsupported data type in ElementUnary forward"); - } + Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; + Kernels::Pool2D::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); break; } - case OP_RMS_NORM: { + case OP_FLAT: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op]; - Kernels::RMSNorm::forward_kernel_wrapper(m, - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0]); - break; - } - case OP_RESIDUAL_RMS_NORM: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_weights[op] == 1); - assert(fused->op_num_outputs[op] == 2); - ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op]; - Kernels::ResidualRMSNorm::forward_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - my_weight_accessor[0], - 
my_output_accessor[0], - my_output_accessor[1]); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + Kernels::Flat::forward_kernel_wrapper( + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); break; } - case OP_INC_MULTIHEAD_SELF_ATTENTION: { + case OP_SOFTMAX: { assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - IncMultiHeadSelfAttentionMeta const *m = - (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } - IncMultiHeadSelfAttention::inference_kernel_wrapper( - m, - bc, - task->index_point.point_data[0], - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + Kernels::Softmax::forward_kernel_wrapper( + m, my_input_accessor[0], my_output_accessor[0]); break; } - case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { + case OP_RESHAPE: { assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - TreeIncMultiHeadSelfAttentionMeta *m = - (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op]; - // TreeVerifyBatchConfig const *tree_bc = - // (TreeVerifyBatchConfig *)task->args; - TreeVerifyBatchConfig const &tree_bc = - Future(task->futures[0]).get_result(); - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } - TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( - m, - &tree_bc, - task->index_point.point_data[0], - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); break; } - case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { + case OP_TRANSPOSE: { assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - SpecIncMultiHeadSelfAttentionMeta const *m = - (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op]; - // BeamSearchBatchConfig const *beam_bc = - // (BeamSearchBatchConfig *)task->args; - BeamSearchBatchConfig const &beam_bc = - Future(task->futures[0]).get_result(); - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } - SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + TransposeMeta *m = (TransposeMeta *)metas->meta[op]; + Kernels::Transpose::forward_kernel_wrapper( m, - &beam_bc, - task->index_point.point_data[0], - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + 
my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain, + my_output_accessor[0].domain); break; } case OP_LAYERNORM: { @@ -972,119 +1621,23 @@ __host__ void break; } case OP_RESIDUAL_LAYERNORM: { - assert(fused->op_num_outputs[op] == 2); - ResidualLayerNormMeta const *m = - (ResidualLayerNormMeta *)metas->meta[op]; - if (m->use_two_residuals) { - assert(fused->op_num_inputs[op] == 3); - } else { - assert(fused->op_num_inputs[op] == 2); - } - if (!m->elementwise_affine) { - assert(fused->op_num_weights[op] == 0); - } else { - if (!m->use_bias) { - assert(fused->op_num_weights[op] == 1); // weight - } else { - assert(fused->op_num_weights[op] == 2); // weight + bias - } - } - GenericTensorAccessorR residual2; - if (m->use_two_residuals) { - residual2 = my_input_accessor[2]; - } - GenericTensorAccessorR gamma, beta; - if (m->elementwise_affine) { - gamma = my_weight_accessor[0]; - if (m->use_bias) { - beta = my_weight_accessor[1]; - } - } - ResidualLayerNorm::inference_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - residual2, - my_output_accessor[0], - my_output_accessor[1], - gamma, - beta); + assert(false && "Operator ResidualLayerNorm does not support " + "the forward() task"); break; } case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_outputs[op] == 2); - AddBiasResidualLayerNormMeta const *m = - (AddBiasResidualLayerNormMeta *)metas->meta[op]; - if (!m->elementwise_affine) { - assert(fused->op_num_weights[op] == 1); // attn bias - } else { - if (!m->use_bias) { - assert(fused->op_num_weights[op] == 2); // attn bias + weight - } else { - assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias - } - } - GenericTensorAccessorR gamma, beta; - if (m->elementwise_affine) { - gamma = my_weight_accessor[1]; - if (m->use_bias) { - beta = my_weight_accessor[2]; - } - } - Domain attn_bias_domain = my_weight_accessor[0].domain; - Domain residual_domain = my_input_accessor[1].domain; - int attn_bias_dim = - attn_bias_domain.hi()[0] - attn_bias_domain.lo()[0] + 1; - int residual_volume = residual_domain.get_volume(); - AddBiasResidualLayerNorm::inference_kernel_wrapper( - m, - attn_bias_dim, - residual_volume, - my_input_accessor[0], - my_output_accessor[0], - my_output_accessor[1], - my_input_accessor[1], - my_weight_accessor[0], - gamma, - beta); + assert(false && "Operator AddBiasResidualLayerNorm does not support " + "the forward() task"); break; } case OP_SIGMOID_SILU_MULTI: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_outputs[op] == 1); - SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta *)metas->meta[op]; - SigmoidSiluMulti::inference_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - my_output_accessor[0]); - break; - } - case OP_SOFTMAX: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; - if (m->input_type == DT_HALF) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_half_ptr(), - my_output_accessor[0].get_half_ptr()); - } else if (m->input_type == DT_FLOAT) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); - } + assert(false && "Operator SigmoidSiluMulti does not support " + "the 
forward() task"); break; } - case OP_ALLREDUCE: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; - Kernels::AllReduce::inference_kernel_wrapper( - m, bc, my_input_accessor[0], my_output_accessor[0]); + case OP_RESIDUAL_RMS_NORM: { + assert(false && "Operator ResidualRMSNorm does not support " + "the forward() task"); break; } default: { @@ -1094,37 +1647,6 @@ __host__ void assert(false && "Fusion currently does not support type"); } } - if (metas->meta[op]->inference_debugging) { - std::vector input_accessors_to_save; - std::vector weight_accessors_to_save; - std::vector output_accessors_to_save; - for (int i = 0; i < fused->op_num_inputs[op]; i++) { - int my_off = fused->op_input_idx[i + ioff]; - if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { - input_accessors_to_save.push_back(input_accessor[my_off]); - } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { - input_accessors_to_save.push_back(output_accessor[my_off]); - } else { - assert(false); - } - } - for (int i = 0; i < fused->op_num_weights[op]; i++) { - assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); - weight_accessors_to_save.push_back( - weight_accessor[fused->op_weight_idx[i + woff]]); - } - for (int i = 0; i < fused->op_num_outputs[op]; i++) { - output_accessors_to_save.push_back(output_accessor[i + ooff]); - } - assert(task->index_point.get_dim() == 1); - int shard_id = task->index_point.point_data[0]; - FusedOp::save_inference_tensors_to_file(metas->meta[op], - shard_id, - bc, - input_accessors_to_save, - weight_accessors_to_save, - output_accessors_to_save); - } ioff += fused->op_num_inputs[op]; woff += fused->op_num_weights[op]; ooff += fused->op_num_outputs[op]; @@ -1156,9 +1678,6 @@ __host__ void FusedOp::backward_task(Task const *task, int sum = fused->numInputs + fused->numWeights + fused->numOutputs; assert(sum * 2 == (int)regions.size()); } - // Domain input_domain[MAX_NUM_INPUTS], input_grad_domain[MAX_NUM_INPUTS]; - // Domain weight_domain[MAX_NUM_WEIGHTS], weight_grad_domain[MAX_NUM_WEIGHTS]; - // Domain output_domain[MAX_NUM_OUTPUTS], output_grad_domain[MAX_NUM_OUTPUTS]; GenericTensorAccessorR input_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorW input_grad_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; @@ -1168,8 +1687,6 @@ __host__ void FusedOp::backward_task(Task const *task, int roff = 0; assert(fused->numInputs <= MAX_NUM_INPUTS); for (int i = 0; i < fused->numInputs; i++) { - // input_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i].region.get_index_space()); input_accessor[i] = helperGetGenericTensorAccessorRO(fused->input_data_types[i], regions[i], @@ -1181,8 +1698,6 @@ __host__ void FusedOp::backward_task(Task const *task, roff += fused->numInputs; assert(fused->numWeights <= MAX_NUM_WEIGHTS); for (int i = 0; i < fused->numWeights; i++) { - // weight_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i + roff].region.get_index_space()); weight_accessor[i] = helperGetGenericTensorAccessorRO(fused->weight_data_types[i], regions[i + roff], @@ -1194,8 +1709,6 @@ __host__ void FusedOp::backward_task(Task const *task, roff += fused->numWeights; assert(fused->numOutputs <= MAX_NUM_OUTPUTS); for (int i = 0; i < fused->numOutputs; i++) { - // output_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i + roff].region.get_index_space()); output_accessor[i] = 
helperGetGenericTensorAccessorRO(fused->output_data_types[i], regions[i + roff], @@ -1206,8 +1719,6 @@ __host__ void FusedOp::backward_task(Task const *task, } roff += fused->numOutputs; for (int i = 0; i < fused->numInputs; i++) { - // input_grad_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i + roff].region.get_index_space()); input_grad_accessor[i] = helperGetGenericTensorAccessorRW(fused->input_data_types[i], regions[i + roff], @@ -1219,8 +1730,6 @@ __host__ void FusedOp::backward_task(Task const *task, } roff += fused->numInputs; for (int i = 0; i < fused->numWeights; i++) { - // weight_grad_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i + roff].region.get_index_space()); weight_grad_accessor[i] = helperGetGenericTensorAccessorRW(fused->weight_data_types[i], regions[i + roff], @@ -1233,8 +1742,6 @@ __host__ void FusedOp::backward_task(Task const *task, } roff += fused->numWeights; for (int i = 0; i < fused->numOutputs; i++) { - // output_grad_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i + roff].region.get_index_space()); output_grad_accessor[i] = helperGetGenericTensorAccessorRW(fused->output_data_types[i], regions[i + roff], @@ -1260,9 +1767,6 @@ __host__ void FusedOp::backward_task(Task const *task, } int ioff = 0, woff = 0, ooff = 0; - // Domain my_id[MAX_NUM_INPUTS], my_grad_id[MAX_NUM_INPUTS]; - // Domain my_wd[MAX_NUM_WEIGHTS], my_grad_wd[MAX_NUM_WEIGHTS]; - // Domain my_od[MAX_NUM_OUTPUTS], my_grad_od[MAX_NUM_OUTPUTS]; GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; GenericTensorAccessorR my_output_accessor[MAX_NUM_OUTPUTS]; @@ -1283,19 +1787,11 @@ __host__ void FusedOp::backward_task(Task const *task, for (int i = 0; i < fused->op_num_inputs[op]; i++) { int my_off = fused->op_input_idx[i + ioff]; if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { - // my_id[i] = input_domain[my_off]; - // my_ip[i] = input_ptr[my_off]; my_input_accessor[i] = input_accessor[my_off]; - // my_grad_id[i] = input_grad_domain[my_off]; - // my_grad_ip[i] = input_grad_ptr[my_off]; my_input_grad_accessor[i] = input_grad_accessor[my_off]; assert(my_input_grad_accessor[i].domain == my_input_accessor[i].domain); } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { - // my_id[i] = output_domain[my_off]; - // my_ip[i] = output_ptr[my_off]; my_input_accessor[i] = output_accessor[my_off]; - // my_grad_id[i] = output_grad_domain[my_off]; - // my_grad_ip[i] = output_grad_ptr[my_off]; my_input_grad_accessor[i] = output_grad_accessor[my_off]; assert(my_input_grad_accessor[i].domain == my_input_accessor[i].domain); } else { @@ -1304,11 +1800,7 @@ __host__ void FusedOp::backward_task(Task const *task, } for (int i = 0; i < fused->op_num_weights[op]; i++) { assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); - // my_wd[i] = weight_domain[fused->op_weight_idx[i + woff]]; - // my_wp[i] = weight_ptr[fused->op_weight_idx[i + woff]]; my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; - // my_grad_wd[i] = weight_grad_domain[fused->op_weight_idx[i + woff]]; - // my_grad_wp[i] = weight_grad_ptr[fused->op_weight_idx[i + woff]]; my_weight_grad_accessor[i] = weight_grad_accessor[fused->op_weight_idx[i + woff]]; assert(my_weight_grad_accessor[i].domain.get_volume() == @@ -1317,11 +1809,7 @@ __host__ void FusedOp::backward_task(Task const *task, for (int i = 0; i < fused->op_num_outputs[op]; i++) { assert(fused->op_output_source[i + ooff] == 
SOURCE_OUTPUT); int my_off = fused->op_output_idx[i + ooff]; - // my_od[i] = output_domain[my_off]; - // my_op[i] = output_ptr[my_off]; my_output_accessor[i] = output_accessor[my_off]; - // my_grad_od[i] = output_grad_domain[my_off]; - // my_grad_op[i] = output_grad_ptr[my_off]; my_output_grad_accessor[i] = output_grad_accessor[my_off]; assert(my_output_grad_accessor[i].domain == my_output_accessor[i].domain); } diff --git a/src/ops/group_by.cc b/src/ops/group_by.cc index f2f402737c..03b9a5199b 100644 --- a/src/ops/group_by.cc +++ b/src/ops/group_by.cc @@ -99,7 +99,7 @@ Group_byParams Group_by::get_params() const { Group_byParams params; params.n = this->n; params.alpha = this->alpha; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -271,7 +271,7 @@ OpMeta *Group_by::init_task(Task const *task, Runtime *runtime) { Group_by *gb = (Group_by *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - GroupByMeta *m = new GroupByMeta(handle, gb->n, gb->alpha); + GroupByMeta *m = new GroupByMeta(handle, gb); m->profiling = gb->profiling; m->inference_debugging = gb->inference_debugging; std::strcpy(m->op_name, gb->name); @@ -579,7 +579,7 @@ bool Group_by::measure_operator_cost(Simulator *sim, } } - GroupByMeta *m = new GroupByMeta(sim->handler, n, alpha); + GroupByMeta *m = new GroupByMeta(sim->handler, this); // allocate sim->free_all(); diff --git a/src/ops/group_by.cpp b/src/ops/group_by.cpp index 761c35f182..9ca6f77898 100644 --- a/src/ops/group_by.cpp +++ b/src/ops/group_by.cpp @@ -188,9 +188,9 @@ void Group_by::backward_kernel_wrapper(GroupByMeta const *m, data_dim); } -GroupByMeta::GroupByMeta(FFHandler handler, int n, float _alpha) - : OpMeta(handler), alpha(_alpha) { - checkCUDA(hipMalloc(&dev_region_ptrs, n * sizeof(float *))); +GroupByMeta::GroupByMeta(FFHandler handler, Group_by const *gb) + : OpMeta(handler, gb), alpha(gb->alpha) { + checkCUDA(hipMalloc(&dev_region_ptrs, gb->n * sizeof(float *))); } GroupByMeta::~GroupByMeta(void) { checkCUDA(hipFree(&dev_region_ptrs)); diff --git a/src/ops/group_by.cu b/src/ops/group_by.cu index 0ed09e20b3..43bcb900df 100644 --- a/src/ops/group_by.cu +++ b/src/ops/group_by.cu @@ -198,9 +198,9 @@ void Group_by::backward_kernel_wrapper(GroupByMeta const *m, } } -GroupByMeta::GroupByMeta(FFHandler handler, int n, float _alpha) - : OpMeta(handler), alpha(_alpha) { - checkCUDA(cudaMalloc(&dev_region_ptrs, n * sizeof(float *))); +GroupByMeta::GroupByMeta(FFHandler handler, Group_by const *gb) + : OpMeta(handler, gb), alpha(gb->alpha) { + checkCUDA(cudaMalloc(&dev_region_ptrs, gb->n * sizeof(float *))); } GroupByMeta::~GroupByMeta(void) { checkCUDA(cudaFree(&dev_region_ptrs)); diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index aa60d0f19c..8219cf9e1f 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -363,7 +363,9 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( dims, quantization_type == DT_NONE ? this->data_type : quantization_type, nullptr /*owner_op*/, - true /*create_grad*/, + model.config.computationMode == COMP_MODE_INFERENCE + ? 
false + : true /*create_grad*/, initializer, CHOSEN_SYNC_TYPE); if (qkv_bias || final_bias) { @@ -871,6 +873,139 @@ void IncMultiHeadSelfAttention::inference_task( } } +FutureMap IncMultiHeadSelfAttention::peft_bwd( + FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + int idx = 0; + IndexLauncher launcher(INC_MULTIHEAD_SELF_ATTENTION_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(idx++, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region, + ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); + launcher.add_field(idx++, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(idx++, FID_DATA); + if (qkv_bias || final_bias) { + launcher.add_region_requirement( + RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region, + ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); + launcher.add_field(idx++, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): input + regions[3](I): weight + regions[4](O): output +*/ +void IncMultiHeadSelfAttention::peft_bwd_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == regions.size()); + + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + log_inc_mha.debug("BatchConfig, num_tokens: %d, num_requests: %d", + bc->num_tokens, + bc->num_active_requests()); + if (bc->num_active_peft_tokens() == 0) { + return; + } + + IncMultiHeadSelfAttentionMeta *m = + *((IncMultiHeadSelfAttentionMeta **)task->local_args); + + assert(((*m->qkv_bias || *m->final_bias) ? 
regions.size() == 4 + : regions.size() == 3)); + + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW output_grad = helperGetGenericTensorAccessorRW( + m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + biases = helperGetGenericTensorAccessorRO(m->weight_type[1], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); + Domain bias_domain = runtime->get_index_space_domain( + ctx, task->regions[3].region.get_index_space()); + assert(bias_domain.get_dim() == 4); + } + + Domain input_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + Domain weight_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + Domain output_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[2].region.get_index_space()); + + assert(input_grad_domain.get_dim() == 4); + assert(weight_domain.get_dim() == 2); + assert(output_grad_domain.get_dim() == 4); + + assert(task->index_point.get_dim() == 1); + + IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( + m, + bc, + task->index_point.point_data[0], + input_grad, + weight, + output_grad, + biases); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + IncMultiHeadSelfAttention::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {weight}, {output_grad}, false); + } +} + void IncMultiHeadSelfAttention::backward(FFModel const &ff) { // IncMultiHeadSelfAttention does not support backward assert(false); @@ -926,7 +1061,7 @@ IncMultiHeadSelfAttentionParams IncMultiHeadSelfAttention::get_params() const { params.quantization_type = this->quantization_type; params.offload = this->offload; params.num_kv_heads = this->num_kv_heads; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index d60386f927..826fea4347 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -12,13 +12,13 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ffconst_utils.h" #include "flexflow/ops/kernels/decompress_kernels.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" +#include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" #include "flexflow/utils/hip_helper.h" -#include +#include "hip/hip_complex.h" #include namespace FlexFlow { @@ -27,9 +27,288 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Memory; +#define WARP_SIZE 32 + namespace Kernels { namespace IncMultiHeadAttention { +template +__device__ __forceinline__ T + WARP_SHFL(unsigned mask, T var, int srcLane, int width = warpSize) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_sync(mask, var, srcLane, width); +#else + return __shfl(var, srcLane, width); +#endif +} + +template +__device__ __forceinline__ T + WARP_SHFL_XOR(unsigned mask, T var, int laneMask, int width = warpSize) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_xor_sync(mask, var, laneMask, width); +#else + return __shfl_xor(var, laneMask, width); +#endif +} + +// gridDim = num_heads +// blockDim = num_tokens/num_request * head_size +// QKV tensor layout: |QKV| * num_new_tokens. |Q=K=V=head_size * num_heads| +// one thread process one head_size +template +__global__ void compute_attention_kernel_generation_kernel( + DT const *query, + DT const *key_cache, + DT const *value_cache, + DT *output_ptr, + float const scale, + int max_seq_length, + int per_head_size, + int hidden_size, + BatchConfig::PerRequestInfo *request_infos) { + + // q, k + using Q_vec = typename VEC_K::Type; + using K_vec = typename VEC_K::Type; + using V_vec = typename VEC_V
::Type; + using Out_sum = typename Vec_fp32_::Type; + + constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; + + // eg. if head_size = 128, thread_per_key = 4, with float32 precision + // then K_VEC_SIZE = 1, QK_VEC_SIZE = 4 + // K_ELTS_PER_THREAD = 128 / 4 = 32 + // K_VECS_PER_THREAD = 32 / 1 = 32 + constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(DT); + // constexpr int QK_VEC_SIZE = 16 / sizeof(DT); + // // constexpr int QK_VEC_SIZE = sizeof(Qk_vec_k) / sizeof(DT); + constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY; + constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; + // constexpr int QK_ELTS_IN_16B = 16 / sizeof(DT); + + // thread id + int const tidx = threadIdx.x; + // head id + int const head_idx = blockIdx.x; + // request idx + int const request_idx = blockIdx.y; + + int const batch_config_request_id = + request_infos[request_idx].batch_config_request_id; + + int const first_step = 0; + + int const tlength = + request_infos[batch_config_request_id].first_token_depth_in_request + + request_infos[batch_config_request_id].num_tokens_in_batch; + + // shared memory objects + extern __shared__ char smem_[]; + + float *qk_smem = reinterpret_cast(smem_); + float *out_smem = reinterpret_cast(smem_); + + float qk_max = -FLT_MAX; + + // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum + __shared__ float red_smem[WARPS_PER_BLOCK * 2]; + + const DT *q_ptr = query + request_idx * hidden_size * QKV_WEIGHT_NUM + + head_idx * per_head_size; + __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; + // DT const *q_ptr = + // query + request_idx * Dh * QKV_WEIGHT_NUM + head_idx * per_head_size; + + // q tensor in this thread + // if THREADS_PER_KEY is 4, first thread load 0, 4, 8, 12..., total + // K_VECS_PER_THREAD elements + // QK_vec_k: 32->1, 64->2, 128->4... head_size + // K_vec_k: 4->1, 2->2, 1->4 threads_per_key + + // the start offset of the element eg. (0, 1, 2, 3) * K_VEC_SIZE + int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE; + int ki_o = tidx % THREADS_PER_KEY; + // the first key's offset for this thread + // ko = 0, 0, 0, 0, 1, 1, 1, 1, .... + int ko = tidx / THREADS_PER_KEY; + // load q tensor + Q_vec q_vec[K_VECS_PER_THREAD]; +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + q_vecs[ki_o][ii] = *reinterpret_cast( + q_ptr + ki + ii * THREADS_PER_KEY * K_VEC_SIZE); + } + __syncthreads(); + // first iter = 128 / 4 = 32 + // K_VECS_PER_THREAD = 32 + // K_PER_ITER how many keys in this loop + // The number of timesteps loaded per iteration. + constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY; + // // The number of keys per warp. + constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; + + DT const *k_cache_batch = + key_cache + batch_config_request_id * max_seq_length * hidden_size + ki; + + int ti_end = + div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; + // get k, perform qk proj + + for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { + K_vec k[K_VECS_PER_THREAD]; + int const ti_circ = ti % max_seq_length; +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; + if (ti < tlength) { + k[ii] = *reinterpret_cast(k_cache_batch + + ti_circ * hidden_size + + head_idx * per_head_size + jj); + } + // Compute dot product. + // This includes a reduction across the threads in the same thread group. 
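+      // (Descriptive note, added for clarity: each group of THREADS_PER_KEY
+      //  threads cooperates on one key/timestep, and the Qk_dot helper invoked
+      //  below combines the group's partial products into a single qk score.)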
+ } + float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); + // // todo add positional embedding to the qk production + // // Store the product to shared memory. There's one qk value per + // timestep. + // // Update the max. + if (ti < tlength && tidx % THREADS_PER_KEY == 0) { + // todo add alobi here + bool const mask = ti_circ >= tlength; + if (mask) { + assert(false); + } + qk_max = mask ? qk_max : fmaxf(qk_max, qk); + qk_smem[ti - first_step] = mask ? 0.f : qk; + } + } + + __syncthreads(); + +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { + qk_max = fmaxf(qk_max, WARP_SHFL_XOR(uint32_t(-1), qk_max, mask)); + } + + // Decompose the thread index into warp and lane. + int const warp = tidx / WARP_SIZE; + int const lane = tidx % WARP_SIZE; + + // The warp leader writes the max to shared memory. + if (lane == 0) { + red_smem[warp] = qk_max; + } + + // Make sure the products are in shared memory. + __syncthreads(); + + // The warps finalize the reduction. + qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; +#pragma unroll + for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { + qk_max = fmaxf(qk_max, WARP_SHFL_XOR(uint32_t(-1), qk_max, mask)); + } + + // Broadcast to all the threads in the warp. + qk_max = WARP_SHFL(uint32_t(-1), qk_max, 0); + + float exp_sum = 0.f; + for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { + float logit = __expf(qk_smem[ti - first_step] - qk_max); + exp_sum += logit; + qk_smem[ti - first_step] = logit; + } + + // Compute the sum. + exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); + + // softmax + float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); + for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { + qk_smem[ti - first_step] *= inv_sum; + } + + __syncthreads(); + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + // printf("softmax %.10f\n", qk_smem[0]); + // } + + // value projection + constexpr int V_VEC_SIZE = 16 / sizeof(DT); + // A vector of V elements for the current timestep. + // using V_vec_k = typename V_vec_k_::Type; + // using V_vec_acum = typename V_vec_acum_fp32_::Type; + + // The value computed by this thread. + int vo = tidx / THREADS_PER_VALUE; + // The hidden dimensions computed by this particular thread. + int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; + constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; + + Out_sum out; + zero(out); + + // The base pointer for the value in the cache buffer. + DT const *v_cache_batch = + value_cache + batch_config_request_id * max_seq_length * hidden_size + vi; + + if (Dh == Dh_MAX || vi < Dh) { + for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) { + // Load the values from the cache. + int const ti_circ = ti % max_seq_length; + + V_vec v = *reinterpret_cast( + v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); + float logit = qk_smem[ti - first_step]; + out = FlexFlow::fma(logit, cast_to_float(v), out); + } + } + + // // Make sure we can start writing to shared memory. + __syncthreads(); + + // Run the final reduction amongst the different groups computing different + // partial outputs. + if (Dh == Dh_MAX || vi < Dh) { +#pragma unroll + for (int active_groups = V_PER_ITER; active_groups >= 2; + active_groups /= 2) { + + // The midpoint in the number of active groups. + int midpoint = active_groups / 2; + + // The upper part of active threads store to shared memory. 
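+      // (Descriptive note, added for clarity: this is a tree reduction over the
+      //  value groups -- each iteration halves active_groups, the upper half
+      //  spills its partial Out_sum into out_smem, and the lower half accumulates.)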
+ if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) { + *reinterpret_cast(out_smem + (vo - midpoint) * Dh + vi) = + out; + } + __syncthreads(); + + // The bottom warps update their values. + if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) { + out = add(*reinterpret_cast(out_smem + vo * Dh + vi), + out); + } + __syncthreads(); + } + } + + // Output the final values. + if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { + convert_from_float( + *reinterpret_cast(output_ptr + request_idx * hidden_size + + head_idx * per_head_size + vi), + out); + } +} + // only used by MPT model. https://arxiv.org/abs/2108.12409 template __global__ void apply_position_bias_qkprd(DT *input_ptr, @@ -86,8 +365,10 @@ __global__ void apply_proj_bias_qkv(DT *input_ptr, // int qkv_index = i / (num_tokens * qProjSize) % 3; int token_idx = i / (hidden_size * QKV_WEIGHT_NUM); - size_t in_token_idx = i - token_idx * hidden_size * 3; + size_t in_token_idx = i - token_idx * hidden_size * QKV_WEIGHT_NUM; + int qkv_index = in_token_idx / hidden_size; + int proj_size = qkv_index == 0 ? qProjSize : kProjSize; int head_idx = @@ -109,6 +390,7 @@ __global__ void apply_proj_bias_qkv(DT *input_ptr, } } } + template __global__ void scaling_query_kernel(DT *input_ptr, int qProjSize, @@ -158,6 +440,10 @@ __global__ void int token_idx = (real_i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2); size_t pos = tokenInfos[token_idx].abs_depth_in_request; + + // float before_real = complex_input[i].x, before_complex = + // complex_input[i].y; + int pos_i = real_i % (proj_size / 2); float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size)); hipFloatComplex complex_pos = {cos(freq), sin(freq)}; @@ -189,7 +475,7 @@ __global__ void int head_idx = (real_i - (token_idx * (hidden_size / 2))) / (proj_size / 2); int real_part_index = idx + head_idx * proj_size + - token_idx * hidden_size * 3 + + token_idx * hidden_size * QKV_WEIGHT_NUM + hidden_size * (q_tensor ? 0 : 1); int complex_part_index = real_part_index + (proj_size / 2); @@ -217,28 +503,59 @@ __global__ void } template -__global__ void store_kv_cache(DT const *devQKVProjArray, - DT *kCache_ptr, - DT *vCache_ptr, +__global__ void + apply_rotary_embedding_bwd(DT *input_ptr, + hipFloatComplex *complex_input, BatchConfig::PerTokenInfo const *tokenInfos, + int proj_size, int num_tokens, - int max_seq_len, int hidden_size) { CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { - int token_idx = i / hidden_size; - int offset = i % hidden_size; - size_t val_idx = token_idx * 3 * hidden_size + hidden_size + offset; - DT kVal = devQKVProjArray[val_idx]; - DT vVal = devQKVProjArray[val_idx + hidden_size]; + // compute indexes to visit first half proj_size of each of q/k tensor. + // devQKVProj has shape [num_tokens, qProjSize, num_heads, 3] in peft_bwd + bool q_tensor = i < (num_tokens * hidden_size / 2); + int real_i = q_tensor ? i : i - num_tokens * hidden_size / 2; + assert(hidden_size % proj_size == 0); + int num_heads = hidden_size / proj_size; + + int token_idx = real_i % num_tokens; + int idx = (real_i / num_tokens) % (proj_size / 2); + int head_idx = real_i / (num_tokens * proj_size / 2); + assert(head_idx < num_heads); - int const req_id = tokenInfos[token_idx].request_index; - int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + int complex_part_index = (q_tensor ? 
0 : 1) * num_tokens * hidden_size + + head_idx * num_tokens * proj_size + + idx * num_tokens + token_idx; + int real_part_index = complex_part_index + (proj_size / 2) * num_tokens; - // key cache - kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + - offset] = kVal; - vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + - offset] = vVal; + complex_input[i] = {input_ptr[real_part_index], + input_ptr[complex_part_index]}; + + size_t pos = tokenInfos[token_idx].abs_depth_in_request; + + float freq = pos * (1.0 / pow(10000.0, (float)2 * idx / proj_size)); + hipFloatComplex complex_pos = {cos(freq), sin(freq)}; + + complex_input[i] = hipCmulf(complex_input[i], complex_pos); + input_ptr[real_part_index] = complex_input[i].x; + input_ptr[complex_part_index] = complex_input[i].y; + } +} + +template +__global__ void fill_entries_above_diagonal(DT *matrix, + size_t num_rows, + size_t num_cols, + size_t num_q_heads, + size_t entries_above_diagonal, + DT value) { + CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_q_heads) { + size_t head_idx = i / entries_above_diagonal; + size_t entry_idx = i % entries_above_diagonal; + size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; + size_t x = entry_idx - y * (y + 1) / 2; + y += (num_cols - num_rows) + 1; + matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; } } @@ -254,56 +571,68 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, checkCUDA(hipblasSetStream(m->handle.blas, stream)); checkCUDNN(miopenSetStream(m->handle.dnn, stream)); - DT alpha = 1.0f, beta = 0.0f; assert(m->qSize == m->vSize && m->qSize == m->kSize); - hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - hipblasDatatype_t compute_type = hipblas_data_type; -#else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - hipblasDatatype_t compute_type = hipblas_data_type; -#endif - // Compute (W^T)x matmul: einsum(ijkl,im->jmkl) - // Weights: qSize x qProjSize x 3 x num_q_heads - // Input: qSize x num_tokens - // Output >>> qProjSize x num_tokens x 3 x num_q_heads - int m_q = m->qProjSize * m->num_q_heads; - int m_k = m->kProjSize * m->num_q_heads; - int m_v = m->vProjSize * m->num_q_heads; - assert(m_q == m_k && m_k == m_v); // keep things simple for now - int n = bc->num_active_tokens(); - int k = m->qSize; - int m_ = m_q * QKV_WEIGHT_NUM; - int lda = k, ldb = k, ldc = m_; - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_N, - m_, - n, - k, - &alpha, - weight_ptr, - hipblas_data_type, - lda, - input_ptr, - hipblas_data_type, - ldb, - &beta, - output_ptr, - hipblas_data_type, - ldc, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - - // apply rotary emmmbedding for q and k - // step1 change the k, v to complex tensor + hipblasDatatype_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + hipblasDatatype_t compute_type = cublas_data_type; + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipblasDatatype_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // hipblasDatatype_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif + + // Step 1: Compute QKV projections + { + DT alpha = 1.0f, beta = 
0.0f; + // after transpositions + int m_q = m->qProjSize * m->num_q_heads; + int m_k = m->kProjSize * m->num_q_heads; + int m_v = m->vProjSize * m->num_q_heads; + assert(m_q == m_k && m_k == m_v); // keep things simple for now + int n = bc->num_active_infr_tokens(); + int k = m->qSize; + int m_ = m_q * QKV_WEIGHT_NUM; + // before transpositions + int lda = k, ldb = k, ldc = m_; + // matrix A: QKV weights + // matrix A's layout: [qSize (hidden_dim), qProjSize, num_heads, 3] + // matrix B: input + // matrix B's layout: [qSize (hidden_dim), num_new_tokens] + // matrix C: devQKVProjArray + // matrix B's layout: [qProjSize, num_heads, 3, num_new_tokens] + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + m_, + n, + k, + &alpha, + weight_ptr, + cublas_data_type, + lda, + input_ptr, + cublas_data_type, + ldb, + &beta, + output_ptr, + cublas_data_type, + ldc, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + int num_tokens = bc->num_active_tokens(); int parallelism = m->kProjSize * num_tokens * m->num_q_heads; size_t q_array_size = m->qProjSize * num_tokens * m->num_q_heads; - // apply bias for q, k, v + + // Step 2: apply bias for QKV, or scale the query if (*m->qkv_bias) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_qkv
), + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_qkv), GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), 0, @@ -321,7 +650,7 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, m->scaling_factor, m->hidden_size); } else if (m->scaling_query) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(scaling_query_kernel
), + hipLaunchKernelGGL(HIP_KERNEL_NAME(scaling_query_kernel), GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), 0, @@ -333,10 +662,12 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, m->scaling_factor, m->hidden_size); } + + // Step 3: apply rotary embedding if needed if (*m->apply_rotary_embedding) { /*q&k*/ parallelism = num_tokens * m->hidden_size; - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_rotary_embedding_hf
), + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_rotary_embedding_hf), GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), 0, @@ -352,14 +683,42 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, } } +template +__global__ void store_kv_cache(DT const *devQKVProjArray, + DT *kCache_ptr, + DT *vCache_ptr, + BatchConfig::PerTokenInfo const *tokenInfos, + int num_tokens, + int max_seq_len, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / hidden_size; + int offset = i % hidden_size; + + size_t val_idx = + token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; + + DT kVal = devQKVProjArray[val_idx]; + DT vVal = devQKVProjArray[val_idx + hidden_size]; + int const req_id = tokenInfos[token_idx].request_index; + int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + + // key cache + kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + offset] = kVal; + vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + offset] = vVal; + } +} + template void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, hipStream_t stream) { - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); if (num_tokens > 0) { int parallelism = m->hidden_size * num_tokens; - hipLaunchKernelGGL(HIP_KERNEL_NAME(store_kv_cache
), + hipLaunchKernelGGL(HIP_KERNEL_NAME(store_kv_cache), GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), 0, @@ -374,6 +733,129 @@ void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, } } +template +void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT *output_ptr, + DT const *weight_ptr, + DT const *bias_ptr, + int num_tokens, + hipStream_t stream) { + hipblasDatatype_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + miopenDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); + assert(data_type_size(m->output_type[0]) == sizeof(DT)); +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + hipblasDatatype_t compute_type = HIPBLAS_R_16F; +#else + hipblasDatatype_t compute_type = cublas_data_type; +#endif + // Project to output, save result directly on output tensor + { + DT alpha = 1.0f, beta = 0.0f; + // after transpositions + int m_ = m->oProjSize; + int k = m->vProjSize * m->num_q_heads; + int n = num_tokens; + // before transpositions + int lda = k, ldb = k, ldc = m_; + // matrix A: output projection weight + // matrix A's layout: [vProjSize * num_heads, oProjSize] + DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + + m->kProjSize * m->num_q_heads + + m->vProjSize * m->num_q_heads); + // matrix B: attn heads + // matrix B's layout: [vProjSize * num_heads, num_new_tokens] + DT const *B = static_cast
(m->attn_heads); + // matrix B: output + // matrix B's layout: [oProjSize, num_new_tokens] + DT *C = static_cast
(output_ptr); + + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + B, + cublas_data_type, + ldb, + &beta, + C, + cublas_data_type, + ldc, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + // Add final output bias + if (*m->final_bias && shard_id == 0) { + int parallelism = m->oProjSize * num_tokens; + int qkv_weight_size = m->qProjSize * m->global_num_q_heads + + m->kProjSize * m->global_num_q_heads + + m->vProjSize * m->global_num_q_heads; + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_w), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + output_ptr, + bias_ptr, + num_tokens, + qkv_weight_size, + m->oProjSize); + } +} + +#define LAUNCH_ATTENTION_SCORE_KERNEL( \ + DT, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE, THDS_PER_BLOCK, stream) \ + smem_sz = smem_size_in_bytes
(m->qProjSize, \ + BatchConfig::max_sequence_length(), \ + THREADS_PER_VALUE, \ + THDS_PER_BLOCK); \ + compute_attention_kernel_generation_kernel \ + <<>>( \ + static_cast
(m->devQKVProjArray), \ + static_cast
(m->keyCache), \ + static_cast
(m->valueCache), \ + output_ptr, \ + scale, \ + BatchConfig::max_sequence_length(), \ + m->qProjSize, \ + m->hidden_size, \ + m->request_infos) + +template +void compute_attention_kernel_generation(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + DT *output_ptr, + hipStream_t stream) { + dim3 grid(m->num_q_heads, bc->num_generation_tokens); + int const per_head_size = m->qProjSize; + float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; + size_t smem_sz; + if (per_head_size == 64) { + constexpr int THREADS_PER_VALUE_64 = threads_per_value_t::value; + LAUNCH_ATTENTION_SCORE_KERNEL( + DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream); + } else if (per_head_size == 128) { + constexpr int THREADS_PER_VALUE_128 = threads_per_value_t::value; + LAUNCH_ATTENTION_SCORE_KERNEL( + DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream); + } else { + assert(false && "a unsupported head size"); + } +} + template void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, GenericTensorAccessorR const weight, @@ -393,27 +875,29 @@ void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, if (m->quantization_type == DT_INT4) { int parallelism = m->qProjSize * m->qSize * m->num_q_heads / 2; - decompress_int4_attention_weights<<>>( - m->quantized_weight_ptr, - static_cast
(m->weight_ptr), - m->qProjSize, - m->qSize, - m->num_q_heads); + hipLaunchKernelGGL(HIP_KERNEL_NAME(decompress_int4_attention_weights), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + m->quantized_weight_ptr, + static_cast
(m->weight_ptr), + m->qProjSize, + m->qSize, + m->num_q_heads); } else { assert(m->quantization_type == DT_INT8); int parallelism = m->qProjSize * m->qSize * m->num_q_heads; - decompress_int8_attention_weights<<>>( - m->quantized_weight_ptr, - static_cast
(m->weight_ptr), - m->qProjSize, - m->qSize, - m->num_q_heads); + hipLaunchKernelGGL(HIP_KERNEL_NAME(decompress_int8_attention_weights), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + m->quantized_weight_ptr, + static_cast
(m->weight_ptr), + m->qProjSize, + m->qSize, + m->num_q_heads); } } else { if (data_type == DT_FLOAT) { @@ -435,7 +919,7 @@ void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, } template -void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, +void inference_kernel(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, DT const *input_ptr, @@ -443,19 +927,13 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, DT *output_ptr, DT const *bias_ptr, hipStream_t stream) { - // here because we need postion info in infernece 1 if (m->offload && m->biasSize > 0) { checkCUDA(hipMemcpyAsync( m->bias_ptr, bias_ptr, m->biasSize, hipMemcpyHostToDevice, stream)); bias_ptr = static_cast
(m->bias_ptr); } - checkCUDA(hipMemcpyAsync(m->token_infos, - &(bc->tokensInfo), - bc->num_active_tokens() * - sizeof(BatchConfig::PerTokenInfo), - hipMemcpyHostToDevice, - stream)); + // phase 1: Implement kernel to compute KQV for input tokens compute_qkv_kernel(m, bc, @@ -465,14 +943,520 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, static_cast
(m->devQKVProjArray), bias_ptr, stream); - - // phase 2: Update key/val cache update_kv_cache_kernel
(m, bc, stream); - // phase 3: Compute attention score - // 3 kernels for pahse 3: matmul1 - softmax - matmal2 - compute_attention_kernel( - m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); + if (bc->num_generation_tokens > 0) { + // phase 3: Compute attention score for generation tokens + compute_attention_kernel_generation
( + m, bc, static_cast<DT *>
(m->attn_heads), stream); + } + + if (bc->num_tokens > bc->num_generation_tokens) { + // phase 4: Compute attention score for prompt tokens; + compute_attention_kernel_prompt( + m, bc, shard_id, bias_ptr, weight_ptr, stream); + } + + // compute output production and bias together for all tokens + int num_tokens = bc->num_active_tokens(); + compute_o_prod_bias( + m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); +} + +std::string get_peft_dbg_folder(IncMultiHeadSelfAttentionMeta const *m, + int shard_id) { + std::string op_name_without_uid = + IncMultiHeadSelfAttention::get_op_name_without_uid(m); + fs::path dst_filepath = get_dst_folder("bwd", m->bwd_step, shard_id); + if (m->layer_guid.model_id > 0) { + assert(false && "Model ID > 0 not supported yet"); + } + std::string layername = "layers." + + std::to_string(m->layer_guid.transformer_layer_id) + + "." + op_name_without_uid; + dst_filepath /= layername; + return dst_filepath.string(); +} + +template +void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT *input_grad_ptr, + DT const *weight_ptr, + DT const *output_grad_ptr, + DT const *bias_ptr, + hipStream_t stream) { + assert(!m->offload); + checkCUDA(hipblasSetStream(m->handle.blas, stream)); + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + hipblasDatatype_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + miopenDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); + assert(data_type_size(m->output_type[0]) == sizeof(DT)); + hipblasDatatype_t compute_type = cublas_data_type; + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipblasDatatype_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // hipblasDatatype_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + int num_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int num_total_tokens = bc->requestsInfo[i].first_token_depth_in_request + + bc->requestsInfo[i].num_tokens_in_batch; + // Currently assume we are calculating gradients for all tokens + // of a request + assert(num_tokens == num_total_tokens); + int kt_block_size = m->kProjSize; + int kt_req_block_size = + kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int vt_block_size = m->vProjSize; + int vt_req_block_size = + vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + assert(m->qProjSize == m->kProjSize && m->kProjSize == m->vProjSize); + // Step 1: compute gradients before final projection + { + int m_ = m->vProjSize * m->num_q_heads; + int n_ = num_tokens; + int k_ = m->oProjSize; + int lda = m_; + int ldb = k_; + int ldc = m_; + float alpha = 1.0f, beta = 0.0f; + // matrix A: output projection weight + // matrix A's layout: [vProjSize * num_heads, oProjSize] + DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + + m->kProjSize * m->num_q_heads + + m->vProjSize * m->num_q_heads); + // matrix B: output gradients + // matrix B's layout: [oProjSize, num_new_tokens] + DT const *B = + output_grad_ptr + + bc->requestsInfo[i].first_token_offset_in_batch * m->oProjSize; + // 
matrix C: attn_heads gradients + // matrix C's layout: [vProjSize * num_heads, num_new_tokens] + DT *C = static_cast<DT *>
(m->handle.workSpace); + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_N, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + B, + cublas_data_type, + ldb, + &beta, + C, + cublas_data_type, + ldc, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + if (m->inference_debugging) { + // save result to file for checking + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".o_proj.input_gradient_0"; + save_tensor(C, m_ * n_, filename.c_str()); + } + } + // Step 2: compute gradients w.r.t. value + { + float alpha = 1.0f, beta = 0.0f; + // matrix A: qk_prods_softmax + // matrix A's layout: [num_new_tokens, total_tokens, num_heads] + DT const *A = static_cast
(m->qk_prods_softmax); + // matrix B: attn_heads gradients + // matrix B's layout: [vProjSize * num_heads, num_new_tokens] + DT const *B = static_cast<DT *>
(m->handle.workSpace); + // matrix C: gradients for value (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = static_cast<DT *>
(m->devQKVProjArray) + + 2 * num_tokens * + (m->qProjSize * m->num_q_heads); // skip over regions reserved + // for Q and K gradients + // after transpositions + int m_ = num_tokens; // total_tokens + int n_ = m->vProjSize; // num_new_tokens + int k_ = num_tokens; // num_new_tokens + // before transpositions + int lda = num_tokens; // num_new_tokens + int ldb = m->vProjSize * m->num_q_heads; + int ldc = num_tokens; // total_tokens + // N.B. strides are applied before transpose operations + int strideA = num_tokens * num_tokens; // num_new_tokens * total_tokens + int strideB = m->vProjSize; + int strideC = num_tokens * m->vProjSize; + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + // save result to file for checking + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".v_proj.input_gradient_0"; + save_tensor(C, m_ * n_ * m->num_q_heads, filename.c_str()); + std::string filename2 = + get_peft_dbg_folder(m, shard_id) + ".qk_prods.softmax"; + save_tensor(A, m_ * k_ * m->num_q_heads, filename2.c_str()); + } + } + // Step 3: compute gradients w.r.t. the qk_prods_softmax tensor + { + float alpha = 1.0f, beta = 0.0f; + // matrix A: attn_heads gradients + // matrix A's layout: [vProjSize * num_heads, num_new_tokens] + DT const *A = static_cast
(m->handle.workSpace); + // matrix B: value cache + // matrix B's layout: [vProjSize * num_heads, max_num_tokens, num_req] + DT const *B = static_cast<DT *>
(m->valueCache) + i * vt_req_block_size; + // matrix C: qk_prods_softmax gradients + // matrix C's layout: [num_new_tokens, total_tokens, num_heads] + DT *C = static_cast<DT *>
(m->qk_prods_softmax); + // after transposition & striding + int m_ = num_tokens; // num_new_tokens + int n_ = num_tokens; + int k_ = m->vProjSize; + // before transposition and striding + int lda = m->vProjSize * m->num_q_heads; + int ldb = m->vProjSize * m->num_q_heads; + int ldc = num_tokens; // num_new_tokens + int strideA = m->vProjSize; + int strideB = m->vProjSize; + int strideC = num_tokens * num_tokens; // num_new_tokens * total_tokens + + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".qk_prods.softmax_grad"; + save_tensor( + C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + std::string filename2 = get_peft_dbg_folder(m, shard_id) + ".vcache"; + save_tensor( + B, m->vProjSize * m->num_q_heads * num_tokens, filename2.c_str()); + } + } + // Step 4: softmax backpropagation + { + float alpha = 1.0f, beta = 0.0f; + int n_param = m->num_q_heads; + int c_param = num_tokens; + int h_param = 1; + int w_param = num_tokens; + checkCUDNN(miopenSet4dTensorDescriptor( + m->qk_tensor, cudnn_data_type, n_param, c_param, h_param, w_param)); + checkCUDNN(miopenSoftmaxBackward_V2(m->handle.dnn, + &alpha, + m->qk_tensor, + m->softmax_activation_buffer, + m->qk_tensor, + m->qk_prods_softmax, + &beta, + m->qk_tensor, + m->qk_prods, + MIOPEN_SOFTMAX_ACCURATE, + MIOPEN_SOFTMAX_MODE_CHANNEL)); + + if (m->inference_debugging) { + DT *C = static_cast
(m->qk_prods); + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".qk_prods.softmax_grad_in"; + save_tensor( + C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + } + + // TODO: fill all elements above diagonal to force causal attention + size_t entries_above_diagonal = num_tokens * (num_tokens - 1) / 2; + if (entries_above_diagonal > 0) { + size_t parallelism = m->num_q_heads * entries_above_diagonal; + hipLaunchKernelGGL(HIP_KERNEL_NAME(fill_entries_above_diagonal), + GET_BLOCKS(parallelism), + min((size_t)CUDA_NUM_THREADS, parallelism), + 0, + stream, + static_cast
(m->qk_prods), + num_tokens, + num_tokens, + m->num_q_heads, + entries_above_diagonal, + DT(0.0f)); + } + if (m->inference_debugging) { + DT *C = static_cast
(m->qk_prods); + std::string filename = get_peft_dbg_folder(m, shard_id) + + ".qk_prods.softmax_grad_in.masked"; + save_tensor( + C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + } + } + // Step 5: compute gradients w.r.t. key + { + float alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = 1.0f / sqrt(m->kProjSize); + } + // matrix A: gradients w.r.t. qk_prods + // matrix A's layout: [num_new_tokens, num_tokens, num_heads] + DT const *A = static_cast
(m->qk_prods); + // matrix B: query activation (in query_activation_buffer) + // matrix B's layout: [m->qProjSize * num_heads, num_new_tokens] + DT const *B = static_cast<DT *>
(m->query_activation_buffer); + // matrix C: gradients for key (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = + static_cast<DT *>
(m->devQKVProjArray) + + num_tokens * + (m->qProjSize * + m->num_q_heads); // skip over regions reserved for Q gradients + // after transposition & striding + int m_ = num_tokens; + int n_ = m->kProjSize; + int k_ = num_tokens; // num_new_tokens + // before transposition and striding + int lda = num_tokens; // num_new_tokens + int ldb = m->kProjSize * m->num_q_heads; + int ldc = num_tokens; + int strideA = num_tokens * num_tokens; + int strideB = m->kProjSize; + int strideC = num_tokens * m->kProjSize; + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".query_activation"; + save_tensor( + B, m->qProjSize * m->num_q_heads * num_tokens, filename.c_str()); + std::string filename2 = + get_peft_dbg_folder(m, shard_id) + ".devkproj_pre"; + save_tensor( + C, num_tokens * (m->qProjSize * m->num_q_heads), filename2.c_str()); + } + } + // Step 6: compute gradients w.r.t query + { + float alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = 1.0f / sqrt(m->kProjSize); + } + // matrix A: gradients w.r.t. qk_prods + // matrix A's layout: [num_new_tokens, num_tokens, num_heads] + DT const *A = static_cast
(m->qk_prods); + // matrix B: key cache + // matrix B's layout: [vProjSize * num_heads, max_num_tokens, num_req] + DT const *B = static_cast<DT *>
(m->keyCache) + i * kt_req_block_size; + // matrix C: gradients for query (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = static_cast<DT *>
(m->devQKVProjArray); + // after transposition & striding + int m_ = num_tokens; // num_new_tokens + int n_ = m->qProjSize; + int k_ = num_tokens; + // before transposition and striding + int lda = num_tokens; // num_new_tokens + int ldb = m->qProjSize * m->num_q_heads; + int ldc = num_tokens; + int strideA = num_tokens * num_tokens; + int strideB = m->qProjSize; + int strideC = num_tokens * m->qProjSize; + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".devQKVPRojArray_pre"; + save_tensor(C, + num_tokens * m->qProjSize * m->num_q_heads * 3, + filename.c_str()); + } + } + + // Step 7: perform rotary position embeddings (RoPE) bwd + { + if (*m->apply_rotary_embedding) { + assert(m->hidden_size == m->qProjSize * m->num_q_heads); + assert(m->qProjSize == m->kProjSize); + /*q&k*/ + int parallelism = num_tokens * m->hidden_size; + DT *A = static_cast
(m->devQKVProjArray); + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_rotary_embedding_bwd), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + A, + m->complex_input, + m->token_infos, + m->qProjSize, + num_tokens, + m->hidden_size); + DT *C = static_cast
(m->devQKVProjArray); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".devQKVPRojArray"; + save_tensor(C, + num_tokens * m->qProjSize * m->num_q_heads * 3, + filename.c_str()); + } + } + + // matrix C: gradients for key (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = + static_cast
(m->devQKVProjArray) + + num_tokens * + (m->qProjSize * + m->num_q_heads); // skip over regions reserved for Q gradients + if (m->inference_debugging) { + std::string filename = get_peft_dbg_folder(m, shard_id) + ".devkproj"; + save_tensor( + C, num_tokens * (m->qProjSize * m->num_q_heads), filename.c_str()); + } + } + + // Step 8: compute gradients w.r.t. input + { + float alpha = 1.0f, beta = 0.0f; + if (!m->reset_input_grads[0]) { + beta = 1.0f; + } + // matrix A: QKV projection weights + // matrix A's layout: [qSize, qProjSize * num_q_heads, 3] + DT const *A = weight_ptr; + // matrix B: gradients w.r.t. QKV (concatenated in devQKVArray) + // matrix B's layout: [num_tokens, qProjsize * num_heads, 3] + DT const *B = static_cast
(m->devQKVProjArray); + // matrix C: gradients w.r.t. input + // matrix C's layout: [m->qSize, num_tokens] + DT *C = input_grad_ptr + + bc->requestsInfo[i].first_token_offset_in_batch * m->qSize; + int m_ = m->qSize; + int n_ = num_tokens; + int k_ = m->num_q_heads * (m->qProjSize + m->kProjSize + m->vProjSize); + int lda = m_; + int ldb = n_; + int ldc = m_; + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + B, + cublas_data_type, + ldb, + &beta, + C, + cublas_data_type, + ldc, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".self_attn.input_gradient_0"; + save_tensor(C, num_tokens * m->qSize, filename.c_str()); + } + } + } } } // namespace IncMultiHeadAttention @@ -481,42 +1465,47 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, using namespace Kernels::IncMultiHeadAttention; template -__global__ void fill_entries_above_diagonal(DT *matrix, - size_t num_rows, - size_t num_cols, - size_t num_q_heads, - size_t entries_above_diagonal, - DT value) { - CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_q_heads) { - size_t head_idx = i / entries_above_diagonal; - size_t entry_idx = i % entries_above_diagonal; - size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; - size_t x = entry_idx - y * (y + 1) / 2; - y += (num_cols - num_rows) + 1; - matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; +__global__ void store_query_cache(DT const *devQKVProjArray, + DT *qCache_ptr, + int num_tokens, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / hidden_size; + int offset = i % hidden_size; + + size_t val_idx = token_idx * QKV_WEIGHT_NUM * hidden_size + offset; + + DT qVal = devQKVProjArray[val_idx]; + + // query cache + qCache_ptr[i] = qVal; } } template -void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - DT *output_ptr, - DT const *bias_ptr, - DT const *weight_ptr, - hipStream_t stream) { +void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + int shard_id, + DT const *bias_ptr, + DT const *weight_ptr, + hipStream_t stream) { checkCUDA(hipblasSetStream(m->handle.blas, stream)); checkCUDNN(miopenSetStream(m->handle.dnn, stream)); - hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); - miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); + hipblasDatatype_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + miopenDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - hipblasDatatype_t compute_type = hipblas_data_type; -#else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - hipblasDatatype_t compute_type = hipblas_data_type; -#endif + hipblasDatatype_t compute_type = cublas_data_type; + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipblasDatatype_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // hipblasDatatype_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = 
CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_tokens(); int tokens_previous_requests = 0; @@ -530,64 +1519,102 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { + if (bc->request_completed[i] || + (!bc->requestsInfo[i].prompt_phase && !bc->requestsInfo[i].peft_bwd)) { continue; } int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; - // bc->token_last_available_idx[i] + 1; - // Compute (QK^T/sqrt(d_k)) - // a flag of using this scaling alpha - int m_ = num_new_tokens; - int n = total_tokens; - int k = m->qProjSize; - int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, - ldc = m_; - int strideA = q_block_size; - int strideB = kt_block_size; - int strideC = num_new_tokens * total_tokens; - DT alpha = 1.0f, beta = 0.0f; - if (*m->qk_prod_scaling) { - alpha = static_cast
(1.0f / sqrt(m->kProjSize)); + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + // Copy query to m->query_activation_buffer if we need to compute + // PEFT backward + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + sizeof(DT) * max_peft_tokens * m->num_q_heads * m->qProjSize; + if (activation_size_needed > m->allocated_peft_buffer_size1) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->query_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size1 = activation_size_needed; + } + int parallelism = m->hidden_size * num_tokens; + hipLaunchKernelGGL(HIP_KERNEL_NAME(store_query_cache), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + static_cast
(m->devQKVProjArray), + static_cast<DT *>
(m->query_activation_buffer), + num_tokens, + m->hidden_size); } - // To get A, skip over Q entries from previous requests (same head) - DT const *A = static_cast
(m->devQKVProjArray) + - tokens_previous_requests * m->qProjSize * m->num_q_heads * - QKV_WEIGHT_NUM; - // To get B, skip over K entries from previous requests (all heads + - // padding) - DT const *B = static_cast
(m->keyCache) + i * kt_req_block_size; - // To get C, skip over QK^T products from previous requests + // Step 1: compute query-key product QK.T/sqrt(d_k) + { + // Scale by sqrt(d_k) as per the original attention paper + DT alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = static_cast
(1.0f / sqrt(m->kProjSize)); + } + // after transpositions + int m_ = num_new_tokens; + int n = total_tokens; + int k = m->qProjSize; + // before transpositions + int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, + ldc = m_; + // N.B. strides are applied before transpose operations + int strideA = q_block_size; + int strideB = kt_block_size; + int strideC = num_new_tokens * total_tokens; + + // matrix A: devQKVProjArray + // matrix A's layout: [qProjSize, num_heads, 3, num_new_tokens] + // To get query projection, skip over Q entries from previous requests + DT const *A = static_cast
(m->devQKVProjArray) + + bc->requestsInfo[i].first_token_offset_in_batch * + m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; + // matrix B: key cache + // matrix B's layout: [kProjSize * num_heads, total_tokens] + // To get B, skip over K entries from previous requests (all heads + + // padding) + DT const *B = static_cast
(m->keyCache) + i * kt_req_block_size; + // matrix C: qk_prods + // matrix C's layout: [num_new_tokens, total_tokens, num_heads] + // To get C, skip over QK.T products from previous requests + DT *C = static_cast<DT *>
(m->qk_prods); + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + // Step 2: Add alibi position bias to qk production + // matrix C: qk_prods + // matrix C's layout: [num_new_tokens, total_tokens, num_heads] + // To get C, skip over QK.T products from previous requests DT *C = static_cast
(m->qk_prods); - checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - strideA, - B, - hipblas_data_type, - ldb, - strideB, - &beta, - C, - hipblas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - - // add alibi position bias to qk production if (*m->position_bias) { size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_position_bias_qkprd
), + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_position_bias_qkprd), GET_BLOCKS(parallelism), min((size_t)CUDA_NUM_THREADS, parallelism), 0, @@ -599,13 +1626,14 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, m->global_num_q_heads, shard_id); } - // Fill all elements above diagonal in qk prods with -inf to force - // causal attention. + + // Step 3: Apply causal mask. Fill all elements above diagonal in qk prods + // with -inf to force causal attention. assert(num_new_tokens <= total_tokens); size_t entries_above_diagonal = num_new_tokens * (num_new_tokens - 1) / 2; if (entries_above_diagonal > 0) { size_t parallelism = m->num_q_heads * entries_above_diagonal; - hipLaunchKernelGGL(HIP_KERNEL_NAME(fill_entries_above_diagonal
), + hipLaunchKernelGGL(HIP_KERNEL_NAME(fill_entries_above_diagonal), GET_BLOCKS(parallelism), min((size_t)CUDA_NUM_THREADS, parallelism), 0, @@ -617,137 +1645,129 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, entries_above_diagonal, static_cast
(-INFINITY)); } - // Compute Softmax(QK^T/sqrt(d_k)) - // Before modifying the parameters below, make sure to read the following - // description of the CUDNN_TENSOR_NCHW tensor layout, from - // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: - // This tensor format specifies that the data is laid out in the following - // order: batch size, feature maps, rows, columns. The strides are - // implicitly defined in such a way that the data are contiguous in memory - // with no padding between images, feature maps, rows, and columns; the - // columns are the inner dimension and the images are the outermost - // dimension. - int n_param = m->num_q_heads; - int c_param = total_tokens; - int h_param = 1; - int w_param = num_new_tokens; - checkCUDNN(miopenSet4dTensorDescriptor( - m->qk_tensor, miopen_data_type, n_param, c_param, h_param, w_param)); - float softmax_alpha = 1.0f, softmax_beta = 0.0f; - DT *C_softmax = static_cast
(m->qk_prods_softmax); - // The softmax operation below is executed according to the - // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The - // softmax operation is computed per spatial location (H,W) per image (N) - // across dimension C. - checkCUDNN(miopenSoftmaxForward_V2(m->handle.dnn, - &softmax_alpha, - m->qk_tensor, - C, - &softmax_beta, - m->qk_tensor, - C_softmax, - MIOPEN_SOFTMAX_ACCURATE, - MIOPEN_SOFTMAX_MODE_CHANNEL)); - // Matmul softmax(QK^T/sqrt(d_k)) by V - alpha = 1.0f, beta = 0.0f; - m_ = num_new_tokens; - n = m->vProjSize; - k = total_tokens; - lda = m_, ldb = n * m->num_q_heads, ldc = m_; - strideA = num_new_tokens * total_tokens; - strideB = vt_block_size; - strideC = num_new_tokens * m->vProjSize; - // To get A, skip over softmax(QK^T/sqrt(d_k)) entries from previous - // requests (all heads) - A = C_softmax; - // To get B, skip over V^T entries from previous requests (all heads + - // padding) - B = static_cast
(m->valueCache) + i * vt_req_block_size; - // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous - // requests - C = static_cast
(m->attn_heads) + - tokens_previous_requests * m->num_q_heads * m->vProjSize; - - checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, - HIPBLAS_OP_N, - HIPBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - strideA, - B, - hipblas_data_type, - ldb, - strideB, - &beta, - C, - hipblas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - // Project to output, save result directly on output tensor - alpha = 1.0f, beta = 0.0f; - m_ = m->oProjSize; - k = m->vProjSize * m->num_q_heads; - n = num_new_tokens; - lda = k, ldb = n, ldc = m_; - A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_q_heads + - m->vProjSize * m->num_q_heads); - B = C; - C = static_cast
(output_ptr) + tokens_previous_requests * m->oProjSize; - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - B, - hipblas_data_type, - ldb, - &beta, - C, - hipblas_data_type, - ldc, - compute_type, - HIPBLAS_GEMM_DEFAULT)); + // Step 4: Compute Softmax(QK.T/sqrt(d_k)) + { + // Before modifying the parameters below, make sure to read the following + // description of the HIPDNN_TENSOR_NCHW tensor layout, from + // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#hipdnnTensorFormat_t: + // This tensor format specifies that the data is laid out in the following + // order: batch size, feature maps, rows, columns. The strides are + // implicitly defined in such a way that the data are contiguous in memory + // with no padding between images, feature maps, rows, and columns; the + // columns are the inner dimension and the images are the outermost + // dimension. + int n_param = m->num_q_heads; + int c_param = total_tokens; + int h_param = 1; + int w_param = num_new_tokens; + checkCUDNN(miopenSet4dTensorDescriptor( + m->qk_tensor, cudnn_data_type, n_param, c_param, h_param, w_param)); + float softmax_alpha = 1.0f, softmax_beta = 0.0f; + DT *C_softmax = static_cast
(m->qk_prods_softmax); + // The softmax operation below is executed according to the + // MIOPEN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The + // softmax operation is computed per spatial location (H,W) per image (N) + // across dimension C. + checkCUDNN(miopenSoftmaxForward_V2(m->handle.dnn, + &softmax_alpha, + m->qk_tensor, + C, + &softmax_beta, + m->qk_tensor, + C_softmax, + MIOPEN_SOFTMAX_ACCURATE, + MIOPEN_SOFTMAX_MODE_CHANNEL)); + } + // Copy C_softmax to m->softmax_activation_buffer if we need to compute + // PEFT backward + if (bc->requestsInfo[i].peft_bwd) { + DT *C_softmax = static_cast
(m->qk_prods_softmax); + size_t activation_size_needed = + sizeof(DT) * max_peft_tokens * max_peft_tokens * m->num_q_heads; + if (activation_size_needed > m->allocated_peft_buffer_size2) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->softmax_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size2 = activation_size_needed; + } + checkCUDA(hipMemcpyAsync(m->softmax_activation_buffer, + C_softmax, + sizeof(DT) * total_tokens * num_new_tokens * + m->num_q_heads, + hipMemcpyDeviceToDevice, + stream)); + } + // Step 5: Matmul softmax(QK.T/sqrt(d_k)) by V. Implemented as V @ + // softmax(QK.T/sqrt(d_k)).T + { + DT alpha = 1.0f, beta = 0.0f; + // after transpositions + int m_ = m->vProjSize; + int n = num_new_tokens; + int k = total_tokens; + // before transpositions + int lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; + // N.B. strides are applied before transpose operations + int strideA = vt_block_size; + int strideB = num_new_tokens * total_tokens; + int strideC = m->vProjSize; + // matrix A: value cache + // matrix A's layout: [vProjSize, num_heads, total_tokens] + // To get A, skip over V.T entries from previous requests (all heads + + // padding) + DT *A = static_cast
(m->valueCache) + i * vt_req_block_size; + // matrix B: qk_prods_softmax + // matrix B's layout: [num_new_tokens, total_tokens, num_heads] + // To get B, skip over softmax(QK.T/sqrt(d_k)) entries from previous + // requests (all heads) + DT *B = static_cast<DT *>
(m->qk_prods_softmax); + // matrix C: attn heads + // matrix C's layout: [vProjSize, num_heads, num_new_tokens] + // To get C, skip over softmax(QK.T/sqrt(d_k))V products from previous + // requests + // store the result attn heads, also skip the generation tokens + DT *C = static_cast<DT *>
(m->attn_heads) + + (bc->requestsInfo[i].first_token_offset_in_batch) * + m->num_q_heads * m->vProjSize; + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } tokens_previous_requests += num_new_tokens; } - - if (*m->final_bias && shard_id == 0) { - int parallelism = m->oProjSize * num_tokens; - int qkv_weight_size = m->qProjSize * m->global_num_q_heads + - m->kProjSize * m->global_num_q_heads + - m->vProjSize * m->global_num_q_heads; - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_w
), - GET_BLOCKS(parallelism), - min(CUDA_NUM_THREADS, parallelism), - 0, - stream, - output_ptr, - bias_ptr, - num_tokens, - qkv_weight_size, - m->oProjSize); + if (tokens_previous_requests != (num_tokens - bc->num_generation_tokens)) { + bc->print(); + printf("tokens_previous_requests: %i\n", tokens_previous_requests); + printf("num_tokens: %i\n", num_tokens); + printf("bc->num_generation_tokens: %i\n", bc->num_generation_tokens); } - - assert(tokens_previous_requests == num_tokens); + assert(tokens_previous_requests == (num_tokens - bc->num_generation_tokens)); } /*static*/ void IncMultiHeadSelfAttention::inference_kernel_wrapper( - IncMultiHeadSelfAttentionMeta const *m, + IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, @@ -813,10 +1833,71 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); checkCUDA(hipEventDestroy(t_start)); checkCUDA(hipEventDestroy(t_end)); - printf("IncMultiHeadSelfAttention forward time = %.2fms\n", elapsed); - // print_tensor<3, float>(acc_query.ptr, acc_query.rect, - // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, - // acc_output.rect, "[Attention:forward:output]"); + printf("IncMultiHeadSelfAttention forward time = %.9fms\n", elapsed); + } +} + +/*static*/ +void IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( + IncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + int shard_id, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &bias) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + bool use_bias = *m->qkv_bias || *m->final_bias; + + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + // assert(input.data_type == weight.data_type); + assert(input_grad.data_type == output_grad.data_type); + if (use_bias) { + assert(input_grad.data_type == bias.data_type); + } + + if (input_grad.data_type == DT_HALF) { + assert(!m->offload); + half const *bias_ptr = + use_bias ? bias.get_half_ptr() : static_cast(nullptr); + Kernels::IncMultiHeadAttention::peft_bwd_kernel(m, + bc, + shard_id, + input_grad.get_half_ptr(), + weight.get_half_ptr(), + output_grad.get_half_ptr(), + bias_ptr, + stream); + } else if (input_grad.data_type == DT_FLOAT) { + assert(!m->offload); + float const *bias_ptr = + use_bias ? 
bias.get_float_ptr() : static_cast(nullptr); + Kernels::IncMultiHeadAttention::peft_bwd_kernel(m, + bc, + shard_id, + input_grad.get_float_ptr(), + weight.get_float_ptr(), + output_grad.get_float_ptr(), + bias_ptr, + stream); + } else { + assert(false && "Unspported data type"); + } + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("IncMultiHeadSelfAttention PEFT backward time = %.9fms\n", elapsed); } } @@ -895,7 +1976,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( assert(kSize == vSize); qProjSize = _qProjSize; kProjSize = _kProjSize; - assert(qProjSize == kProjSize); // required for attention QK^T matmul + assert(qProjSize == kProjSize); // required for attention QK.T matmul vProjSize = _vProjSize; oProjSize = _oProjSize; size_t size_of_dt = data_type_size(attn->data_type); @@ -949,14 +2030,15 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); + int max_tokens_per_batch = infer_mode == TREE_VERIFY_MODE + ? BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(); size_t qkv_max_proj_size = max_tokens_per_batch * (qProjSize * num_q_heads + kProjSize * num_q_heads + vProjSize * num_q_heads); size_t key_cache_size = 0, value_cache_size = 0; switch (infer_mode) { - case INC_DECODING_MODE: - case TREE_VERIFY_MODE: { + case INC_DECODING_MODE: { key_cache_size = num_q_heads * kProjSize * BatchConfig::max_requests_per_batch() * BatchConfig::max_sequence_length(); @@ -965,21 +2047,24 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( BatchConfig::max_sequence_length(); break; } - case BEAM_SEARCH_MODE: { + case BEAM_SEARCH_MODE: + case TREE_VERIFY_MODE: { + // a K-ary tree max node is (k^n - 1) / 2 key_cache_size = num_q_heads * kProjSize * BeamSearchBatchConfig::max_requests_per_batch() * - BatchConfig::max_sequence_length() * - BeamSearchBatchConfig::MAX_BEAM_WIDTH; + (BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num()); value_cache_size = num_q_heads * vProjSize * BeamSearchBatchConfig::max_requests_per_batch() * - BatchConfig::max_sequence_length() * - BeamSearchBatchConfig::MAX_BEAM_WIDTH; + (BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num()); break; } default: assert(false && "Unkown inference mode"); } - size_t tokeninfo_size = max_tokens_per_batch; + size_t requestinfo_size = BatchConfig::max_requests_per_batch(); + // size_t tokeninfo_size = max_tokens_per_batch; size_t qk_prod_size = max_tokens_per_batch * BatchConfig::max_sequence_length() * num_q_heads; size_t attn_heads_size = max_tokens_per_batch * num_q_heads * vProjSize; @@ -990,7 +2075,6 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( (qkv_max_proj_size + key_cache_size + value_cache_size + 2 * qk_prod_size + attn_heads_size) * size_of_dt + - tokeninfo_size * sizeof(BatchConfig::PerTokenInfo) + complex_size * sizeof(hipFloatComplex); // more components will // be added here later if (offload) { @@ -1035,10 +2119,15 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( valueCache = gpu_mem_allocator.allocate_instance_untyped(value_cache_size * size_of_dt); + token_infos = static_cast( + handler.batch_config_metadata->tokens_info); + 
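+  // Note: token_infos and request_infos now point directly into the
+  // handler.batch_config_metadata block (see the removed hipMemcpyAsync of
+  // bc->tokensInfo above and the commented-out per-layer allocations below),
+  // so the batch metadata is presumably copied to the GPU once per batch by
+  // the runtime rather than once per attention layer.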
request_infos = static_cast( + handler.batch_config_metadata->requestsInfo); + if (offload) { - token_infos = - gpu_mem_allocator.allocate_reserved( - tokeninfo_size); + // token_infos = + // gpu_mem_allocator.allocate_reserved( + // tokeninfo_size); // offset += sizeof(BatchConfig::PerTokenInfo) * tokeninfo_size; qk_prods = gpu_mem_allocator.allocate_reserved_untyped(qk_prod_size * size_of_dt); @@ -1052,10 +2141,13 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( complex_input = gpu_mem_allocator.allocate_reserved(complex_size); // offset += complex_size * sizeof(hipFloatComplex); + // request_infos = + // gpu_mem_allocator.allocate_reserved( + // requestinfo_size); } else { - token_infos = - gpu_mem_allocator.allocate_instance( - tokeninfo_size); + // token_infos = + // gpu_mem_allocator.allocate_instance( + // tokeninfo_size); qk_prods = gpu_mem_allocator.allocate_instance_untyped(qk_prod_size * size_of_dt); qk_prods_softmax = gpu_mem_allocator.allocate_instance_untyped( @@ -1064,6 +2156,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( size_of_dt); complex_input = gpu_mem_allocator.allocate_instance(complex_size); + // request_infos = + // gpu_mem_allocator.allocate_instance( + // requestinfo_size); } // allocate more size for quantization data @@ -1077,6 +2172,8 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( gpu_mem_allocator.reserved_allocated_size); } } + allocated_peft_buffer_size1 = 0; + allocated_peft_buffer_size2 = 0; checkCUDA(hipStreamSynchronize(stream)); } @@ -1098,4 +2195,37 @@ template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( DataType data_type, hipStream_t stream); +template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + float *output_ptr, + float const *weight_ptr, + float const *bias_ptr, + int num_tokens, + hipStream_t stream); + +template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + half *output_ptr, + half const *weight_ptr, + half const *bias_ptr, + int num_tokens, + hipStream_t stream); + +template void + Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + float *output_ptr, + hipStream_t stream); + +template void + Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + half *output_ptr, + hipStream_t stream); }; // namespace FlexFlow diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index a0d31bb6ef..b278611b60 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -12,9 +12,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include "cuComplex.h" -#endif #include "flexflow/ffconst_utils.h" #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/kernels/decompress_kernels.h" @@ -483,6 +481,63 @@ __global__ void } } +template +__global__ void + apply_rotary_embedding_bwd(DT *input_ptr, + cuFloatComplex *complex_input, + BatchConfig::PerTokenInfo const *tokenInfos, + int proj_size, + int num_tokens, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + // compute indexes to visit first half proj_size of each of q/k tensor. + // devQKVProj has shape [num_tokens, qProjSize, num_heads, 3] in peft_bwd + bool q_tensor = i < (num_tokens * hidden_size / 2); + int real_i = q_tensor ? i : i - num_tokens * hidden_size / 2; + assert(hidden_size % proj_size == 0); + int num_heads = hidden_size / proj_size; + + int token_idx = real_i % num_tokens; + int idx = (real_i / num_tokens) % (proj_size / 2); + int head_idx = real_i / (num_tokens * proj_size / 2); + assert(head_idx < num_heads); + + int complex_part_index = (q_tensor ? 0 : 1) * num_tokens * hidden_size + + head_idx * num_tokens * proj_size + + idx * num_tokens + token_idx; + int real_part_index = complex_part_index + (proj_size / 2) * num_tokens; + + complex_input[i] = {input_ptr[real_part_index], + input_ptr[complex_part_index]}; + + size_t pos = tokenInfos[token_idx].abs_depth_in_request; + + float freq = pos * (1.0 / pow(10000.0, (float)2 * idx / proj_size)); + cuFloatComplex complex_pos = {cos(freq), sin(freq)}; + + complex_input[i] = cuCmulf(complex_input[i], complex_pos); + input_ptr[real_part_index] = complex_input[i].x; + input_ptr[complex_part_index] = complex_input[i].y; + } +} + +template +__global__ void fill_entries_above_diagonal(DT *matrix, + size_t num_rows, + size_t num_cols, + size_t num_q_heads, + size_t entries_above_diagonal, + DT value) { + CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_q_heads) { + size_t head_idx = i / entries_above_diagonal; + size_t entry_idx = i % entries_above_diagonal; + size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; + size_t x = entry_idx - y * (y + 1) / 2; + y += (num_cols - num_rows) + 1; + matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; + } +} + template void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, @@ -497,17 +552,18 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); assert(m->qSize == m->vSize && m->qSize == m->kSize); cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif // Step 1: 
Compute QKV projections { @@ -517,7 +573,7 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, int m_k = m->kProjSize * m->num_q_heads; int m_v = m->vProjSize * m->num_q_heads; assert(m_q == m_k && m_k == m_v); // keep things simple for now - int n = bc->num_active_tokens(); + int n = bc->num_active_infr_tokens(); int k = m->qSize; int m_ = m_q * QKV_WEIGHT_NUM; // before transpositions @@ -604,7 +660,7 @@ template void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, cudaStream_t stream) { - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); if (num_tokens > 0) { int parallelism = m->hidden_size * num_tokens; store_kv_cache<< -void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, +void inference_kernel(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, DT const *input_ptr, @@ -843,6 +899,504 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); } +std::string get_peft_dbg_folder(IncMultiHeadSelfAttentionMeta const *m, + int shard_id) { + std::string op_name_without_uid = + IncMultiHeadSelfAttention::get_op_name_without_uid(m); + fs::path dst_filepath = get_dst_folder("bwd", m->bwd_step, shard_id); + if (m->layer_guid.model_id > 0) { + assert(false && "Model ID > 0 not supported yet"); + } + std::string layername = "layers." + + std::to_string(m->layer_guid.transformer_layer_id) + + "." + op_name_without_uid; + dst_filepath /= layername; + return dst_filepath.string(); +} + +template +void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT *input_grad_ptr, + DT const *weight_ptr, + DT const *output_grad_ptr, + DT const *bias_ptr, + cudaStream_t stream) { + assert(!m->offload); + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); + assert(data_type_size(m->output_type[0]) == sizeof(DT)); + cudaDataType_t compute_type = cublas_data_type; + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + int num_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int num_total_tokens = bc->requestsInfo[i].first_token_depth_in_request + + bc->requestsInfo[i].num_tokens_in_batch; + // Currently assume we are calculating gradients for all tokens + // of a request + assert(num_tokens == num_total_tokens); + int kt_block_size = m->kProjSize; + int kt_req_block_size = + kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int vt_block_size = m->vProjSize; + int vt_req_block_size = + vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + assert(m->qProjSize == m->kProjSize && m->kProjSize == m->vProjSize); + // Step 1: compute gradients before final 
projection + { + int m_ = m->vProjSize * m->num_q_heads; + int n_ = num_tokens; + int k_ = m->oProjSize; + int lda = m_; + int ldb = k_; + int ldc = m_; + float alpha = 1.0f, beta = 0.0f; + // matrix A: output projection weight + // matrix A's layout: [vProjSize * num_heads, oProjSize] + DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + + m->kProjSize * m->num_q_heads + + m->vProjSize * m->num_q_heads); + // matrix B: output gradients + // matrix B's layout: [oProjSize, num_new_tokens] + DT const *B = + output_grad_ptr + + bc->requestsInfo[i].first_token_offset_in_batch * m->oProjSize; + // matrix C: attn_heads gradients + // matrix C's layout: [vProjSize * num_heads, num_new_tokens] + DT *C = static_cast
(m->handle.workSpace); + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_N, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + B, + cublas_data_type, + ldb, + &beta, + C, + cublas_data_type, + ldc, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->inference_debugging) { + // save result to file for checking + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".o_proj.input_gradient_0"; + save_tensor(C, m_ * n_, filename.c_str()); + } + } + // Step 2: compute gradients w.r.t. value + { + float alpha = 1.0f, beta = 0.0f; + // matrix A: qk_prods_softmax + // matrix A's layout: [num_new_tokens, total_tokens, num_heads] + DT const *A = static_cast
(m->qk_prods_softmax); + // matrix B: attn_heads gradients + // matrix B's layout: [vProjSize * num_heads, num_new_tokens] + DT const *B = static_cast<DT *>
(m->handle.workSpace); + // matrix C: gradients for value (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = static_cast<DT *>
(m->devQKVProjArray) + + 2 * num_tokens * + (m->qProjSize * m->num_q_heads); // skip over regions reserved + // for Q and K gradients + // after transpositions + int m_ = num_tokens; // total_tokens + int n_ = m->vProjSize; // num_new_tokens + int k_ = num_tokens; // num_new_tokens + // before transpositions + int lda = num_tokens; // num_new_tokens + int ldb = m->vProjSize * m->num_q_heads; + int ldc = num_tokens; // total_tokens + // N.B. strides are applied before transpose operations + int strideA = num_tokens * num_tokens; // num_new_tokens * total_tokens + int strideB = m->vProjSize; + int strideC = num_tokens * m->vProjSize; + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // save result to file for checking + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".v_proj.input_gradient_0"; + save_tensor(C, m_ * n_ * m->num_q_heads, filename.c_str()); + std::string filename2 = + get_peft_dbg_folder(m, shard_id) + ".qk_prods.softmax"; + save_tensor(A, m_ * k_ * m->num_q_heads, filename2.c_str()); + } + } + // Step 3: compute gradients w.r.t. the qk_prods_softmax tensor + { + float alpha = 1.0f, beta = 0.0f; + // matrix A: attn_heads gradients + // matrix A's layout: [vProjSize * num_heads, num_new_tokens] + DT const *A = static_cast
(m->handle.workSpace); + // matrix B: value cache + // matrix B's layout: [vProjSize * num_heads, max_num_tokens, num_req] + DT const *B = static_cast<DT *>
(m->valueCache) + i * vt_req_block_size; + // matrix C: qk_prods_softmax gradients + // matrix C's layout: [num_new_tokens, total_tokens, num_heads] + DT *C = static_cast<DT *>
(m->qk_prods_softmax); + // after transposition & striding + int m_ = num_tokens; // num_new_tokens + int n_ = num_tokens; + int k_ = m->vProjSize; + // before transposition and striding + int lda = m->vProjSize * m->num_q_heads; + int ldb = m->vProjSize * m->num_q_heads; + int ldc = num_tokens; // num_new_tokens + int strideA = m->vProjSize; + int strideB = m->vProjSize; + int strideC = num_tokens * num_tokens; // num_new_tokens * total_tokens + + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".qk_prods.softmax_grad"; + save_tensor( + C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + std::string filename2 = get_peft_dbg_folder(m, shard_id) + ".vcache"; + save_tensor( + B, m->vProjSize * m->num_q_heads * num_tokens, filename2.c_str()); + } + } + // Step 4: softmax backpropagation + { + float alpha = 1.0f, beta = 0.0f; + int n_param = m->num_q_heads; + int c_param = num_tokens; + int h_param = 1; + int w_param = num_tokens; + checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, + CUDNN_TENSOR_NCHW, + cudnn_data_type, + n_param, + c_param, + h_param, + w_param)); + checkCUDNN(cudnnSoftmaxBackward(m->handle.dnn, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + m->qk_tensor, + m->softmax_activation_buffer, + m->qk_tensor, + m->qk_prods_softmax, + &beta, + m->qk_tensor, + m->qk_prods)); + + if (m->inference_debugging) { + DT *C = static_cast
(m->qk_prods); + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".qk_prods.softmax_grad_in"; + save_tensor( + C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + } + + // TODO: fill all elements above diagonal to force causal attention + size_t entries_above_diagonal = num_tokens * (num_tokens - 1) / 2; + if (entries_above_diagonal > 0) { + size_t parallelism = m->num_q_heads * entries_above_diagonal; + fill_entries_above_diagonal<<>>(static_cast
(m->qk_prods), + num_tokens, + num_tokens, + m->num_q_heads, + entries_above_diagonal, + DT(0.0f)); + } + if (m->inference_debugging) { + DT *C = static_cast
(m->qk_prods); + std::string filename = get_peft_dbg_folder(m, shard_id) + + ".qk_prods.softmax_grad_in.masked"; + save_tensor( + C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + } + } + // Step 5: compute gradients w.r.t. key + { + float alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = 1.0f / sqrt(m->kProjSize); + } + // matrix A: gradients w.r.t. qk_prods + // matrix A's layout: [num_new_tokens, num_tokens, num_heads] + DT const *A = static_cast
(m->qk_prods); + // matrix B: query activation (in query_activation_buffer) + // matrix B's layout: [m->qProjSize * num_heads, num_new_tokens] + DT const *B = static_cast<DT *>
(m->query_activation_buffer); + // matrix C: gradients for key (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = + static_cast<DT *>
(m->devQKVProjArray) + + num_tokens * + (m->qProjSize * + m->num_q_heads); // skip over regions reserved for Q gradients + // after transposition & striding + int m_ = num_tokens; + int n_ = m->kProjSize; + int k_ = num_tokens; // num_new_tokens + // before transposition and striding + int lda = num_tokens; // num_new_tokens + int ldb = m->kProjSize * m->num_q_heads; + int ldc = num_tokens; + int strideA = num_tokens * num_tokens; + int strideB = m->kProjSize; + int strideC = num_tokens * m->kProjSize; + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".query_activation"; + save_tensor( + B, m->qProjSize * m->num_q_heads * num_tokens, filename.c_str()); + std::string filename2 = + get_peft_dbg_folder(m, shard_id) + ".devkproj_pre"; + save_tensor( + C, num_tokens * (m->qProjSize * m->num_q_heads), filename2.c_str()); + } + } + // Step 6: compute gradients w.r.t query + { + float alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = 1.0f / sqrt(m->kProjSize); + } + // matrix A: gradients w.r.t. qk_prods + // matrix A's layout: [num_new_tokens, num_tokens, num_heads] + DT const *A = static_cast
<DT *>(m->qk_prods); + // matrix B: key cache + // matrix B's layout: [vProjSize * num_heads, max_num_tokens, num_req] + DT const *B = static_cast
<DT *>(m->keyCache) + i * kt_req_block_size; + // matrix C: gradients for query (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = static_cast
(m->devQKVProjArray); + // after transposition & striding + int m_ = num_tokens; // num_new_tokens + int n_ = m->qProjSize; + int k_ = num_tokens; + // before transposition and striding + int lda = num_tokens; // num_new_tokens + int ldb = m->qProjSize * m->num_q_heads; + int ldc = num_tokens; + int strideA = num_tokens * num_tokens; + int strideB = m->qProjSize; + int strideC = num_tokens * m->qProjSize; + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".devQKVPRojArray_pre"; + save_tensor(C, + num_tokens * m->qProjSize * m->num_q_heads * 3, + filename.c_str()); + } + } + + // Step 7: perform rotary position embeddings (RoPE) bwd + { + if (*m->apply_rotary_embedding) { + assert(m->hidden_size == m->qProjSize * m->num_q_heads); + assert(m->qProjSize == m->kProjSize); + /*q&k*/ + int parallelism = num_tokens * m->hidden_size; + DT *A = static_cast
<DT *>(m->devQKVProjArray); + apply_rotary_embedding_bwd<<>>(A, + m->complex_input, + m->token_infos, + m->qProjSize, + num_tokens, + m->hidden_size); + DT *C = static_cast
<DT *>(m->devQKVProjArray); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".devQKVPRojArray"; + save_tensor(C, + num_tokens * m->qProjSize * m->num_q_heads * 3, + filename.c_str()); + } + } + + // matrix C: gradients for key (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = + static_cast
(m->devQKVProjArray) + + num_tokens * + (m->qProjSize * + m->num_q_heads); // skip over regions reserved for Q gradients + if (m->inference_debugging) { + std::string filename = get_peft_dbg_folder(m, shard_id) + ".devkproj"; + save_tensor( + C, num_tokens * (m->qProjSize * m->num_q_heads), filename.c_str()); + } + } + + // Step 8: compute gradients w.r.t. input + { + float alpha = 1.0f, beta = 0.0f; + if (!m->reset_input_grads[0]) { + beta = 1.0f; + } + // matrix A: QKV projection weights + // matrix A's layout: [qSize, qProjSize * num_q_heads, 3] + DT const *A = weight_ptr; + // matrix B: gradients w.r.t. QKV (concatenated in devQKVArray) + // matrix B's layout: [num_tokens, qProjsize * num_heads, 3] + DT const *B = static_cast
(m->devQKVProjArray); + // matrix C: gradients w.r.t. input + // matrix C's layout: [m->qSize, num_tokens] + DT *C = input_grad_ptr + + bc->requestsInfo[i].first_token_offset_in_batch * m->qSize; + int m_ = m->qSize; + int n_ = num_tokens; + int k_ = m->num_q_heads * (m->qProjSize + m->kProjSize + m->vProjSize); + int lda = m_; + int ldb = n_; + int ldc = m_; + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + B, + cublas_data_type, + ldb, + &beta, + C, + cublas_data_type, + ldc, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".self_attn.input_gradient_0"; + save_tensor(C, num_tokens * m->qSize, filename.c_str()); + } + } + } +} + } // namespace IncMultiHeadAttention } // namespace Kernels @@ -877,24 +1431,25 @@ __global__ void store_kv_cache(DT const *devQKVProjArray, } template -__global__ void fill_entries_above_diagonal(DT *matrix, - size_t num_rows, - size_t num_cols, - size_t num_q_heads, - size_t entries_above_diagonal, - DT value) { - CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_q_heads) { - size_t head_idx = i / entries_above_diagonal; - size_t entry_idx = i % entries_above_diagonal; - size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; - size_t x = entry_idx - y * (y + 1) / 2; - y += (num_cols - num_rows) + 1; - matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; +__global__ void store_query_cache(DT const *devQKVProjArray, + DT *qCache_ptr, + int num_tokens, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / hidden_size; + int offset = i % hidden_size; + + size_t val_idx = token_idx * QKV_WEIGHT_NUM * hidden_size + offset; + + DT qVal = devQKVProjArray[val_idx]; + + // query cache + qCache_ptr[i] = qVal; } } template -void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, +void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, DT const *bias_ptr, @@ -905,17 +1460,18 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_tokens(); int tokens_previous_requests = 0; @@ -929,12 +1485,35 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, assert(m->qProjSize 
== m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i] || (!bc->requestsInfo[i].prompt_phase)) { + if (bc->request_completed[i] || + (!bc->requestsInfo[i].prompt_phase && !bc->requestsInfo[i].peft_bwd)) { continue; } int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + // Copy query to m->query_activation_buffer if we need to compute + // PEFT backward + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + sizeof(DT) * max_peft_tokens * m->num_q_heads * m->qProjSize; + if (activation_size_needed > m->allocated_peft_buffer_size1) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->query_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size1 = activation_size_needed; + } + int parallelism = m->hidden_size * num_tokens; + store_query_cache<<>>( + static_cast
<DT *>(m->devQKVProjArray), + static_cast
(m->query_activation_buffer), + num_tokens, + m->hidden_size); + } // Step 1: compute query-key product QK.T/sqrt(d_k) { // Scale by sqrt(d_k) as per the original attention paper @@ -1066,6 +1645,25 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, m->qk_tensor, C_softmax)); } + // Copy C_softmax to m->softmax_activation_buffer if we need to compute + // PEFT backward + if (bc->requestsInfo[i].peft_bwd) { + DT *C_softmax = static_cast
(m->qk_prods_softmax); + size_t activation_size_needed = + sizeof(DT) * max_peft_tokens * max_peft_tokens * m->num_q_heads; + if (activation_size_needed > m->allocated_peft_buffer_size2) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->softmax_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size2 = activation_size_needed; + } + checkCUDA(cudaMemcpyAsync(m->softmax_activation_buffer, + C_softmax, + sizeof(DT) * total_tokens * num_new_tokens * + m->num_q_heads, + cudaMemcpyDeviceToDevice, + stream)); + } // Step 5: Matmul softmax(QK.T/sqrt(d_k)) by V. Implemented as V @ // softmax(QK.T/sqrt(d_k)).T { @@ -1090,7 +1688,6 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, // To get B, skip over softmax(QK.T/sqrt(d_k)) entries from previous // requests (all heads) DT *B = static_cast
(m->qk_prods_softmax); - ; // matrix C: attn heads // matrix C's layout: [vProjSize, num_heads, num_new_tokens] // To get C, skip over softmax(QK.T/sqrt(d_k))V products from previous @@ -1136,7 +1733,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, /*static*/ void IncMultiHeadSelfAttention::inference_kernel_wrapper( - IncMultiHeadSelfAttentionMeta const *m, + IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, @@ -1206,6 +1803,70 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( } } +/*static*/ +void IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( + IncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + int shard_id, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &bias) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + bool use_bias = *m->qkv_bias || *m->final_bias; + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + // assert(input.data_type == weight.data_type); + assert(input_grad.data_type == output_grad.data_type); + if (use_bias) { + assert(input_grad.data_type == bias.data_type); + } + + if (input_grad.data_type == DT_HALF) { + assert(!m->offload); + half const *bias_ptr = + use_bias ? bias.get_half_ptr() : static_cast(nullptr); + Kernels::IncMultiHeadAttention::peft_bwd_kernel(m, + bc, + shard_id, + input_grad.get_half_ptr(), + weight.get_half_ptr(), + output_grad.get_half_ptr(), + bias_ptr, + stream); + } else if (input_grad.data_type == DT_FLOAT) { + assert(!m->offload); + float const *bias_ptr = + use_bias ? bias.get_float_ptr() : static_cast(nullptr); + Kernels::IncMultiHeadAttention::peft_bwd_kernel(m, + bc, + shard_id, + input_grad.get_float_ptr(), + weight.get_float_ptr(), + output_grad.get_float_ptr(), + bias_ptr, + stream); + } else { + assert(false && "Unspported data type"); + } + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("IncMultiHeadSelfAttention PEFT backward time = %.9fms\n", elapsed); + } +} + IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( FFHandler handler, IncMultiHeadSelfAttention const *attn, @@ -1424,11 +2085,10 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( valueCache = gpu_mem_allocator.allocate_instance_untyped(value_cache_size * size_of_dt); - token_infos = - static_cast(handler.batch_config_metadata); - request_infos = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo)); + token_infos = static_cast( + handler.batch_config_metadata->tokens_info); + request_infos = static_cast( + handler.batch_config_metadata->requestsInfo); if (offload) { // token_infos = @@ -1478,6 +2138,8 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( gpu_mem_allocator.reserved_allocated_size); } } + allocated_peft_buffer_size1 = 0; + allocated_peft_buffer_size2 = 0; cudaStreamSynchronize(stream); } diff --git a/src/ops/kernels/batch_matmul.cpp b/src/ops/kernels/batch_matmul.cpp index 7145af2108..8eeede65c7 100644 --- a/src/ops/kernels/batch_matmul.cpp +++ b/src/ops/kernels/batch_matmul.cpp @@ -13,13 +13,15 @@ * limitations under the License. 
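Editorial sketch (not part of the patch): when qk_prod_scaling is enabled, the strided-batched GEMMs in Steps 5 and 6 of the attention peft_bwd_kernel above compute, independently per attention head, dK = dS^T Q / sqrt(d_k) and dQ = dS K / sqrt(d_k), where dS is the masked gradient of the pre-softmax scores held in qk_prods, Q is the cached query activation, and K is the key cache. The following plain C++ reference uses simplified row-major layouts and invented names, only to make the index arithmetic easier to follow:

#include <cmath>
#include <vector>

// One attention head, simplified layouts (illustrative only):
//   dS: [num_new_tokens][total_tokens]  gradient w.r.t. the masked scores Q K^T / sqrt(d_k)
//   Q : [num_new_tokens][d_k]           cached query activations (query_activation_buffer)
//   K : [total_tokens][d_k]             key cache entries for this request
// Produces dQ: [num_new_tokens][d_k] (Step 6) and dK: [total_tokens][d_k] (Step 5).
inline void attention_score_backward_reference(
    std::vector<std::vector<float>> const &dS,
    std::vector<std::vector<float>> const &Q,
    std::vector<std::vector<float>> const &K,
    std::vector<std::vector<float>> &dQ,
    std::vector<std::vector<float>> &dK) {
  size_t const num_new_tokens = dS.size();
  size_t const total_tokens = K.size();
  size_t const d_k = K.empty() ? 0 : K[0].size();
  float const scale = 1.0f / std::sqrt(static_cast<float>(d_k));
  dQ.assign(num_new_tokens, std::vector<float>(d_k, 0.0f));
  dK.assign(total_tokens, std::vector<float>(d_k, 0.0f));
  for (size_t s = 0; s < num_new_tokens; s++) {
    for (size_t t = 0; t < total_tokens; t++) {
      for (size_t d = 0; d < d_k; d++) {
        dQ[s][d] += scale * dS[s][t] * K[t][d]; // Step 6: dQ = dS K / sqrt(d_k)
        dK[t][d] += scale * dS[s][t] * Q[s][d]; // Step 5: dK = dS^T Q / sqrt(d_k)
      }
    }
  }
}

The device code reaches the same result with one cublasGemmStridedBatchedEx call per step, using the head index as the batch dimension and the lda/ldb/strideA/strideB values to walk the interleaved per-head buffers.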
*/ +#include "flexflow/ops/batch_matmul.h" #include "flexflow/ops/kernels/batch_matmul_kernels.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { -BatchMatmulMeta::BatchMatmulMeta(FFHandler handler) : OpMeta(handler) {} +BatchMatmulMeta::BatchMatmulMeta(FFHandler handler, BatchMatmul const *bmm) + : OpMeta(handler, bmm) {} namespace Kernels { namespace BatchMatmul { diff --git a/src/ops/kernels/batch_matmul.cu b/src/ops/kernels/batch_matmul.cu index ac280db1a4..97f13fa5a8 100644 --- a/src/ops/kernels/batch_matmul.cu +++ b/src/ops/kernels/batch_matmul.cu @@ -13,12 +13,14 @@ * limitations under the License. */ +#include "flexflow/ops/batch_matmul.h" #include "flexflow/ops/kernels/batch_matmul_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -BatchMatmulMeta::BatchMatmulMeta(FFHandler handler) : OpMeta(handler) {} +BatchMatmulMeta::BatchMatmulMeta(FFHandler handler, BatchMatmul const *bmm) + : OpMeta(handler, bmm) {} namespace Kernels { namespace BatchMatmul { diff --git a/src/ops/kernels/cast_kernels.cpp b/src/ops/kernels/cast_kernels.cpp index 16b9b4cec0..1e561959f1 100644 --- a/src/ops/kernels/cast_kernels.cpp +++ b/src/ops/kernels/cast_kernels.cpp @@ -14,12 +14,13 @@ */ #include "flexflow/ops/kernels/cast_kernels.h" +#include "flexflow/ops/cast.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { -CastMeta::CastMeta(FFHandler handle) : OpMeta(handle) {} +CastMeta::CastMeta(FFHandler handle, Cast const *cast) : OpMeta(handle, cast) {} namespace Kernels { namespace Cast { diff --git a/src/ops/kernels/cast_kernels.cu b/src/ops/kernels/cast_kernels.cu index a96f37dbbd..fdce63b9f1 100644 --- a/src/ops/kernels/cast_kernels.cu +++ b/src/ops/kernels/cast_kernels.cu @@ -13,12 +13,13 @@ * limitations under the License. */ +#include "flexflow/ops/cast.h" #include "flexflow/ops/kernels/cast_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -CastMeta::CastMeta(FFHandler handle) : OpMeta(handle) {} +CastMeta::CastMeta(FFHandler handle, Cast const *cast) : OpMeta(handle, cast) {} namespace Kernels { namespace Cast { diff --git a/src/ops/kernels/concat_kernels.cpp b/src/ops/kernels/concat_kernels.cpp index bf5d46b9cc..6c05e0143c 100644 --- a/src/ops/kernels/concat_kernels.cpp +++ b/src/ops/kernels/concat_kernels.cpp @@ -14,6 +14,7 @@ */ #include "flexflow/ops/kernels/concat_kernels.h" +#include "flexflow/ops/concat.h" #include "flexflow/utils/hip_helper.h" #include @@ -23,6 +24,9 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Rect; +ConcatMeta::ConcatMeta(FFHandler handler, Concat const *cc) + : OpMeta(handler, cc) {} + namespace Kernels { namespace Concat { diff --git a/src/ops/kernels/concat_kernels.cu b/src/ops/kernels/concat_kernels.cu index f625560625..2569c36b21 100644 --- a/src/ops/kernels/concat_kernels.cu +++ b/src/ops/kernels/concat_kernels.cu @@ -13,6 +13,7 @@ * limitations under the License. 
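Editorial sketch (not part of the patch): the one-line constructor changes in batch_matmul, cast, concat, conv_2d, dropout, and flat all follow the same refactor: each kernel Meta now forwards the operator pointer to the OpMeta base constructor, so shared per-operator flags such as profiling and inference_debugging can be copied once in the base class rather than field by field in every derived Meta. A minimal illustration with simplified stand-in types (not FlexFlow's real classes):

// Simplified stand-ins, not the real FlexFlow classes.
struct DemoHandler {};

struct DemoOp {
  bool profiling = false;
  bool inference_debugging = false;
};

struct DemoOpMeta {
  explicit DemoOpMeta(DemoHandler h) : handle(h) {}
  // New-style constructor: shared flags are copied once here, so derived Metas
  // no longer need to copy them individually.
  DemoOpMeta(DemoHandler h, DemoOp const *op)
      : handle(h), profiling(op->profiling),
        inference_debugging(op->inference_debugging) {}
  DemoHandler handle;
  bool profiling = false;
  bool inference_debugging = false;
};

struct DemoConcatMeta : DemoOpMeta {
  // Mirrors the shape of ConcatMeta::ConcatMeta(FFHandler, Concat const *).
  DemoConcatMeta(DemoHandler h, DemoOp const *cc) : DemoOpMeta(h, cc) {}
};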
*/ +#include "flexflow/ops/concat.h" #include "flexflow/ops/kernels/concat_kernels.h" #include "flexflow/utils/cuda_helper.h" @@ -22,6 +23,9 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Rect; +ConcatMeta::ConcatMeta(FFHandler handler, Concat const *cc) + : OpMeta(handler, cc) {} + namespace Kernels { namespace Concat { diff --git a/src/ops/kernels/conv_2d_kernels.cpp b/src/ops/kernels/conv_2d_kernels.cpp index 7d2fa20c49..85a94ad6be 100644 --- a/src/ops/kernels/conv_2d_kernels.cpp +++ b/src/ops/kernels/conv_2d_kernels.cpp @@ -14,12 +14,14 @@ */ #include "flexflow/ops/kernels/conv_2d_kernels.h" +#include "flexflow/ops/conv_2d.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { -Conv2DMeta::Conv2DMeta(FFHandler handler) : OpMeta(handler) { +Conv2DMeta::Conv2DMeta(FFHandler handler, Conv2D const *conv) + : OpMeta(handler, conv) { checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&biasTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); @@ -326,7 +328,7 @@ void backward_kernel(Conv2DMeta const *m, output_ptr, n * c * h * w); } - // Compute filter gradiant + // Compute filter gradient // NOTE: we use alpha for kernel_grad to accumulate gradients checkCUDNN(miopenConvolutionBackwardWeights(m->handle.dnn, &alpha, @@ -341,7 +343,7 @@ void backward_kernel(Conv2DMeta const *m, kernel_grad_ptr, m->handle.workSpace, m->handle.workSpaceSize)); - // Compute bias gradiant + // Compute bias gradient // NOTE: we use alpha for bias_grad to accumulate gradients if (bias_grad_ptr != NULL) { checkCUDNN(miopenConvolutionBackwardBias(m->handle.dnn, @@ -352,7 +354,7 @@ void backward_kernel(Conv2DMeta const *m, m->biasTensor, bias_grad_ptr)); } - // Compute data gradiant + // Compute data gradient // NOTE: we use alpha for input_grad to accumulate gradients if (input_grad_ptr != NULL) { checkCUDNN(miopenConvolutionBackwardData(m->handle.dnn, diff --git a/src/ops/kernels/conv_2d_kernels.cu b/src/ops/kernels/conv_2d_kernels.cu index 6c0fd85496..661acdf732 100644 --- a/src/ops/kernels/conv_2d_kernels.cu +++ b/src/ops/kernels/conv_2d_kernels.cu @@ -1,9 +1,11 @@ +#include "flexflow/ops/conv_2d.h" #include "flexflow/ops/kernels/conv_2d_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -Conv2DMeta::Conv2DMeta(FFHandler handler) : OpMeta(handler) { +Conv2DMeta::Conv2DMeta(FFHandler handler, Conv2D const *conv) + : OpMeta(handler, conv) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&biasTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); @@ -309,7 +311,7 @@ void backward_kernel(Conv2DMeta const *m, reluBackward<<>>( output_grad_ptr, output_ptr, n * c * h * w); } - // Compute filter gradiant + // Compute filter gradient // NOTE: we use alpha for kernel_grad to accumulate gradients checkCUDNN(cudnnConvolutionBackwardFilter(m->handle.dnn, &alpha, @@ -324,7 +326,7 @@ void backward_kernel(Conv2DMeta const *m, &alpha, m->filterDesc, kernel_grad_ptr)); - // Compute bias gradiant + // Compute bias gradient // NOTE: we use alpha for bias_grad to accumulate gradients if (bias_grad_ptr != NULL) { checkCUDNN(cudnnConvolutionBackwardBias(m->handle.dnn, @@ -335,7 +337,7 @@ void backward_kernel(Conv2DMeta const *m, m->biasTensor, bias_grad_ptr)); } - // Compute data gradiant + // Compute data gradient // NOTE: we use alpha for input_grad to accumulate gradients if (input_grad_ptr != NULL) { checkCUDNN(cudnnConvolutionBackwardData(m->handle.dnn, diff --git 
a/src/ops/kernels/dropout_kernels.cpp b/src/ops/kernels/dropout_kernels.cpp index 14225f0bce..c8b1887fd4 100644 --- a/src/ops/kernels/dropout_kernels.cpp +++ b/src/ops/kernels/dropout_kernels.cpp @@ -28,7 +28,7 @@ DropoutMeta::DropoutMeta(FFHandler handler, Dropout const *dropout, Memory gpu_mem, Domain const &output_domain) - : OpMeta(handler) { + : OpMeta(handler, dropout) { profiling = dropout->profiling; inference_debugging = dropout->inference_debugging; checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); diff --git a/src/ops/kernels/dropout_kernels.cu b/src/ops/kernels/dropout_kernels.cu index e142bba83b..d65b951f51 100644 --- a/src/ops/kernels/dropout_kernels.cu +++ b/src/ops/kernels/dropout_kernels.cu @@ -27,7 +27,7 @@ DropoutMeta::DropoutMeta(FFHandler handler, Dropout const *dropout, Memory gpu_mem, Domain const &output_domain) - : OpMeta(handler) { + : OpMeta(handler, dropout) { profiling = dropout->profiling; inference_debugging = dropout->inference_debugging; checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); diff --git a/src/ops/kernels/flat_kernels.cpp b/src/ops/kernels/flat_kernels.cpp index be48854fc0..6815ce7492 100644 --- a/src/ops/kernels/flat_kernels.cpp +++ b/src/ops/kernels/flat_kernels.cpp @@ -14,11 +14,15 @@ */ #include "flexflow/ops/kernels/flat_kernels.h" +#include "flexflow/ops/flat.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { +FlatMeta::FlatMeta(FFHandler handler, Flat const *flat) + : OpMeta(handler, flat) {} + namespace Kernels { namespace Flat { diff --git a/src/ops/kernels/flat_kernels.cu b/src/ops/kernels/flat_kernels.cu index 3836c02c94..fc0c0270c1 100644 --- a/src/ops/kernels/flat_kernels.cu +++ b/src/ops/kernels/flat_kernels.cu @@ -13,11 +13,15 @@ * limitations under the License. 
*/ +#include "flexflow/ops/flat.h" #include "flexflow/ops/kernels/flat_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { +FlatMeta::FlatMeta(FFHandler handler, Flat const *flat) + : OpMeta(handler, flat) {} + namespace Kernels { namespace Flat { diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp index 072eb5e96b..a36d6719c9 100644 --- a/src/ops/kernels/linear_kernels.cpp +++ b/src/ops/kernels/linear_kernels.cpp @@ -14,6 +14,8 @@ */ #include "flexflow/ops/kernels/linear_kernels.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/ops/kernels/decompress_kernels.h" #include "flexflow/utils/hip_helper.h" #include @@ -24,24 +26,53 @@ LinearMeta::LinearMeta(FFHandler handler, Linear const *li, MemoryAllocator gpu_mem_allocator, int weightSize) - : OpMeta(handler, li) { + : OpMeta(handler, li), weight_ptr(nullptr) { + DataType data_type = li->data_type; + // allocate weight and bias in the reserve space for cpu offloading + if (li->offload) { + weight_ptr = gpu_mem_allocator.allocate_reserved_untyped( + weightSize * data_type_size(data_type)); + if (li->quantization_type != DT_NONE) { + quantized_weightSize = get_quantization_to_byte_size( + data_type, li->quantization_type, weightSize); + quantized_weight_ptr = + gpu_mem_allocator.allocate_reserved(quantized_weightSize); + } + } // Allocate an all-one's vector - float *dram_one_ptr = (float *)malloc(sizeof(float) * batch_size); - for (int i = 0; i < batch_size; i++) { - dram_one_ptr[i] = 1.0f; + gpu_mem_allocator.create_legion_instance( + reserveInst, data_type_size(data_type) * batch_size); + one_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * batch_size); + int parallelism = batch_size; + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + if (data_type == DT_FLOAT) { + Kernels::Linear::Internal:: + build_one_ptr<<>>((float *)one_ptr, batch_size); + } else if (data_type == DT_HALF) { + Kernels::Linear::Internal:: + build_one_ptr<<>>((half *)one_ptr, batch_size); } - float *fb_one_ptr; - checkCUDA(hipMalloc(&fb_one_ptr, sizeof(float) * batch_size)); - checkCUDA(hipMemcpy(fb_one_ptr, - dram_one_ptr, - sizeof(float) * batch_size, - hipMemcpyHostToDevice)); - one_ptr = (void *)fb_one_ptr; + // Allocate descriptors checkCUDNN(miopenCreateActivationDescriptor(&actiDesc)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); + + allocated_peft_buffer_size = 0; +} + +LinearMeta::~LinearMeta(void) { + if (reserveInst != Realm::RegionInstance::NO_INST) { + reserveInst.destroy(); + } } -LinearMeta::~LinearMeta(void) {} namespace Kernels { namespace Linear { @@ -96,7 +127,61 @@ void forward_kernel_wrapper(LinearMeta const *m, int batch_size) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + if (m->input_type[0] == DT_FLOAT) { + Internal::forward_kernel(m, + input_ptr, + output_ptr, + weight_ptr, + bias_ptr, + in_dim, + out_dim, + batch_size, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::forward_kernel(m, + input_ptr, + output_ptr, + weight_ptr, + bias_ptr, + in_dim, + out_dim, + batch_size, + stream); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + 
checkCUDA(hipEventDestroy(t_end)); + printf("%s [Linear] forward time = %.2lfms\n", m->op_name, elapsed); + // print_tensor((float*)input_ptr, in_dim * batch_size, + // "[Linear:forward:input]"); print_tensor((float*)weight_ptr, in_dim + // * out_dim, "[Linear:forward:kernel]"); + // print_tensor((float*)output_ptr, out_dim * batch_size, + // "[Linear:forward:output]"); + } +} +void inference_kernel_wrapper(LinearMeta *m, + BatchConfig const *bc, + void const *input_ptr, + void *output_ptr, + void const *weight_ptr, + void const *bias_ptr, + int in_dim, + int out_dim, + int batch_size) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); hipEvent_t t_start, t_end; if (m->profiling) { checkCUDA(hipEventCreate(&t_start)); @@ -126,6 +211,67 @@ void forward_kernel_wrapper(LinearMeta const *m, stream); } + if (m->activation == AC_MODE_RELU || m->activation == AC_MODE_SIGMOID) { + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].num_tokens_in_batch; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->output_type[0]) * max_peft_tokens * out_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->output_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy output activation + if (m->output_type[0] == DT_FLOAT) { + checkCUDA(hipMemcpyAsync( + m->output_activation_buffer, + static_cast(output_ptr) + first_token_offset * out_dim, + data_type_size(m->output_type[0]) * num_peft_tokens * out_dim, + hipMemcpyDeviceToDevice, + stream)); + } else if (m->output_type[0] == DT_HALF) { + checkCUDA(hipMemcpyAsync( + m->output_activation_buffer, + static_cast(output_ptr) + first_token_offset * out_dim, + data_type_size(m->output_type[0]) * num_peft_tokens * out_dim, + hipMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + } + if (m->profiling) { checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); @@ -134,12 +280,60 @@ void forward_kernel_wrapper(LinearMeta const *m, checkCUDA(hipEventDestroy(t_start)); checkCUDA(hipEventDestroy(t_end)); printf("%s [Linear] forward time = %.2lfms\n", m->op_name, elapsed); - // print_tensor(acc_input.ptr, acc_input.rect.volume(), - // "[Linear:forward:input]"); print_tensor(acc_kernel.ptr, - // acc_kernel.rect.volume(), "[Linear:forward:kernel]"); - // print_tensor(acc_bias.ptr, acc_bias.rect.volume(), - // "[Linear:forward:bias]"); print_tensor(acc_output.ptr, - // acc_output.rect.volume(), "[Linear:forward:output]"); + } +} + +void 
peft_bwd_kernel_wrapper(LinearMeta const *m, + void *input_grad_ptr, + void *output_grad_ptr, + void const *weight_ptr, + int in_dim, + int out_dim, + int num_infr_tokens, + int num_peft_tokens) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + if (m->input_type[0] == DT_FLOAT) { + Internal::peft_bwd_kernel(m, + input_grad_ptr, + output_grad_ptr, + weight_ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::peft_bwd_kernel(m, + input_grad_ptr, + output_grad_ptr, + weight_ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens, + stream); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("%s [Linear] PEFT Bwd time = %.2lfms\n", m->op_name, elapsed); + // print_tensor((float*)input_ptr, in_dim * batch_size, + // "[Linear:forward:input]"); print_tensor((float*)weight_ptr, in_dim + // * out_dim, "[Linear:forward:kernel]"); + // print_tensor((float*)output_ptr, out_dim * batch_size, + // "[Linear:forward:output]"); } } @@ -223,8 +417,20 @@ Parameter* Linear::get_parameter(int index) } } */ - namespace Internal { + +template +__global__ void AddBiasWithReLU(DT *output_ptr, + DT const *bias_ptr, + int out_dim, + int batch_size) { + CUDA_KERNEL_LOOP(i, out_dim * batch_size) { + int bias_idx = i % out_dim; + DT value = output_ptr[i] + bias_ptr[bias_idx]; + output_ptr[i] = ((float)value > 0.0f) ? value : (DT)0.0f; + } +} + template void forward_kernel(LinearMeta const *m, void const *input_ptr, @@ -234,20 +440,57 @@ void forward_kernel(LinearMeta const *m, int in_dim, int out_dim, int batch_size, - hipStream_t stream) { + ffStream_t stream) { + // additional processing for uploading weights + if (m->offload) { + // Note that we update weight_ptr when uploading weight + if (m->quantization_type != DT_NONE) { + checkCUDA(hipMemcpyAsync(m->quantized_weight_ptr, + weight_ptr, + m->quantized_weightSize, + hipMemcpyHostToDevice, + stream)); + if (m->quantization_type == DT_INT4) { + int parallelism = in_dim * out_dim / 2; + decompress_int4_general_weights
+ <<>>(m->quantized_weight_ptr, + static_cast
(m->weight_ptr), + in_dim, + in_dim * out_dim); + } else { + assert(m->quantization_type == DT_INT8); + int parallelism = in_dim * out_dim; + decompress_int8_general_weights
+ <<>>(m->quantized_weight_ptr, + static_cast
(m->weight_ptr), + in_dim, + in_dim * out_dim); + } + + } else { + checkCUDA(hipMemcpyAsync(m->weight_ptr, + weight_ptr, + in_dim * out_dim * sizeof(DT), + hipMemcpyHostToDevice, + stream)); + } + } checkCUDA(hipblasSetStream(m->handle.blas, stream)); checkCUDNN(miopenSetStream(m->handle.dnn, stream)); DT alpha = 1.0f, beta = 0.0f; hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); - hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); + hipblasDatatype_t weight_type = m->offload + ? ff_to_cuda_datatype(m->weight_ptr_type) + : ff_to_cuda_datatype(m->weight_type[0]); hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - hipblasDatatype_t compute_type = output_type; -#else - // TODO: currently use the output_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + assert(input_type == weight_type && weight_type == output_type); hipblasDatatype_t compute_type = output_type; -#endif checkCUDA(hipblasGemmEx(m->handle.blas, HIPBLAS_OP_T, HIPBLAS_OP_N, @@ -255,7 +498,7 @@ void forward_kernel(LinearMeta const *m, batch_size, in_dim, &alpha, - weight_ptr, + m->offload ? m->weight_ptr : weight_ptr, weight_type, in_dim, input_ptr, @@ -269,6 +512,16 @@ void forward_kernel(LinearMeta const *m, HIPBLAS_GEMM_DEFAULT)); // use_bias = True if (bias_ptr != NULL) { + // fuse bias and relu + if (m->activation == AC_MODE_RELU) { + int parallelism = out_dim * batch_size; + AddBiasWithReLU<<>>( + static_cast
<DT *>(output_ptr), + static_cast
(bias_ptr), + out_dim, + batch_size); + return; + } checkCUDA(hipblasGemmEx(m->handle.blas, HIPBLAS_OP_T, HIPBLAS_OP_N, @@ -306,7 +559,7 @@ void forward_kernel(LinearMeta const *m, GET_BLOCKS(elements), CUDA_NUM_THREADS, 0, - 0, + stream, elements, B, C, @@ -318,6 +571,74 @@ void forward_kernel(LinearMeta const *m, } } +template +void peft_bwd_kernel(LinearMeta const *m, + void *input_grad_ptr, + void *output_grad_ptr, + void const *kernel_ptr, + int in_dim, + int out_dim, + int num_infr_tokens, + int num_peft_tokens, + ffStream_t stream) { + checkCUDA(hipblasSetStream(m->handle.blas, stream)); + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + + hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); + hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); + hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); + // update input_grad_ptr and output_grad_ptr offset + int num_infr_only_tokens = num_infr_tokens - num_peft_tokens; + input_grad_ptr = + static_cast
<DT *>(input_grad_ptr) + num_infr_only_tokens * in_dim; + output_grad_ptr = + static_cast
(output_grad_ptr) + num_infr_only_tokens * out_dim; + hipblasDatatype_t compute_type = output_type; + int output_size = out_dim * num_peft_tokens; + if (m->activation == AC_MODE_RELU) { + relu_backward_kernel(m->output_type[0], + output_grad_ptr, + m->output_activation_buffer, + output_size, + stream); + } else if (m->activation == AC_MODE_SIGMOID) { + sigmoid_backward_kernel(m->output_type[0], + output_grad_ptr, + m->output_activation_buffer, + output_size, + stream); + } else { + // TODO: only support relu and sigmoid for now + assert(m->activation == AC_MODE_NONE); + } + + // Compute data gradient + // NOTE: we use beta=1 for input_grad to accumulate gradients when needed + DT alpha = 1.0f; + DT beta = m->reset_input_grads[0] ? 0.0f : 1.0f; + if (input_grad_ptr != NULL) { + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_N, + in_dim, + num_peft_tokens, + out_dim, + &alpha, + kernel_ptr, + weight_type, + in_dim, + output_grad_ptr, + output_type, + out_dim, + &beta, + input_grad_ptr, + input_type, + in_dim, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } +} + template void backward_kernel(LinearMeta const *m, void const *input_ptr, @@ -335,16 +656,11 @@ void backward_kernel(LinearMeta const *m, checkCUDNN(miopenSetStream(m->handle.dnn, stream)); DT alpha = 1.0f; + float sgeam_alpha = 1.0f; hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) hipblasDatatype_t compute_type = output_type; -#else - // TODO: currently use output_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - hipblasDatatype_t compute_type = output_type; -#endif int output_size = out_dim * batch_size; if (m->activation == AC_MODE_RELU) { relu_backward_kernel( @@ -356,7 +672,7 @@ void backward_kernel(LinearMeta const *m, // TODO: only support relu and sigmoid for now assert(m->activation == AC_MODE_NONE); } - // Compute weight gradiant + // Compute weight gradient // NOTE: we use alpha=1 for kernel_grad to accumulate gradients checkCUDA(hipblasGemmEx(m->handle.blas, HIPBLAS_OP_N, @@ -377,7 +693,27 @@ void backward_kernel(LinearMeta const *m, in_dim, compute_type, HIPBLAS_GEMM_DEFAULT)); - // Compute bias gradiant + if (m->kernel_reg_type == REG_MODE_NONE) { + // do nothing + } else if (m->kernel_reg_type == REG_MODE_L2) { + checkCUDA(hipblasSgeam(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_N, + in_dim, + out_dim, + &sgeam_alpha, + (float *)kernel_grad_ptr, + in_dim, + &(m->kernel_reg_lambda), + (float *)kernel_ptr, + in_dim, + (float *)kernel_grad_ptr, + in_dim)); + } else { + assert(false && "Only L2 regularization is supported"); + } + + // Compute bias gradient // NOTE: we use alpha=1 for bias_grad to accumulate gradients // use_bias = True if (bias_grad_ptr != NULL) { @@ -388,7 +724,7 @@ void backward_kernel(LinearMeta const *m, out_dim, batch_size, &alpha, - m->one_ptr, + static_cast
(m->one_ptr), HIPBLAS_R_32F, 1, output_grad_ptr, @@ -401,7 +737,7 @@ void backward_kernel(LinearMeta const *m, compute_type, HIPBLAS_GEMM_DEFAULT)); } - // Compute data gradiant + // Compute data gradient // NOTE: we use alpha=1 for input_grad to accumulate gradients if (input_grad_ptr != NULL) { checkCUDA(hipblasGemmEx(m->handle.blas, @@ -426,7 +762,14 @@ void backward_kernel(LinearMeta const *m, } } +template +__global__ void build_one_ptr(DT *one_ptr, int batch_size) { + CUDA_KERNEL_LOOP(i, batch_size) { + one_ptr[i] = static_cast
(1.0f); + } +} + } // namespace Internal } // namespace Linear } // namespace Kernels -}; // namespace FlexFlow +} // namespace FlexFlow diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index c30c9f71c1..d4f930db6c 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -63,6 +63,8 @@ LinearMeta::LinearMeta(FFHandler handler, // Allocate descriptors checkCUDNN(cudnnCreateActivationDescriptor(&actiDesc)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); + + allocated_peft_buffer_size = 0; } LinearMeta::~LinearMeta(void) { @@ -170,6 +172,172 @@ void forward_kernel_wrapper(LinearMeta const *m, } } +void inference_kernel_wrapper(LinearMeta *m, + BatchConfig const *bc, + void const *input_ptr, + void *output_ptr, + void const *weight_ptr, + void const *bias_ptr, + int in_dim, + int out_dim, + int batch_size) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + if (m->input_type[0] == DT_FLOAT) { + Internal::forward_kernel(m, + input_ptr, + output_ptr, + weight_ptr, + bias_ptr, + in_dim, + out_dim, + batch_size, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::forward_kernel(m, + input_ptr, + output_ptr, + weight_ptr, + bias_ptr, + in_dim, + out_dim, + batch_size, + stream); + } + + if (m->activation == AC_MODE_RELU || m->activation == AC_MODE_SIGMOID) { + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].num_tokens_in_batch; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->output_type[0]) * max_peft_tokens * out_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->output_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy output activation + if (m->output_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->output_activation_buffer, + static_cast(output_ptr) + first_token_offset * out_dim, + data_type_size(m->output_type[0]) * num_peft_tokens * out_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (m->output_type[0] == DT_HALF) { + checkCUDA(cudaMemcpyAsync( + m->output_activation_buffer, + static_cast(output_ptr) + first_token_offset * out_dim, + data_type_size(m->output_type[0]) * num_peft_tokens * out_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + 
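Editorial sketch (not part of the patch): the Linear inference_kernel_wrapper above saves the post-activation output of the single request that has peft_bwd set, growing a per-meta scratch buffer only when a larger request arrives, so that peft_bwd_kernel can later evaluate the ReLU/sigmoid derivative from the stored activations instead of rerunning the forward pass. A condensed host-side analogue of that bookkeeping, with illustrative names and std::memcpy standing in for cudaMemcpyAsync:

#include <cstring>
#include <vector>

// Host-side analogue of the lazily grown PEFT activation cache (illustrative only;
// the kernel allocates device memory from handle.peft_activation_allocator).
struct DemoPeftActivationCache {
  std::vector<unsigned char> buffer;
  size_t allocated_size = 0;

  void save(void const *output,
            size_t first_token_offset,
            size_t num_peft_tokens,
            size_t max_peft_tokens,
            size_t out_dim,
            size_t dtype_size) {
    size_t const needed = dtype_size * max_peft_tokens * out_dim;
    if (needed > allocated_size) { // mirrors the allocated_peft_buffer_size check
      buffer.resize(needed);
      allocated_size = needed;
    }
    unsigned char const *src = static_cast<unsigned char const *>(output) +
                               first_token_offset * out_dim * dtype_size;
    // cudaMemcpyAsync(..., cudaMemcpyDeviceToDevice, stream) in the real wrapper.
    std::memcpy(buffer.data(), src, dtype_size * num_peft_tokens * out_dim);
  }
};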
checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("%s [Linear] inference time = %.2lfms\n", m->op_name, elapsed); + } +} + +void peft_bwd_kernel_wrapper(LinearMeta const *m, + void *input_grad_ptr, + void *output_grad_ptr, + void const *weight_ptr, + int in_dim, + int out_dim, + int num_infr_tokens, + int num_peft_tokens) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + if (m->input_type[0] == DT_FLOAT) { + Internal::peft_bwd_kernel(m, + input_grad_ptr, + output_grad_ptr, + weight_ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::peft_bwd_kernel(m, + input_grad_ptr, + output_grad_ptr, + weight_ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens, + stream); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("%s [Linear] PEFT Bwd time = %.2lfms\n", m->op_name, elapsed); + // print_tensor((float*)input_ptr, in_dim * batch_size, + // "[Linear:forward:input]"); print_tensor((float*)weight_ptr, in_dim + // * out_dim, "[Linear:forward:kernel]"); + // print_tensor((float*)output_ptr, out_dim * batch_size, + // "[Linear:forward:output]"); + } +} + void backward_kernel_wrapper(LinearMeta const *m, void const *input_ptr, void *input_grad_ptr, @@ -323,17 +491,7 @@ void forward_kernel(LinearMeta const *m, : ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); assert(input_type == weight_type && weight_type == output_type); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif + cudaDataType_t compute_type = output_type; checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, @@ -398,7 +556,7 @@ void forward_kernel(LinearMeta const *m, size_t elements = (size_t)out_dim * (size_t)batch_size; constexpr float B = 0.7978845608028654f; // sqrt(2.0/M_PI) constexpr float C = 0.035677408136300125f; // 0.044715 * sqrt(2.0/M_PI) - gelu_forward_kernel<<>>( + gelu_forward_kernel<<>>( elements, B, C, (float *)output_ptr); } else if (m->activation == AC_MODE_NONE) { // Do nothing @@ -407,6 +565,74 @@ void forward_kernel(LinearMeta const *m, } } +template +void peft_bwd_kernel(LinearMeta const *m, + void *input_grad_ptr, + void *output_grad_ptr, + void const *kernel_ptr, + int in_dim, + int out_dim, + int num_infr_tokens, + int num_peft_tokens, + ffStream_t stream) { + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + + cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); + cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); + cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); + // update 
input_grad_ptr and output_grad_ptr offset + int num_infr_only_tokens = num_infr_tokens - num_peft_tokens; + input_grad_ptr = + static_cast
<DT *>(input_grad_ptr) + num_infr_only_tokens * in_dim; + output_grad_ptr = + static_cast
(output_grad_ptr) + num_infr_only_tokens * out_dim; + cudaDataType_t compute_type = output_type; + int output_size = out_dim * num_peft_tokens; + if (m->activation == AC_MODE_RELU) { + relu_backward_kernel(m->output_type[0], + output_grad_ptr, + m->output_activation_buffer, + output_size, + stream); + } else if (m->activation == AC_MODE_SIGMOID) { + sigmoid_backward_kernel(m->output_type[0], + output_grad_ptr, + m->output_activation_buffer, + output_size, + stream); + } else { + // TODO: only support relu and sigmoid for now + assert(m->activation == AC_MODE_NONE); + } + + // Compute data gradient + // NOTE: we use beta=1 for input_grad to accumulate gradients when needed + DT alpha = 1.0f; + DT beta = m->reset_input_grads[0] ? 0.0f : 1.0f; + if (input_grad_ptr != NULL) { + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_N, + in_dim, + num_peft_tokens, + out_dim, + &alpha, + kernel_ptr, + weight_type, + in_dim, + output_grad_ptr, + output_type, + out_dim, + &beta, + input_grad_ptr, + input_type, + in_dim, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } +} + template void backward_kernel(LinearMeta const *m, void const *input_ptr, @@ -428,17 +654,7 @@ void backward_kernel(LinearMeta const *m, cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif + cudaDataType_t compute_type = output_type; int output_size = out_dim * batch_size; if (m->activation == AC_MODE_RELU) { relu_backward_kernel( @@ -450,7 +666,7 @@ void backward_kernel(LinearMeta const *m, // TODO: only support relu and sigmoid for now assert(m->activation == AC_MODE_NONE); } - // Compute weight gradiant + // Compute weight gradient // NOTE: we use alpha=1 for kernel_grad to accumulate gradients checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, @@ -491,7 +707,7 @@ void backward_kernel(LinearMeta const *m, assert(false && "Only L2 regularization is supported"); } - // Compute bias gradiant + // Compute bias gradient // NOTE: we use alpha=1 for bias_grad to accumulate gradients // use_bias = True if (bias_grad_ptr != NULL) { @@ -515,7 +731,7 @@ void backward_kernel(LinearMeta const *m, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } - // Compute data gradiant + // Compute data gradient // NOTE: we use alpha=1 for input_grad to accumulate gradients if (input_grad_ptr != NULL) { checkCUDA(cublasGemmEx(m->handle.blas, diff --git a/src/ops/kernels/lora_linear_kernels.cpp b/src/ops/kernels/lora_linear_kernels.cpp new file mode 100644 index 0000000000..c3c2cce3cf --- /dev/null +++ b/src/ops/kernels/lora_linear_kernels.cpp @@ -0,0 +1,576 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
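Editorial sketch (not part of the patch): the LoraLinear kernels introduced in this new file apply the standard low-rank adapter update, out += (lora_alpha / rank) * B (A x), which is what the two GEMMs with scaling_constant in inference_kernel compute (the first builds the rank-sized intermediate in low_rank_activation or the workspace, the second accumulates into the frozen layer's output with beta = alpha). A scalar reference with simplified row-major layouts and illustrative names:

#include <vector>

// Per-token LoRA update: out += (lora_alpha / rank) * B * (A * x).
// Layouts are simplified to row-major in this sketch:
//   A (w0): [rank][in_dim], B (w1): [out_dim][rank], x: [in_dim],
//   out: [out_dim] and already holds the frozen layer's W * x.
inline void lora_apply_reference(std::vector<float> const &x,
                                 std::vector<float> const &A,
                                 std::vector<float> const &B,
                                 std::vector<float> &out,
                                 int in_dim,
                                 int out_dim,
                                 int rank,
                                 float lora_alpha) {
  float const scaling = lora_alpha / static_cast<float>(rank);
  std::vector<float> low_rank(rank, 0.0f); // analogue of low_rank_activation
  for (int r = 0; r < rank; r++) {
    for (int i = 0; i < in_dim; i++) {
      low_rank[r] += A[r * in_dim + i] * x[i]; // first GEMM: A x
    }
  }
  for (int o = 0; o < out_dim; o++) {
    for (int r = 0; r < rank; r++) {
      out[o] += scaling * B[o * rank + r] * low_rank[r]; // second GEMM, in-place accumulate
    }
  }
}

The backward kernel later reuses low_rank_activation and the cached input activation to form the gradients of B, A, and the input with the same scaling factor.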
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ops/kernels/lora_linear_kernels.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/ops/kernels/decompress_kernels.h" +#include "flexflow/utils/hip_helper.h" +#include +#include +#include + +namespace FlexFlow { + +LoraLinearMeta::LoraLinearMeta(FFHandler handler, LoraLinear const *li) + : OpMeta(handler, li) { + allocated_peft_buffer_size1 = 0; + allocated_peft_buffer_size2 = 0; +} + +LoraLinearMeta::~LoraLinearMeta(void) {} + +namespace Kernels { +namespace LoraLinear { + +void init_kernel_wrapper(LoraLinearMeta *m, int seed) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + if (m->input_type[0] == DT_FLOAT) { + Internal::init_kernel(m, seed, stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::init_kernel(m, seed, stream); + } else { + assert(false && "Unsupported data type"); + } +} + +void inference_kernel_wrapper(LoraLinearMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + if (m->input_type[0] == DT_FLOAT) { + Internal::inference_kernel(m, + bc, + input.get_float_ptr(), + output.get_float_ptr(), + in_dim, + out_dim, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::inference_kernel(m, + bc, + input.get_half_ptr(), + output.get_half_ptr(), + in_dim, + out_dim, + stream); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("%s [LoraLinear] forward time = %.2lfms\n", m->op_name, elapsed); + // print_tensor((float*)input_ptr, in_dim * batch_size, + // "[LoraLinear:forward:input]"); print_tensor((float*)weight_ptr, + // in_dim + // * out_dim, "[LoraLinear:forward:kernel]"); + // print_tensor((float*)output_ptr, out_dim * batch_size, + // "[LoraLinear:forward:output]"); + } +} + +void peft_bwd_kernel_wrapper(LoraLinearMeta *m, + BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + if (m->input_type[0] == DT_FLOAT) { + Internal::peft_bwd_kernel(m, + bc, + input_grad.get_float_ptr(), + output_grad.get_float_ptr(), + in_dim, + out_dim, + stream); + } else if (m->input_type[0] == DT_HALF) { + 
Internal::peft_bwd_kernel(m, + bc, + input_grad.get_half_ptr(), + output_grad.get_half_ptr(), + in_dim, + out_dim, + stream); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("%s [LoraLinear] PEFT Bwd time = %.2lfms\n", m->op_name, elapsed); + // print_tensor((float*)input_ptr, in_dim * batch_size, + // "[LoraLinear:forward:input]"); print_tensor((float*)weight_ptr, + // in_dim + // * out_dim, "[LoraLinear:forward:kernel]"); + // print_tensor((float*)output_ptr, out_dim * batch_size, + // "[LoraLinear:forward:output]"); + } +} + +namespace Internal { + +template +void init_kernel(LoraLinearMeta *m, int seed, hipStream_t stream) { + // Initialize generator + std::mt19937 gen(seed); + + // Get handle to weights by iterating over m->model_state to get each + // LoraLinearWeight object + for (auto &model_state : m->model_state) { + LoraLinearWeight weight = model_state.second.weights; + int w0_num_elements = weight.rank * weight.in_dim; + int w1_num_elements = weight.rank * weight.out_dim; + + // LoRA_A weight: [in_dim, rank] + float stdv_lora_a = 1.0f / sqrt(weight.in_dim); + std::uniform_real_distribution dis_lora_a(-stdv_lora_a, stdv_lora_a); + std::vector
<DT> lora_a_random_init(w0_num_elements); + for (auto &num : lora_a_random_init) { + float num_float = dis_lora_a(gen); + if (std::is_same<DT, half>::value) { + num = __float2half(num_float); + } else { + num = num_float; + } + } + checkCUDA(hipMemcpyAsync(static_cast
(weight.w0_ptr), + lora_a_random_init.data(), + w0_num_elements * sizeof(DT), + hipMemcpyHostToDevice, + stream)); + + // LoRA_B weight: [rank, out_dim] + float stdv_lora_b = 1.0f / sqrt(weight.rank); + std::uniform_real_distribution dis_lora_b(-stdv_lora_b, stdv_lora_b); + std::vector lora_b_random_init(w1_num_elements); + for (auto &num : lora_b_random_init) { + float num_float = dis_lora_b(gen); + if (std::is_same::value) { + num = __float2half(num_float); + } else { + num = num_float; + } + } + checkCUDA(hipMemcpyAsync(static_cast
(weight.w1_ptr), + lora_b_random_init.data(), + w1_num_elements * sizeof(DT), + hipMemcpyHostToDevice, + stream)); + } +} + +template +void inference_kernel(LoraLinearMeta *m, + BatchConfig const *bc, + DT const *input_ptr, + DT *output_ptr, + int in_dim, + int out_dim, + ffStream_t stream) { + checkCUDA(hipblasSetStream(m->handle.blas, stream)); + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + DT alpha = 1.0f, beta = 0.0f; + hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); + hipblasDatatype_t output_type = ff_to_cuda_datatype(m->input_type[1]); + hipblasDatatype_t lr_actv_type = output_type; + assert(input_type == output_type); + hipblasDatatype_t weight_type = output_type; + hipblasDatatype_t compute_type = output_type; + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipDataType compute_type = output_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->input_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + // Assert that we have at most one request that requires peft_bwd + assert(num_peft_requests <= 1); + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != + m->model_state.end()); + LoraLinearWeight weight = + m->model_state[bc->requestsInfo[i].peft_model_id].weights; + int rank = weight.rank; + void *intermediate_result_ptr = nullptr; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed1 = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + size_t activation_size_needed2 = + data_type_size(m->input_type[1]) * max_peft_tokens * rank; + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + if (activation_size_needed1 > m->allocated_peft_buffer_size1) { + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed1); + m->allocated_peft_buffer_size1 = activation_size_needed1; + } + if (activation_size_needed2 > m->allocated_peft_buffer_size2) { + m->low_rank_activation = + allocator->allocate_instance_untyped(activation_size_needed2); + m->allocated_peft_buffer_size2 = activation_size_needed2; + } + // copy input activation + checkCUDA(hipMemcpyAsync(m->input_activation, + input_ptr + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * + num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + intermediate_result_ptr = m->low_rank_activation; + } else { + // use workspace to save intermediate result + assert(m->handle.workSpaceSize >= + data_type_size(m->input_type[1]) * num_peft_tokens * rank); + intermediate_result_ptr = m->handle.workSpace; + } + // buffer = weight_first * input + // [rank, 
num_peft_tokens] = [in_dim, rank].T * [in_dim, num_peft_tokens] + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + rank, + num_peft_tokens, + in_dim, + &alpha, + weight.w0_ptr, + weight_type, + in_dim, + input_ptr + first_token_offset * in_dim, + input_type, + in_dim, + &beta, + intermediate_result_ptr, + lr_actv_type, + rank, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + // output = weight_second * buffer + // [out_dim, num_peft_tokens] = [rank, out_dim].T * [rank, num_peft_tokens] + // Note that we use alpha in both places since we do + // an in-place update for LoraLinear + float lora_alpha = + m->model_state[bc->requestsInfo[i].peft_model_id].lora_alpha; + DT scaling_constant = (DT)(lora_alpha / rank); + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + out_dim, + num_peft_tokens, + rank, + &scaling_constant, + weight.w1_ptr, + weight_type, + rank, + intermediate_result_ptr, + lr_actv_type, + rank, + &alpha, + output_ptr + first_token_offset * out_dim, + output_type, + out_dim, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } +} + +template +__global__ void sgd_update(size_t count, + float lr, + float weight_decay, + float momentum, + bool nesterov, + DT const *WGrad, + DT *V, + DT *W) { + // Refernce https://pytorch.org/docs/stable/_modules/torch/optim/sgd.html#SGD + CUDA_KERNEL_LOOP(i, count) { + DT gt = WGrad[i] + (DT)weight_decay * W[i]; + if (momentum > 0.0f) { + V[i] = V[i] * (DT)momentum + gt; + if (nesterov) { + gt = gt + (DT)momentum * V[i]; + } else { + gt = V[i]; + } + } + W[i] -= (DT)lr * gt; + } +} + +template +void peft_bwd_kernel(LoraLinearMeta *m, + BatchConfig const *bc, + DT *input_grad_ptr, + DT const *output_grad_ptr, + int in_dim, + int out_dim, + ffStream_t stream) { + checkCUDA(hipblasSetStream(m->handle.blas, stream)); + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); + hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); + assert(input_type == output_type); + hipblasDatatype_t weight_type = output_type; + hipblasDatatype_t lr_actv_type = output_type; + hipblasDatatype_t compute_type = output_type; + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipDataType compute_type = output_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + // Skip PEFT forward-only requests + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + // int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != + m->model_state.end()); + LoraLinearWeight weight = + m->model_state[bc->requestsInfo[i].peft_model_id].weights; + int rank = weight.rank; + float lora_alpha = + m->model_state[bc->requestsInfo[i].peft_model_id].lora_alpha; + DT scaling_constant = (DT)(lora_alpha / rank); + + // Compute LORA_B weight's gradient + if 
(bc->requestsInfo[i].optimizer_tasks.compute_gradients) { + DT alpha = 1.0f; + DT beta = (bc->requestsInfo[i].optimizer_tasks.reset_gradients_to_zero) + ? 0.0f + : 1.0f; + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_T, + rank, + out_dim, + num_peft_tokens, + &scaling_constant, + m->low_rank_activation, + lr_actv_type, + rank, + output_grad_ptr, + output_type, + out_dim, + &beta, + weight.w1_grad_ptr, + weight_type, + rank, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + + // Compute LORA_B input's (and LORA_A output's) gradient inplace in + // low_rank_activation + { + DT alpha = 1.0f, beta = 0.0f; + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_N, + rank, + num_peft_tokens, + out_dim, + &scaling_constant, + weight.w1_ptr, + weight_type, + rank, + output_grad_ptr, + output_type, + out_dim, + &beta, + m->low_rank_activation, + lr_actv_type, + rank, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + + // Compute LORA_A weight's gradient + if (bc->requestsInfo[i].optimizer_tasks.compute_gradients) { + DT alpha = 1.0f; + DT beta = (bc->requestsInfo[i].optimizer_tasks.reset_gradients_to_zero) + ? 0.0f + : 1.0f; + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_T, + in_dim, + rank, + num_peft_tokens, + &alpha, + m->input_activation, + input_type, + in_dim, + m->low_rank_activation, + lr_actv_type, + rank, + &beta, + weight.w0_grad_ptr, + weight_type, + in_dim, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + // Compute input gradient + // NOTE: we use beta=1 for input_grad to accumulate gradients when needed + if (input_grad_ptr != nullptr) { + DT alpha = 1.0f; + DT beta = m->reset_input_grads[0] ? 0.0f : 1.0f; + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_N, + in_dim, + num_peft_tokens, + rank, + &alpha, + weight.w0_ptr, + weight_type, + in_dim, + m->low_rank_activation, + lr_actv_type, + rank, + &beta, + input_grad_ptr, + input_type, + in_dim, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + + if (bc->requestsInfo[i].optimizer_tasks.update_weights) { + LoraOptimizerConfig const *optimizer_config = + m->model_state[bc->requestsInfo[i].peft_model_id].optimizer_config; + assert(optimizer_config != nullptr); + assert(typeid(*optimizer_config) != typeid(LoraOptimizerConfig)); + int w0_num_elements = rank * in_dim; + int w1_num_elements = rank * out_dim; + + // Get optimizer config + if (typeid(*optimizer_config) == typeid(LoraSGDOptimizerConfig)) { + LoraSGDOptimizerConfig const *sgd_config = + (LoraSGDOptimizerConfig const *)optimizer_config; + // LoRA_A weight is split in tensor parallelism, so no need to apply + // all-reduce + sgd_update<<>>(w0_num_elements, + sgd_config->lr, + sgd_config->weight_decay, + sgd_config->momentum, + sgd_config->nesterov, + static_cast
<DT const *>(weight.w0_grad_ptr), + static_cast
<DT *>(weight.w0_v_values_ptr), + static_cast
<DT *>(weight.w0_ptr)); + // LoRA_B weight is replicated w tensor parallelism, so we need to sync + // and sum first + ncclDataType_t nccl_data_type = ff_to_nccl_datatype(m->output_type[0]); + checkCUDA(ncclAllReduce(static_cast
<DT const *>(weight.w1_grad_ptr), + static_cast
<DT *>(weight.w1_grad_ptr), + w1_num_elements, + nccl_data_type, + ncclSum, + m->handle.ncclComm, + stream)); + sgd_update<<<GET_BLOCKS(w1_num_elements), CUDA_NUM_THREADS, 0, stream>>>(w1_num_elements, + sgd_config->lr, + sgd_config->weight_decay, + sgd_config->momentum, + sgd_config->nesterov, + static_cast
<DT const *>(weight.w1_grad_ptr), + static_cast
<DT *>(weight.w1_v_values_ptr), + static_cast<DT *>
(weight.w1_ptr)); + } else if (typeid(*optimizer_config) == typeid(LoraAdamOptimizerConfig)) { + assert(false && "Adam optimizer type not implemented yet"); + } else { + assert(false && "Unsupported optimizer type"); + } + } + } +} + +} // namespace Internal +} // namespace LoraLinear +} // namespace Kernels +} // namespace FlexFlow diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu new file mode 100644 index 0000000000..5f130782aa --- /dev/null +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -0,0 +1,579 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ffconst_utils.h" +#include "flexflow/ops/kernels/decompress_kernels.h" +#include "flexflow/ops/kernels/lora_linear_kernels.h" +#include "flexflow/utils/cuda_helper.h" +#include +#include + +namespace FlexFlow { + +LoraLinearMeta::LoraLinearMeta(FFHandler handler, LoraLinear const *li) + : OpMeta(handler, li) { + allocated_peft_buffer_size1 = 0; + allocated_peft_buffer_size2 = 0; +} + +LoraLinearMeta::~LoraLinearMeta(void) {} + +namespace Kernels { +namespace LoraLinear { + +void init_kernel_wrapper(LoraLinearMeta *m, int seed) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + if (m->input_type[0] == DT_FLOAT) { + Internal::init_kernel(m, seed, stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::init_kernel(m, seed, stream); + } else { + assert(false && "Unsupported data type"); + } +} + +void inference_kernel_wrapper(LoraLinearMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + if (m->input_type[0] == DT_FLOAT) { + Internal::inference_kernel(m, + bc, + input.get_float_ptr(), + output.get_float_ptr(), + in_dim, + out_dim, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::inference_kernel(m, + bc, + input.get_half_ptr(), + output.get_half_ptr(), + in_dim, + out_dim, + stream); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("%s [LoraLinear] forward time = %.2lfms\n", m->op_name, elapsed); + // print_tensor((float*)input_ptr, in_dim * batch_size, + // "[LoraLinear:forward:input]"); print_tensor((float*)weight_ptr, + // in_dim + // * out_dim, "[LoraLinear:forward:kernel]"); + // print_tensor((float*)output_ptr, out_dim * batch_size, + // "[LoraLinear:forward:output]"); + } +} + +void peft_bwd_kernel_wrapper(LoraLinearMeta *m, + 
BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + if (m->input_type[0] == DT_FLOAT) { + Internal::peft_bwd_kernel(m, + bc, + input_grad.get_float_ptr(), + output_grad.get_float_ptr(), + in_dim, + out_dim, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::peft_bwd_kernel(m, + bc, + input_grad.get_half_ptr(), + output_grad.get_half_ptr(), + in_dim, + out_dim, + stream); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("%s [LoraLinear] PEFT Bwd time = %.2lfms\n", m->op_name, elapsed); + // print_tensor((float*)input_ptr, in_dim * batch_size, + // "[LoraLinear:forward:input]"); print_tensor((float*)weight_ptr, + // in_dim + // * out_dim, "[LoraLinear:forward:kernel]"); + // print_tensor((float*)output_ptr, out_dim * batch_size, + // "[LoraLinear:forward:output]"); + } +} + +namespace Internal { + +template +void init_kernel(LoraLinearMeta *m, int seed, cudaStream_t stream) { + // Initialize generator + std::mt19937 gen(seed); + + // Get handle to weights by iterating over m->model_state to get each + // LoraLinearWeight object + for (auto &model_state : m->model_state) { + LoraLinearWeight weight = model_state.second.weights; + int w0_num_elements = weight.rank * weight.in_dim; + int w1_num_elements = weight.rank * weight.out_dim; + + // LoRA_A weight: [in_dim, rank] + float stdv_lora_a = 1.0f / sqrt(weight.in_dim); + std::uniform_real_distribution dis_lora_a(-stdv_lora_a, stdv_lora_a); + std::vector
<DT> lora_a_random_init(w0_num_elements); + for (auto &num : lora_a_random_init) { + float num_float = dis_lora_a(gen); + if (std::is_same<DT, half>::value) { + num = __float2half(num_float); + } else { + num = num_float; + } + } + checkCUDA(cudaMemcpyAsync(static_cast
<DT *>(weight.w0_ptr), + lora_a_random_init.data(), + w0_num_elements * sizeof(DT), + cudaMemcpyHostToDevice, + stream)); + + // LoRA_B weight: [rank, out_dim] + float stdv_lora_b = 1.0f / sqrt(weight.rank); + std::uniform_real_distribution<float> dis_lora_b(-stdv_lora_b, stdv_lora_b); + std::vector<DT> lora_b_random_init(w1_num_elements); + for (auto &num : lora_b_random_init) { + float num_float = dis_lora_b(gen); + if (std::is_same<DT, half>::value) { + num = __float2half(num_float); + } else { + num = num_float; + } + } + checkCUDA(cudaMemcpyAsync(static_cast<DT *>
(weight.w1_ptr), + lora_b_random_init.data(), + w1_num_elements * sizeof(DT), + cudaMemcpyHostToDevice, + stream)); + } +} + +template +void inference_kernel(LoraLinearMeta *m, + BatchConfig const *bc, + DT const *input_ptr, + DT *output_ptr, + int in_dim, + int out_dim, + ffStream_t stream) { + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + DT alpha = 1.0f, beta = 0.0f; + cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); + cudaDataType_t output_type = ff_to_cuda_datatype(m->input_type[1]); + cudaDataType_t lr_actv_type = output_type; + assert(input_type == output_type); + cudaDataType_t weight_type = output_type; + cudaDataType_t compute_type = output_type; + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = output_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->input_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + // Assert that we have at most one request that requires peft_bwd + assert(num_peft_requests <= 1); + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != + m->model_state.end()); + LoraLinearWeight weight = + m->model_state[bc->requestsInfo[i].peft_model_id].weights; + int rank = weight.rank; + void *intermediate_result_ptr = nullptr; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed1 = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + size_t activation_size_needed2 = + data_type_size(m->input_type[1]) * max_peft_tokens * rank; + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + if (activation_size_needed1 > m->allocated_peft_buffer_size1) { + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed1); + m->allocated_peft_buffer_size1 = activation_size_needed1; + } + if (activation_size_needed2 > m->allocated_peft_buffer_size2) { + m->low_rank_activation = + allocator->allocate_instance_untyped(activation_size_needed2); + m->allocated_peft_buffer_size2 = activation_size_needed2; + } + // copy input activation + checkCUDA(cudaMemcpyAsync(m->input_activation, + input_ptr + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * + num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + intermediate_result_ptr = m->low_rank_activation; + } else { + // use workspace to save intermediate result + assert(m->handle.workSpaceSize >= + data_type_size(m->input_type[1]) * num_peft_tokens * rank); + intermediate_result_ptr = m->handle.workSpace; + } + // buffer = weight_first * input + // [rank, 
num_peft_tokens] = [in_dim, rank].T * [in_dim, num_peft_tokens] + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + rank, + num_peft_tokens, + in_dim, + &alpha, + weight.w0_ptr, + weight_type, + in_dim, + input_ptr + first_token_offset * in_dim, + input_type, + in_dim, + &beta, + intermediate_result_ptr, + lr_actv_type, + rank, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // output = weight_second * buffer + // [out_dim, num_peft_tokens] = [rank, out_dim].T * [rank, num_peft_tokens] + // Note that we use alpha in both places since we do + // an in-place update for LoraLinear + float lora_alpha = + m->model_state[bc->requestsInfo[i].peft_model_id].lora_alpha; + DT scaling_constant = (DT)(lora_alpha / rank); + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + out_dim, + num_peft_tokens, + rank, + &scaling_constant, + weight.w1_ptr, + weight_type, + rank, + intermediate_result_ptr, + lr_actv_type, + rank, + &alpha, + output_ptr + first_token_offset * out_dim, + output_type, + out_dim, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } +} + +template +__global__ void sgd_update(size_t count, + float lr, + float weight_decay, + float momentum, + bool nesterov, + DT const *WGrad, + DT *V, + DT *W) { + // Refernce https://pytorch.org/docs/stable/_modules/torch/optim/sgd.html#SGD + CUDA_KERNEL_LOOP(i, count) { + DT gt = WGrad[i] + (DT)weight_decay * W[i]; + if (momentum > 0.0f) { + V[i] = V[i] * (DT)momentum + gt; + if (nesterov) { + gt = gt + (DT)momentum * V[i]; + } else { + gt = V[i]; + } + } + W[i] -= (DT)lr * gt; + } +} + +template +void peft_bwd_kernel(LoraLinearMeta *m, + BatchConfig const *bc, + DT *input_grad_ptr, + DT const *output_grad_ptr, + int in_dim, + int out_dim, + ffStream_t stream) { + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); + cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); + assert(input_type == output_type); + cudaDataType_t weight_type = output_type; + cudaDataType_t lr_actv_type = output_type; + cudaDataType_t compute_type = output_type; + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = output_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + // Skip PEFT forward-only requests + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + // int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != + m->model_state.end()); + LoraLinearWeight weight = + m->model_state[bc->requestsInfo[i].peft_model_id].weights; + int rank = weight.rank; + float lora_alpha = + m->model_state[bc->requestsInfo[i].peft_model_id].lora_alpha; + DT scaling_constant = (DT)(lora_alpha / rank); + + // Compute LORA_B weight's gradient + if (bc->requestsInfo[i].optimizer_tasks.compute_gradients) 
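+      // Shape sketch for the GEMM inside this branch (a reading of the call
+      // below, not new logic; matrices are column-major):
+      //   dW_B[rank, out_dim] = (lora_alpha / rank)
+      //       * low_rank_activation[rank, num_peft_tokens]
+      //       * output_grad[out_dim, num_peft_tokens]^T,
+      // with beta = 0.0 resetting and beta = 1.0 accumulating the gradient.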
{ + DT alpha = 1.0f; + DT beta = (bc->requestsInfo[i].optimizer_tasks.reset_gradients_to_zero) + ? 0.0f + : 1.0f; + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + rank, + out_dim, + num_peft_tokens, + &scaling_constant, + m->low_rank_activation, + lr_actv_type, + rank, + output_grad_ptr, + output_type, + out_dim, + &beta, + weight.w1_grad_ptr, + weight_type, + rank, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + + // Compute LORA_B input's (and LORA_A output's) gradient inplace in + // low_rank_activation + { + DT alpha = 1.0f, beta = 0.0f; + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_N, + rank, + num_peft_tokens, + out_dim, + &scaling_constant, + weight.w1_ptr, + weight_type, + rank, + output_grad_ptr, + output_type, + out_dim, + &beta, + m->low_rank_activation, + lr_actv_type, + rank, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + + // Compute LORA_A weight's gradient + if (bc->requestsInfo[i].optimizer_tasks.compute_gradients) { + DT alpha = 1.0f; + DT beta = (bc->requestsInfo[i].optimizer_tasks.reset_gradients_to_zero) + ? 0.0f + : 1.0f; + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + in_dim, + rank, + num_peft_tokens, + &alpha, + m->input_activation, + input_type, + in_dim, + m->low_rank_activation, + lr_actv_type, + rank, + &beta, + weight.w0_grad_ptr, + weight_type, + in_dim, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + // Compute input gradient + // NOTE: we use beta=1 for input_grad to accumulate gradients when needed + if (input_grad_ptr != nullptr) { + DT alpha = 1.0f; + DT beta = m->reset_input_grads[0] ? 0.0f : 1.0f; + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_N, + in_dim, + num_peft_tokens, + rank, + &alpha, + weight.w0_ptr, + weight_type, + in_dim, + m->low_rank_activation, + lr_actv_type, + rank, + &beta, + input_grad_ptr, + input_type, + in_dim, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + + if (bc->requestsInfo[i].optimizer_tasks.update_weights) { + LoraOptimizerConfig const *optimizer_config = + m->model_state[bc->requestsInfo[i].peft_model_id].optimizer_config; + assert(optimizer_config != nullptr); + assert(typeid(*optimizer_config) != typeid(LoraOptimizerConfig)); + int w0_num_elements = rank * in_dim; + int w1_num_elements = rank * out_dim; + + // Get optimizer config + if (typeid(*optimizer_config) == typeid(LoraSGDOptimizerConfig)) { + LoraSGDOptimizerConfig const *sgd_config = + (LoraSGDOptimizerConfig const *)optimizer_config; + // LoRA_A weight is split in tensor parallelism, so no need to apply + // all-reduce + sgd_update<<>>(w0_num_elements, + sgd_config->lr, + sgd_config->weight_decay, + sgd_config->momentum, + sgd_config->nesterov, + static_cast
<DT const *>(weight.w0_grad_ptr), + static_cast
<DT *>(weight.w0_v_values_ptr), + static_cast
<DT *>(weight.w0_ptr)); + // LoRA_B weight is replicated w tensor parallelism, so we need to sync + // and sum first +#ifdef FF_USE_NCCL + ncclDataType_t nccl_data_type = ff_to_nccl_datatype(m->output_type[0]); + checkCUDA(ncclAllReduce(static_cast
<DT const *>(weight.w1_grad_ptr), + static_cast
<DT *>(weight.w1_grad_ptr), + w1_num_elements, + nccl_data_type, + ncclSum, + m->handle.ncclComm, + stream)); +#else + assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); +#endif + sgd_update<<<GET_BLOCKS(w1_num_elements), CUDA_NUM_THREADS, 0, stream>>>(w1_num_elements, + sgd_config->lr, + sgd_config->weight_decay, + sgd_config->momentum, + sgd_config->nesterov, + static_cast
<DT const *>(weight.w1_grad_ptr), + static_cast
<DT *>(weight.w1_v_values_ptr), + static_cast<DT *>
(weight.w1_ptr)); + } else if (typeid(*optimizer_config) == typeid(LoraAdamOptimizerConfig)) { + assert(false && "Adam optimizer type not implemented yet"); + } else { + assert(false && "Unsupported optimizer type"); + } + } + } +} + +} // namespace Internal +} // namespace LoraLinear +} // namespace Kernels +} // namespace FlexFlow diff --git a/src/ops/kernels/pool_2d_kernels.cpp b/src/ops/kernels/pool_2d_kernels.cpp index 8af85612ca..b3f20a35dd 100644 --- a/src/ops/kernels/pool_2d_kernels.cpp +++ b/src/ops/kernels/pool_2d_kernels.cpp @@ -14,11 +14,13 @@ */ #include "flexflow/ops/kernels/pool_2d_kernels.h" +#include "flexflow/ops/pool_2d.h" #include "flexflow/utils/hip_helper.h" namespace FlexFlow { -Pool2DMeta::Pool2DMeta(FFHandler handler) : OpMeta(handler) { +Pool2DMeta::Pool2DMeta(FFHandler handler, Pool2D const *pool) + : OpMeta(handler, pool) { checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); checkCUDNN(miopenCreatePoolingDescriptor(&poolDesc)); diff --git a/src/ops/kernels/pool_2d_kernels.cu b/src/ops/kernels/pool_2d_kernels.cu index b418d20cd3..c236f049ba 100644 --- a/src/ops/kernels/pool_2d_kernels.cu +++ b/src/ops/kernels/pool_2d_kernels.cu @@ -14,11 +14,13 @@ */ #include "flexflow/ops/kernels/pool_2d_kernels.h" +#include "flexflow/ops/pool_2d.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -Pool2DMeta::Pool2DMeta(FFHandler handler) : OpMeta(handler) { +Pool2DMeta::Pool2DMeta(FFHandler handler, Pool2D const *pool) + : OpMeta(handler, pool) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); checkCUDNN(cudnnCreatePoolingDescriptor(&poolDesc)); diff --git a/src/ops/kernels/reshape_kernels.cpp b/src/ops/kernels/reshape_kernels.cpp index b17d95bfea..47f407fd82 100644 --- a/src/ops/kernels/reshape_kernels.cpp +++ b/src/ops/kernels/reshape_kernels.cpp @@ -14,12 +14,14 @@ */ #include "flexflow/ops/kernels/reshape_kernels.h" +#include "flexflow/ops/reshape.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { -ReshapeMeta::ReshapeMeta(FFHandler handler) : OpMeta(handler) {} +ReshapeMeta::ReshapeMeta(FFHandler handler, Reshape const *reshape) + : OpMeta(handler, reshape) {} namespace Kernels { namespace Reshape { diff --git a/src/ops/kernels/reshape_kernels.cu b/src/ops/kernels/reshape_kernels.cu index 9786f63815..0a2b01ae52 100644 --- a/src/ops/kernels/reshape_kernels.cu +++ b/src/ops/kernels/reshape_kernels.cu @@ -14,11 +14,13 @@ */ #include "flexflow/ops/kernels/reshape_kernels.h" +#include "flexflow/ops/reshape.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -ReshapeMeta::ReshapeMeta(FFHandler handler) : OpMeta(handler) {} +ReshapeMeta::ReshapeMeta(FFHandler handler, Reshape const *reshape) + : OpMeta(handler, reshape) {} namespace Kernels { namespace Reshape { diff --git a/src/ops/kernels/residual_rms_norm_kernels.cpp b/src/ops/kernels/residual_rms_norm_kernels.cpp index 6906556452..016364edfd 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cpp +++ b/src/ops/kernels/residual_rms_norm_kernels.cpp @@ -22,18 +22,16 @@ namespace FlexFlow { // declare Legion names using Legion::coord_t; + #define C10_WARP_SIZE 32 -constexpr int kCUDABlockReduceNumThreads = 512; -constexpr int kCUDANumThreads = 256; ResidualRMSNormMeta::ResidualRMSNormMeta(FFHandler handler, ResidualRMSNorm const *rms, MemoryAllocator &gpu_mem_allocator) : OpMeta(handler, rms) { eps = rms->eps; - alpha = 1.0f; - beta = 0.0f; + inplace_residual 
= rms->inplace_residual; in_dim = rms->data_dim; batch_size = rms->effective_batch_size; num_elements = in_dim * batch_size; @@ -47,12 +45,14 @@ ResidualRMSNormMeta::ResidualRMSNormMeta(FFHandler handler, rms_ptr_size * data_type_size(data_type)); norm_ptr = gpu_mem_allocator.allocate_instance_untyped( norm_ptr_size * data_type_size(data_type)); + allocated_peft_buffer_size = 0; } ResidualRMSNormMeta::~ResidualRMSNormMeta(void) { if (reserveInst != Realm::RegionInstance::NO_INST) { reserveInst.destroy(); } } + namespace Kernels { namespace ResidualRMSNorm { @@ -78,7 +78,7 @@ __inline__ __device__ T WarpReduceSum(T val) { } template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { +__inline__ __device__ T BlockReduceSum(T val, T *shared) { int const lid = threadIdx.x % C10_WARP_SIZE; int const wid = threadIdx.x / C10_WARP_SIZE; val = WarpReduceSum(val); @@ -87,9 +87,7 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { shared[wid] = val; } __syncthreads(); - val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) - ? shared[lid] - : T(0); + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? shared[lid] : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -109,18 +107,13 @@ __global__ void ResidualRMSNormFusedForwardKernel(int64_t N, __shared__ float v_shared[C10_WARP_SIZE]; int64_t const i = blockIdx.x; float sum = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { int64_t const index = i * N + j; X_out[index] = X1[index] + X2[index]; sum += (static_cast(X_out[index]) * static_cast(X_out[index])); } - sum = BlockReduceSum( - sum, - v_shared, - min(blockDim.x, - kCUDABlockReduceNumThreads)); // use BlockReduceSum() to sum X_ij^2 + sum = BlockReduceSum(sum, v_shared); if (threadIdx.x == 0) { rms[i] = static_cast(rsqrt((sum / static_cast(N)) + eps)); @@ -128,11 +121,12 @@ __global__ void ResidualRMSNormFusedForwardKernel(int64_t N, __syncthreads(); - using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; - Y[index] = static_cast(X_out[index]) * static_cast(rms[i]); - output[index] = Y[index] * weights[index % N]; + Y[index] = static_cast(static_cast(X_out[index]) * + static_cast(rms[i])); + output[index] = static_cast(static_cast(Y[index]) * + static_cast(weights[index % N])); } } @@ -144,19 +138,10 @@ void forward_kernel(ResidualRMSNormMeta const *m, T *residual_output_ptr, T *output_ptr, hipStream_t stream) { - std::pair kernel1_parallelism = - std::make_pair(m->batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); hipLaunchKernelGGL(HIP_KERNEL_NAME(ResidualRMSNormFusedForwardKernel), - num_blocks, - num_threads, + m->batch_size, + std::min(CUDA_NUM_THREADS, m->in_dim), 0, stream, m->in_dim, @@ -178,7 +163,57 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, GenericTensorAccessorW const &output) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, 
stream)); + } + + assert(input1.data_type == input2.data_type); + assert(output.data_type == input1.data_type); + assert(weight.data_type == output.data_type); + assert(residual_output.data_type == output.data_type); + if (output.data_type == DT_HALF) { + forward_kernel(m, + input1.get_half_ptr(), + input2.get_half_ptr(), + weight.get_half_ptr(), + residual_output.get_half_ptr(), + output.get_half_ptr(), + stream); + } else if (output.data_type == DT_FLOAT) { + forward_kernel(m, + input1.get_float_ptr(), + input2.get_float_ptr(), + weight.get_float_ptr(), + residual_output.get_float_ptr(), + output.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[ResidualRMSNorm] forward time (CF) = %.2fms\n", elapsed); + } +} +void inference_kernel_wrapper(ResidualRMSNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input1, + GenericTensorAccessorR const &input2, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &residual_output, + GenericTensorAccessorW const &output) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); hipEvent_t t_start, t_end; if (m->profiling) { checkCUDA(hipEventCreate(&t_start)); @@ -211,6 +246,67 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, assert(false && "Unsupported data type"); } + // save input activation if needed for PEFT. This must be done after the + // forward kernel since that's where we add the residual + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + residual_output.get_float_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + residual_output.get_half_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * 
num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + if (m->profiling) { checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); @@ -222,6 +318,288 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, } } +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T const *rrms, T *c2) { + __shared__ float ds_storage[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + float ds = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + int const index = i * N + j; + ds += static_cast(dY[index]) * static_cast(X[index]) * + static_cast(gamma[j]); + } + ds = BlockReduceSum(ds, ds_storage); + if (threadIdx.x == 0) { + float const c2_val = + -ds * + (static_cast(rrms[i]) * static_cast(rrms[i]) * + static_cast(rrms[i])) / + static_cast((int)N); + c2[i] = static_cast(c2_val); + } +} + +template +__global__ void RMSNormBackwardCUDAKernel(int64_t N, + T const *dX1_residual, + T const *dY, + T const *X, + T const *gamma, + T const *c1, + T const *c2, + T *dX1, + T *dX2, + bool reset_input_grad1, + bool reset_input_grad2) { + const int64_t i = blockIdx.x; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + float const dX_val = + static_cast(c1[i]) * static_cast(dY[index]) * + static_cast(gamma[j]) + + static_cast(c2[i]) * static_cast(X[index]); + if (reset_input_grad1) { + dX1[index] = static_cast(dX_val); + } else { + dX1[index] = dX1_residual[index] + static_cast(dX_val); + } + if (reset_input_grad2) { + dX2[index] = static_cast(dX1[index]); + } else { + dX2[index] += static_cast(dX1[index]); + } + } +} + +// Assume the batch size will not be very large, direct implementation is the +// most efficient one. 
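+// For reference (a reading of the kernel below, not new logic): with
+// rrms[i] = 1 / sqrt(mean_j(X[i][j]^2) + eps) cached from the forward pass,
+// the weight gradient reduces each column j over the batch dimension,
+//   dg[j] = sum_i dY[i][j] * X[i][j] * rrms[i],
+// so one thread handles one column and loops serially over the M rows.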
+template +__global__ void GammaBackwardCUDAKernel( + int64_t M, int64_t N, T const *dY, T const *X, T const *rrms, T *dg) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T sum1 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dY[index] * X[index] * rrms[i]; + } + dg[j] = sum1; + } +} + +template +void backward_kernel(ResidualRMSNormMeta const *m, + T const *output_grad_ptr, + T const *residual_output_rms_input_ptr, + T *residual_input0_grad_ptr, + T *residual_input1_grad_ptr, + T const *weight_ptr, + T *weight_grad_ptr, + hipStream_t stream) { + int M = m->batch_size; + int N = m->in_dim; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + + RMSNormBackwardCUDAKernel<<>>( + N, + nullptr, + output_grad_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + residual_input0_grad_ptr, + residual_input1_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1]); + + GammaBackwardCUDAKernel<<>>( + M, + N, + output_grad_ptr, + residual_output_rms_input_ptr, + static_cast(m->rms_ptr), + weight_grad_ptr); +} + +template +void peft_bwd_kernel(ResidualRMSNormMeta const *m, + BatchConfig const *bc, + T const *output_grad_0_ptr, + T const *output_grad_1_ptr, + T *input_grad_0_ptr, + T *input_grad_1_ptr, + T const *weight_ptr, + hipStream_t stream) { + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + // Skip PEFT forward-only requests + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + + int M = bc->requestsInfo[i].num_tokens_in_batch; + int N = m->in_dim; + + T const *residual_output_rms_input_ptr = + static_cast(m->input_activation); + + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_1_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + + RMSNormBackwardCUDAKernel + <<>>( + N, + output_grad_0_ptr, + output_grad_1_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + input_grad_0_ptr, + input_grad_1_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1]); + } +} + +/* + regions[0](I): RMS output_grad + regions[1](I): Residual output / RMS input + regions[2](I/O): Residual input 0 grad + regions[3](I/O): Residual input 1 grad + regions[4](I): weight + regions[5](I/O): weight_grad +*/ +void backward_kernel_wrapper( + ResidualRMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &residual_output_rms_input, + GenericTensorAccessorW const &residual_input0_grad, + GenericTensorAccessorW const &residual_input1_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &weight_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + assert(output_grad.data_type == residual_output_rms_input.data_type); + assert(residual_output_rms_input.data_type == residual_input0_grad.data_type); + assert(residual_input0_grad.data_type == residual_input1_grad.data_type); + assert(residual_input1_grad.data_type == weight.data_type); 
+ assert(weight.data_type == weight_grad.data_type); + + if (output_grad.data_type == DT_HALF) { + backward_kernel(m, + output_grad.get_half_ptr(), + residual_output_rms_input.get_half_ptr(), + residual_input0_grad.get_half_ptr(), + residual_input1_grad.get_half_ptr(), + weight.get_half_ptr(), + weight_grad.get_half_ptr(), + stream); + } else if (output_grad.data_type == DT_FLOAT) { + backward_kernel(m, + output_grad.get_float_ptr(), + residual_output_rms_input.get_float_ptr(), + residual_input0_grad.get_float_ptr(), + residual_input1_grad.get_float_ptr(), + weight.get_float_ptr(), + weight_grad.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[ResidualRMSNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +void peft_bwd_kernel_wrapper(ResidualRMSNormMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad_0, + GenericTensorAccessorR const &output_grad_1, + GenericTensorAccessorW const &input_grad_0, + GenericTensorAccessorW const &input_grad_1, + GenericTensorAccessorR const &weight) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + assert(output_grad_1.data_type == input_grad_0.data_type); + assert(input_grad_0.data_type == input_grad_1.data_type); + assert(input_grad_1.data_type == weight.data_type); + + if (output_grad_1.data_type == DT_HALF) { + peft_bwd_kernel(m, + bc, + m->reset_input_grads[0] ? nullptr + : output_grad_0.get_half_ptr(), + output_grad_1.get_half_ptr(), + input_grad_0.get_half_ptr(), + input_grad_1.get_half_ptr(), + weight.get_half_ptr(), + stream); + } else if (output_grad_1.data_type == DT_FLOAT) { + peft_bwd_kernel(m, + bc, + m->reset_input_grads[0] ? 
nullptr + : output_grad_0.get_float_ptr(), + output_grad_1.get_float_ptr(), + input_grad_0.get_float_ptr(), + input_grad_1.get_float_ptr(), + weight.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[ResidualRMSNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + } // namespace ResidualRMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index 17ac14449b..0d44f0260a 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -24,17 +24,14 @@ namespace FlexFlow { using Legion::coord_t; #define C10_WARP_SIZE 32 -constexpr int kCUDABlockReduceNumThreads = 512; -constexpr int kCUDANumThreads = 256; ResidualRMSNormMeta::ResidualRMSNormMeta(FFHandler handler, ResidualRMSNorm const *rms, MemoryAllocator &gpu_mem_allocator) : OpMeta(handler, rms) { eps = rms->eps; - alpha = 1.0f; - beta = 0.0f; + inplace_residual = rms->inplace_residual; in_dim = rms->data_dim; batch_size = rms->effective_batch_size; num_elements = in_dim * batch_size; @@ -48,6 +45,7 @@ ResidualRMSNormMeta::ResidualRMSNormMeta(FFHandler handler, rms_ptr_size * data_type_size(data_type)); norm_ptr = gpu_mem_allocator.allocate_instance_untyped( norm_ptr_size * data_type_size(data_type)); + allocated_peft_buffer_size = 0; } ResidualRMSNormMeta::~ResidualRMSNormMeta(void) { if (reserveInst != Realm::RegionInstance::NO_INST) { @@ -80,7 +78,7 @@ __inline__ __device__ T WarpReduceSum(T val) { } template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { +__inline__ __device__ T BlockReduceSum(T val, T *shared) { int const lid = threadIdx.x % C10_WARP_SIZE; int const wid = threadIdx.x / C10_WARP_SIZE; val = WarpReduceSum(val); @@ -89,9 +87,7 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { shared[wid] = val; } __syncthreads(); - val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) - ? shared[lid] - : T(0); + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? 
shared[lid] : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -111,18 +107,13 @@ __global__ void ResidualRMSNormFusedForwardKernel(int64_t N, __shared__ float v_shared[C10_WARP_SIZE]; int64_t const i = blockIdx.x; float sum = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { int64_t const index = i * N + j; X_out[index] = X1[index] + X2[index]; sum += (static_cast(X_out[index]) * static_cast(X_out[index])); } - sum = BlockReduceSum( - sum, - v_shared, - min(blockDim.x, - kCUDABlockReduceNumThreads)); // use BlockReduceSum() to sum X_ij^2 + sum = BlockReduceSum(sum, v_shared); if (threadIdx.x == 0) { rms[i] = static_cast(rsqrt((sum / static_cast(N)) + eps)); @@ -130,11 +121,12 @@ __global__ void ResidualRMSNormFusedForwardKernel(int64_t N, __syncthreads(); - using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; - Y[index] = static_cast(X_out[index]) * static_cast(rms[i]); - output[index] = Y[index] * weights[index % N]; + Y[index] = static_cast(static_cast(X_out[index]) * + static_cast(rms[i])); + output[index] = static_cast(static_cast(Y[index]) * + static_cast(weights[index % N])); } } @@ -147,26 +139,17 @@ void forward_kernel(ResidualRMSNormMeta const *m, T *output_ptr, cudaStream_t stream) { - std::pair kernel1_parallelism = - std::make_pair(m->batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); - ResidualRMSNormFusedForwardKernel - <<>>(m->in_dim, - m->eps, - input1_ptr, - input2_ptr, - residual_output_ptr, - static_cast(m->rms_ptr), - static_cast(m->norm_ptr), - weight_ptr, - output_ptr); + <<batch_size, std::min(CUDA_NUM_THREADS, m->in_dim), 0, stream>>>( + m->in_dim, + m->eps, + input1_ptr, + input2_ptr, + residual_output_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + weight_ptr, + output_ptr); } void forward_kernel_wrapper(ResidualRMSNormMeta const *m, @@ -219,6 +202,401 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, } } +void inference_kernel_wrapper(ResidualRMSNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input1, + GenericTensorAccessorR const &input2, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &residual_output, + GenericTensorAccessorW const &output) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + assert(input1.data_type == input2.data_type); + assert(output.data_type == input1.data_type); + assert(weight.data_type == output.data_type); + assert(residual_output.data_type == output.data_type); + + if (output.data_type == DT_HALF) { + forward_kernel(m, + input1.get_half_ptr(), + input2.get_half_ptr(), + weight.get_half_ptr(), + residual_output.get_half_ptr(), + output.get_half_ptr(), + stream); + } else if (output.data_type == DT_FLOAT) { + forward_kernel(m, + input1.get_float_ptr(), + input2.get_float_ptr(), + weight.get_float_ptr(), + residual_output.get_float_ptr(), + output.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported 
data type"); + } + + // save input activation if needed for PEFT. This must be done after the + // forward kernel since that's where we add the residual + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + residual_output.get_float_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + residual_output.get_half_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[ResidualRMSNorm] forward time (CF) = %.2fms\n", elapsed); + } +} + +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T const *rrms, T *c2) { + __shared__ float ds_storage[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + float ds = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + int const index = i * N + j; + ds += static_cast(dY[index]) * static_cast(X[index]) * + static_cast(gamma[j]); + } + ds = BlockReduceSum(ds, ds_storage); + if (threadIdx.x == 0) { + float const c2_val = + -ds * + (static_cast(rrms[i]) * static_cast(rrms[i]) * + static_cast(rrms[i])) / + static_cast((int)N); + c2[i] = static_cast(c2_val); + } +} + +template +__global__ void RMSNormBackwardCUDAKernel(int64_t N, + T const *dX1_residual, + T const *dY, + T const *X, + T const *gamma, + T const *c1, + T const *c2, + T *dX1, + T *dX2, + bool reset_input_grad1, + bool reset_input_grad2) { + const int64_t i = blockIdx.x; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + float const dX_val = + static_cast(c1[i]) * static_cast(dY[index]) * + static_cast(gamma[j]) + + 
static_cast(c2[i]) * static_cast(X[index]); + if (reset_input_grad1) { + dX1[index] = static_cast(dX_val); + } else { + dX1[index] = dX1_residual[index] + static_cast(dX_val); + } + if (reset_input_grad2) { + dX2[index] = static_cast(dX1[index]); + } else { + dX2[index] += static_cast(dX1[index]); + } + } +} + +// Assume the batch size will not be very large, direct implementation is the +// most efficient one. +template +__global__ void GammaBackwardCUDAKernel( + int64_t M, int64_t N, T const *dY, T const *X, T const *rrms, T *dg) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T sum1 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dY[index] * X[index] * rrms[i]; + } + dg[j] = sum1; + } +} + +template +void backward_kernel(ResidualRMSNormMeta const *m, + T const *output_grad_ptr, + T const *residual_output_rms_input_ptr, + T *residual_input0_grad_ptr, + T *residual_input1_grad_ptr, + T const *weight_ptr, + T *weight_grad_ptr, + cudaStream_t stream) { + int M = m->batch_size; + int N = m->in_dim; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + + RMSNormBackwardCUDAKernel<<>>( + N, + nullptr, + output_grad_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + residual_input0_grad_ptr, + residual_input1_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1]); + + GammaBackwardCUDAKernel<<>>( + M, + N, + output_grad_ptr, + residual_output_rms_input_ptr, + static_cast(m->rms_ptr), + weight_grad_ptr); +} + +template +void peft_bwd_kernel(ResidualRMSNormMeta const *m, + BatchConfig const *bc, + T const *output_grad_0_ptr, + T const *output_grad_1_ptr, + T *input_grad_0_ptr, + T *input_grad_1_ptr, + T const *weight_ptr, + cudaStream_t stream) { + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + // Skip PEFT forward-only requests + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + + int M = bc->requestsInfo[i].num_tokens_in_batch; + int N = m->in_dim; + + T const *residual_output_rms_input_ptr = + static_cast(m->input_activation); + + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_1_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + + RMSNormBackwardCUDAKernel + <<>>( + N, + output_grad_0_ptr, + output_grad_1_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + input_grad_0_ptr, + input_grad_1_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1]); + } +} + +/* + regions[0](I): RMS output_grad + regions[1](I): Residual output / RMS input + regions[2](I/O): Residual input 0 grad + regions[3](I/O): Residual input 1 grad + regions[4](I): weight + regions[5](I/O): weight_grad +*/ +void backward_kernel_wrapper( + ResidualRMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &residual_output_rms_input, + GenericTensorAccessorW const &residual_input0_grad, + GenericTensorAccessorW const &residual_input1_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &weight_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) 
{ + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + assert(output_grad.data_type == residual_output_rms_input.data_type); + assert(residual_output_rms_input.data_type == residual_input0_grad.data_type); + assert(residual_input0_grad.data_type == residual_input1_grad.data_type); + assert(residual_input1_grad.data_type == weight.data_type); + assert(weight.data_type == weight_grad.data_type); + + if (output_grad.data_type == DT_HALF) { + backward_kernel(m, + output_grad.get_half_ptr(), + residual_output_rms_input.get_half_ptr(), + residual_input0_grad.get_half_ptr(), + residual_input1_grad.get_half_ptr(), + weight.get_half_ptr(), + weight_grad.get_half_ptr(), + stream); + } else if (output_grad.data_type == DT_FLOAT) { + backward_kernel(m, + output_grad.get_float_ptr(), + residual_output_rms_input.get_float_ptr(), + residual_input0_grad.get_float_ptr(), + residual_input1_grad.get_float_ptr(), + weight.get_float_ptr(), + weight_grad.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[ResidualRMSNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +void peft_bwd_kernel_wrapper(ResidualRMSNormMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad_0, + GenericTensorAccessorR const &output_grad_1, + GenericTensorAccessorW const &input_grad_0, + GenericTensorAccessorW const &input_grad_1, + GenericTensorAccessorR const &weight) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + assert(output_grad_1.data_type == input_grad_0.data_type); + assert(input_grad_0.data_type == input_grad_1.data_type); + assert(input_grad_1.data_type == weight.data_type); + + if (output_grad_1.data_type == DT_HALF) { + peft_bwd_kernel(m, + bc, + m->reset_input_grads[0] ? nullptr + : output_grad_0.get_half_ptr(), + output_grad_1.get_half_ptr(), + input_grad_0.get_half_ptr(), + input_grad_1.get_half_ptr(), + weight.get_half_ptr(), + stream); + } else if (output_grad_1.data_type == DT_FLOAT) { + peft_bwd_kernel(m, + bc, + m->reset_input_grads[0] ? 
nullptr + : output_grad_0.get_float_ptr(), + output_grad_1.get_float_ptr(), + input_grad_0.get_float_ptr(), + input_grad_1.get_float_ptr(), + weight.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[ResidualRMSNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + } // namespace ResidualRMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/src/ops/kernels/rms_norm_kernels.cpp b/src/ops/kernels/rms_norm_kernels.cpp index 24ab7051e6..4158628005 100644 --- a/src/ops/kernels/rms_norm_kernels.cpp +++ b/src/ops/kernels/rms_norm_kernels.cpp @@ -23,16 +23,12 @@ namespace FlexFlow { // declare Legion names using Legion::coord_t; #define C10_WARP_SIZE 32 -constexpr int kCUDABlockReduceNumThreads = 512; -constexpr int kCUDANumThreads = 256; RMSNormMeta::RMSNormMeta(FFHandler handler, RMSNorm const *rms, MemoryAllocator &gpu_mem_allocator) : OpMeta(handler, rms) { eps = rms->eps; - alpha = 1.0f; - beta = 0.0f; in_dim = rms->data_dim; batch_size = rms->effective_batch_size; @@ -47,12 +43,14 @@ RMSNormMeta::RMSNormMeta(FFHandler handler, rms_ptr_size * data_type_size(data_type)); norm_ptr = gpu_mem_allocator.allocate_instance_untyped( norm_ptr_size * data_type_size(data_type)); + allocated_peft_buffer_size = 0; } RMSNormMeta::~RMSNormMeta(void) { if (reserveInst != Realm::RegionInstance::NO_INST) { reserveInst.destroy(); } } + namespace Kernels { namespace RMSNorm { @@ -78,7 +76,7 @@ __inline__ __device__ T WarpReduceSum(T val) { } template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { +__inline__ __device__ T BlockReduceSum(T val, T *shared) { int const lid = threadIdx.x % C10_WARP_SIZE; int const wid = threadIdx.x / C10_WARP_SIZE; val = WarpReduceSum(val); @@ -87,9 +85,7 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { shared[wid] = val; } __syncthreads(); - val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) - ? shared[lid] - : T(0); + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? 
shared[lid] : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -107,16 +103,11 @@ __global__ void RMSNormFusedForwardKernel(int64_t N, __shared__ float v_shared[C10_WARP_SIZE]; int64_t const i = blockIdx.x; float sum = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { int64_t const index = i * N + j; sum += (static_cast(X[index]) * static_cast(X[index])); } - sum = BlockReduceSum( - sum, - v_shared, - min(blockDim.x, - kCUDABlockReduceNumThreads)); // use BlockReduceSum() to sum X_ij^2 + sum = BlockReduceSum(sum, v_shared); if (threadIdx.x == 0) { rms[i] = static_cast(rsqrt((sum / static_cast(N)) + eps)); @@ -124,10 +115,9 @@ __global__ void RMSNormFusedForwardKernel(int64_t N, __syncthreads(); - using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; - Y[index] = static_cast(X[index]) * static_cast(rms[i]); + Y[index] = static_cast(X[index]) * static_cast(rms[i]); output[index] = Y[index] * weights[index % N]; } } @@ -138,19 +128,10 @@ void forward_kernel(RMSNormMeta const *m, T const *weight_ptr, T *output_ptr, hipStream_t stream) { - std::pair kernel1_parallelism = - std::make_pair(m->batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); hipLaunchKernelGGL(HIP_KERNEL_NAME(RMSNormFusedForwardKernel), - num_blocks, - num_threads, + m->batch_size, + std::min(CUDA_NUM_THREADS, m->in_dim), 0, stream, m->in_dim, @@ -204,6 +185,363 @@ void forward_kernel_wrapper(RMSNormMeta const *m, } } +void inference_kernel_wrapper(RMSNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &output) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + assert(output.data_type == input.data_type); + assert(weight.data_type == output.data_type); + + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if 
(activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + + if (input.data_type == DT_FLOAT) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + input.get_float_ptr() + first_token_offset * in_dim, + data_type_size(input.data_type) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else if (input.data_type == DT_HALF) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + input.get_half_ptr() + first_token_offset * in_dim, + data_type_size(input.data_type) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + if (output.data_type == DT_HALF) { + forward_kernel(m, + input.get_half_ptr(), + weight.get_half_ptr(), + output.get_half_ptr(), + stream); + } else if (output.data_type == DT_FLOAT) { + forward_kernel(m, + input.get_float_ptr(), + weight.get_float_ptr(), + output.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[RMSNorm] forward time (CF) = %.2fms\n", elapsed); + } +} + +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T const *rrms, T *c2) { + __shared__ T ds_storage[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + float ds = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + int const index = i * N + j; + ds += static_cast(dY[index]) * static_cast(X[index]) * + static_cast(gamma[j]); + } + ds = BlockReduceSum(ds, ds_storage); + if (threadIdx.x == 0) { + float const c2_val = + -ds * + (static_cast(rrms[i]) * static_cast(rrms[i]) * + static_cast(rrms[i])) / + static_cast((int)N); + c2[i] = static_cast(c2_val); + } +} + +template +__global__ void RMSNormBackwardCUDAKernel(int64_t N, + T const *dY, + T const *X, + T const *gamma, + T const *c1, + T const *c2, + T *dX, + bool reset_input_grad) { + const int64_t i = blockIdx.x; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + float const dX_val = + static_cast(c1[i]) * static_cast(dY[index]) * + static_cast(gamma[j]) + + static_cast(c2[i]) * static_cast(X[index]); + if (reset_input_grad) { + dX[index] = dX_val; + } else { + dX[index] += dX_val; + } + } +} + +// Assume the batch size will not be very large, direct implementation is the +// most efficient one. 
+template +__global__ void GammaBackwardCUDAKernel( + int64_t M, int64_t N, T const *dY, T const *X, T const *rrms, T *dg) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T sum1 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dY[index] * X[index] * rrms[i]; + } + dg[j] = sum1; + } +} + +template +void backward_kernel(RMSNormMeta const *m, + T const *output_grad_ptr, + T const *input_ptr, + T *input_grad_ptr, + T const *weight_ptr, + T *weight_grad_ptr, + hipStream_t stream) { + int M = m->batch_size; + int N = m->in_dim; + hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeInternalGradientsCUDAKernel), + M, + std::min(N, CUDA_NUM_THREADS), + 0, + stream, + N, + output_grad_ptr, + input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(RMSNormBackwardCUDAKernel), + M, + std::min(N, CUDA_NUM_THREADS), + 0, + stream, + m->in_dim, + output_grad_ptr, + input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + input_grad_ptr, + m->reset_input_grads[0]); + hipLaunchKernelGGL(HIP_KERNEL_NAME(GammaBackwardCUDAKernel), + M, + std::min(N, CUDA_NUM_THREADS), + 0, + stream, + M, + N, + output_grad_ptr, + input_ptr, + static_cast(m->rms_ptr), + weight_grad_ptr); +} + +void backward_kernel_wrapper(RMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &weight_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + assert(input_grad.data_type == input.data_type); + assert(weight_grad.data_type == weight.data_type); + assert(output_grad.data_type == input.data_type); + assert(weight.data_type == output_grad.data_type); + + if (output_grad.data_type == DT_HALF) { + backward_kernel(m, + output_grad.get_half_ptr(), + input.get_half_ptr(), + input_grad.get_half_ptr(), + weight.get_half_ptr(), + weight_grad.get_half_ptr(), + stream); + } else if (output_grad.data_type == DT_FLOAT) { + backward_kernel(m, + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), + weight.get_float_ptr(), + weight_grad.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[RMSNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +template +void peft_bwd_kernel(RMSNormMeta const *m, + BatchConfig const *bc, + T const *output_grad_ptr, + T *input_grad_ptr, + T const *weight_ptr, + hipStream_t stream) { + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + // Skip PEFT forward-only requests + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + + int M = bc->requestsInfo[i].num_tokens_in_batch; + int N = m->num_elements; + hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeInternalGradientsCUDAKernel), + M, + std::min(N, CUDA_NUM_THREADS), + 0, + 
stream, + N, + output_grad_ptr, + static_cast(m->input_activation), + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + hipLaunchKernelGGL(HIP_KERNEL_NAME(RMSNormBackwardCUDAKernel), + M, + std::min(N, CUDA_NUM_THREADS), + 0, + stream, + m->in_dim, + output_grad_ptr, + static_cast(m->input_activation), + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + input_grad_ptr, + m->reset_input_grads[0]); + } +} + +void peft_bwd_kernel_wrapper(RMSNormMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + assert(input_grad.data_type == output_grad.data_type); + assert(output_grad.data_type == weight.data_type); + + if (output_grad.data_type == DT_HALF) { + peft_bwd_kernel(m, + bc, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + weight.get_half_ptr(), + stream); + } else if (output_grad.data_type == DT_FLOAT) { + peft_bwd_kernel(m, + bc, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + weight.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[RMSNorm] peft_bwd time (CF) = %.2fms\n", elapsed); + } +} + } // namespace RMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index 7c9f4a9f98..dd6ada864d 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -24,16 +24,12 @@ namespace FlexFlow { using Legion::coord_t; #define C10_WARP_SIZE 32 -constexpr int kCUDABlockReduceNumThreads = 512; -constexpr int kCUDANumThreads = 256; RMSNormMeta::RMSNormMeta(FFHandler handler, RMSNorm const *rms, MemoryAllocator &gpu_mem_allocator) : OpMeta(handler, rms) { eps = rms->eps; - alpha = 1.0f; - beta = 0.0f; in_dim = rms->data_dim; batch_size = rms->effective_batch_size; @@ -48,6 +44,7 @@ RMSNormMeta::RMSNormMeta(FFHandler handler, rms_ptr_size * data_type_size(data_type)); norm_ptr = gpu_mem_allocator.allocate_instance_untyped( norm_ptr_size * data_type_size(data_type)); + allocated_peft_buffer_size = 0; } RMSNormMeta::~RMSNormMeta(void) { if (reserveInst != Realm::RegionInstance::NO_INST) { @@ -96,66 +93,6 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { return val; } -template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { - int const lid = threadIdx.x % C10_WARP_SIZE; - int const wid = threadIdx.x / C10_WARP_SIZE; - val = WarpReduceSum(val); - __syncthreads(); - if (lid == 0) { - shared[wid] = val; - } - __syncthreads(); - val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) - ? 
shared[lid] - : T(0); - if (wid == 0) { - val = WarpReduceSum(val); - } - return val; -} - -#ifdef DEADCODE -template -__global__ void - RowwiseRootMeanSquareKernel(long long N, float eps, T const *X, T *rms) { - __shared__ float v_shared[C10_WARP_SIZE]; - long long const i = blockIdx.x; - float sum = 0.0f; - for (long long j = threadIdx.x; j < N; j += blockDim.x) { - long long const index = i * N + j; - sum += (static_cast(X[index]) * static_cast(X[index])); - } - sum = BlockReduceSum(sum, - v_shared); // use BlockReduceSum() to sum X_ij^2 - - if (threadIdx.x == 0) { - rms[i] = static_cast(rsqrt((sum / static_cast(N)) + eps)); - } -} - -template -__global__ void NormKernel(int64_t N, T const *X, T const *rstd, T *Y) { - using T_ACC = T; - const int64_t i = blockIdx.x; - for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; - Y[index] = static_cast(X[index]) * static_cast(rstd[i]); - } -} - -template -__global__ void elewise_apply_weights(int64_t batch_size, - int64_t in_dim, - T const *norm, - T const *weights, - T *output) { - CUDA_KERNEL_LOOP(i, batch_size * in_dim) { - output[i] = norm[i] * weights[i % in_dim]; - } -} -#endif - template __global__ void RMSNormFusedForwardKernel(int64_t N, float eps, @@ -167,16 +104,11 @@ __global__ void RMSNormFusedForwardKernel(int64_t N, __shared__ float v_shared[C10_WARP_SIZE]; int64_t const i = blockIdx.x; float sum = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { int64_t const index = i * N + j; sum += (static_cast(X[index]) * static_cast(X[index])); } - sum = BlockReduceSum( - sum, - v_shared, - min(blockDim.x, - kCUDABlockReduceNumThreads)); // use BlockReduceSum() to sum X_ij^2 + sum = BlockReduceSum(sum, v_shared); if (threadIdx.x == 0) { rms[i] = static_cast(rsqrt((sum / static_cast(N)) + eps)); @@ -184,10 +116,9 @@ __global__ void RMSNormFusedForwardKernel(int64_t N, __syncthreads(); - using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; - Y[index] = static_cast(X[index]) * static_cast(rms[i]); + Y[index] = static_cast(X[index]) * static_cast(rms[i]); output[index] = Y[index] * weights[index % N]; } } @@ -199,24 +130,15 @@ void forward_kernel(RMSNormMeta const *m, T *output_ptr, cudaStream_t stream) { - std::pair kernel1_parallelism = - std::make_pair(m->batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); - RMSNormFusedForwardKernel - <<>>(m->in_dim, - m->eps, - input_ptr, - static_cast(m->rms_ptr), - static_cast(m->norm_ptr), - weight_ptr, - output_ptr); + <<batch_size, std::min(CUDA_NUM_THREADS, m->in_dim), 0, stream>>>( + m->in_dim, + m->eps, + input_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + weight_ptr, + output_ptr); } void forward_kernel_wrapper(RMSNormMeta const *m, @@ -261,6 +183,346 @@ void forward_kernel_wrapper(RMSNormMeta const *m, } } +void inference_kernel_wrapper(RMSNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &output) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + 
cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + assert(output.data_type == input.data_type); + assert(weight.data_type == output.data_type); + + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + + if (input.data_type == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + input.get_float_ptr() + first_token_offset * in_dim, + data_type_size(input.data_type) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (input.data_type == DT_HALF) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + input.get_half_ptr() + first_token_offset * in_dim, + data_type_size(input.data_type) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + if (output.data_type == DT_HALF) { + forward_kernel(m, + input.get_half_ptr(), + weight.get_half_ptr(), + output.get_half_ptr(), + stream); + } else if (output.data_type == DT_FLOAT) { + forward_kernel(m, + input.get_float_ptr(), + weight.get_float_ptr(), + output.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[RMSNorm] forward time (CF) = %.2fms\n", elapsed); + } +} + +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T const *rrms, T *c2) { + __shared__ T ds_storage[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + float ds = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + int const index = i * N + j; + ds += static_cast(dY[index]) * static_cast(X[index]) * + static_cast(gamma[j]); + } + ds = BlockReduceSum(ds, ds_storage); + if (threadIdx.x == 0) { + float const c2_val = + -ds * + (static_cast(rrms[i]) * static_cast(rrms[i]) * + static_cast(rrms[i])) / + static_cast((int)N); + c2[i] = static_cast(c2_val); + } +} + +template +__global__ void 
RMSNormBackwardCUDAKernel(int64_t N, + T const *dY, + T const *X, + T const *gamma, + T const *c1, + T const *c2, + T *dX, + bool reset_input_grad) { + const int64_t i = blockIdx.x; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + float const dX_val = + static_cast(c1[i]) * static_cast(dY[index]) * + static_cast(gamma[j]) + + static_cast(c2[i]) * static_cast(X[index]); + if (reset_input_grad) { + dX[index] = dX_val; + } else { + dX[index] += dX_val; + } + } +} + +// Assume the batch size will not be very large, direct implementation is the +// most efficient one. +template +__global__ void GammaBackwardCUDAKernel( + int64_t M, int64_t N, T const *dY, T const *X, T const *rrms, T *dg) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T sum1 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dY[index] * X[index] * rrms[i]; + } + dg[j] = sum1; + } +} + +template +void backward_kernel(RMSNormMeta const *m, + T const *output_grad_ptr, + T const *input_ptr, + T *input_grad_ptr, + T const *weight_ptr, + T *weight_grad_ptr, + cudaStream_t stream) { + int M = m->batch_size; + int N = m->in_dim; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + + RMSNormBackwardCUDAKernel<<>>( + m->in_dim, + output_grad_ptr, + input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + input_grad_ptr, + m->reset_input_grads[0]); + GammaBackwardCUDAKernel<<>>( + M, + N, + output_grad_ptr, + input_ptr, + static_cast(m->rms_ptr), + weight_grad_ptr); +} + +void backward_kernel_wrapper(RMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &weight_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + assert(input_grad.data_type == input.data_type); + assert(weight_grad.data_type == weight.data_type); + assert(output_grad.data_type == input.data_type); + assert(weight.data_type == output_grad.data_type); + + if (output_grad.data_type == DT_HALF) { + backward_kernel(m, + output_grad.get_half_ptr(), + input.get_half_ptr(), + input_grad.get_half_ptr(), + weight.get_half_ptr(), + weight_grad.get_half_ptr(), + stream); + } else if (output_grad.data_type == DT_FLOAT) { + backward_kernel(m, + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), + weight.get_float_ptr(), + weight_grad.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[RMSNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +template +void peft_bwd_kernel(RMSNormMeta const *m, + BatchConfig const *bc, + T const *output_grad_ptr, + T *input_grad_ptr, + T const *weight_ptr, + cudaStream_t stream) { + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) 
{ + continue; + } + // Skip PEFT forward-only requests + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + + int M = bc->requestsInfo[i].num_tokens_in_batch; + int N = m->num_elements; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + static_cast(m->input_activation), + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + RMSNormBackwardCUDAKernel + <<>>( + m->in_dim, + output_grad_ptr, + static_cast(m->input_activation), + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + input_grad_ptr, + m->reset_input_grads[0]); + } +} + +void peft_bwd_kernel_wrapper(RMSNormMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + assert(input_grad.data_type == output_grad.data_type); + assert(output_grad.data_type == weight.data_type); + + if (output_grad.data_type == DT_HALF) { + peft_bwd_kernel(m, + bc, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + weight.get_half_ptr(), + stream); + } else if (output_grad.data_type == DT_FLOAT) { + peft_bwd_kernel(m, + bc, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + weight.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[RMSNorm] peft_bwd time (CF) = %.2fms\n", elapsed); + } +} + } // namespace RMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/src/ops/kernels/softmax.cpp b/src/ops/kernels/softmax.cpp index 89c9f14a01..fa31c5adff 100644 --- a/src/ops/kernels/softmax.cpp +++ b/src/ops/kernels/softmax.cpp @@ -25,13 +25,13 @@ using Legion::Domain; SoftmaxMeta::SoftmaxMeta(FFHandler handler, Softmax const *softmax, Domain const &input_domain) - : OpMeta(handler) { + : OpMeta(handler, softmax) { checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); - checkCUDNN( - cudnnSetTensorDescriptorFromDomain4SoftMax(inputTensor, input_domain)); + checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( + inputTensor, input_domain, softmax->data_type)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); - checkCUDNN( - cudnnSetTensorDescriptorFromDomain4SoftMax(outputTensor, input_domain)); + checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( + outputTensor, input_domain, softmax->data_type)); dim = softmax->dim; profiling = softmax->profiling; inference_debugging = softmax->inference_debugging; @@ -41,20 +41,26 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, namespace Kernels { namespace Softmax { -template void forward_kernel_wrapper(SoftmaxMeta const *m, - DT const *input_ptr, - DT *output_ptr) { + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - hipEvent_t t_start, t_end; if (m->profiling) { checkCUDA(hipEventCreate(&t_start)); checkCUDA(hipEventCreate(&t_end)); checkCUDA(hipEventRecord(t_start, stream)); } - Internal::forward_kernel(m, input_ptr, output_ptr, stream); + if (m->output_type[0] == DT_FLOAT) { + Internal::forward_kernel( + m, input.get_float_ptr(), 
output.get_float_ptr(), stream); + } else if (m->output_type[0] == DT_HALF) { + Internal::forward_kernel( + m, input.get_half_ptr(), output.get_half_ptr(), stream); + } else { + assert(false && "Unsupported data type"); + } if (m->profiling) { checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); @@ -70,11 +76,9 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, } } -template void backward_kernel_wrapper(SoftmaxMeta const *m, - DT *input_grad_ptr, - DT const *output_grad_ptr, - size_t num_elements) { + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -84,8 +88,22 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, checkCUDA(hipEventCreate(&t_end)); checkCUDA(hipEventRecord(t_start, stream)); } - Internal::backward_kernel( - input_grad_ptr, output_grad_ptr, num_elements, stream); + assert(input_grad.domain == output_grad.domain); + if (m->output_type[0] == DT_FLOAT) { + Internal::backward_kernel(m, + input_grad.get_float_ptr(), + output_grad.get_float_ptr(), + output_grad.domain.get_volume(), + stream); + } else if (m->output_type[0] == DT_HALF) { + Internal::backward_kernel(m, + input_grad.get_half_ptr(), + output_grad.get_half_ptr(), + output_grad.domain.get_volume(), + stream); + } else { + assert(false && "Unsupported data type"); + } if (m->profiling) { checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); @@ -101,21 +119,112 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, } } -template void forward_kernel_wrapper(SoftmaxMeta const *m, - float const *input_ptr, - float *output_ptr); -template void forward_kernel_wrapper(SoftmaxMeta const *m, - half const *input_ptr, - half *output_ptr); - -template void backward_kernel_wrapper(SoftmaxMeta const *m, - float *input_grad_ptr, - float const *output_grad_ptr, - size_t num_elements); -template void backward_kernel_wrapper(SoftmaxMeta const *m, - half *input_grad_ptr, - half const *output_grad_ptr, - size_t num_elements); +void inference_kernel_wrapper(SoftmaxMeta const *m, + BatchConfig const *bc, + bool is_last_op, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorW const &output_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + int num_classes = output.domain.hi()[0] - output.domain.lo()[0] + 1; + if (m->output_type[0] == DT_FLOAT) { + Internal::inference_kernel(m, + bc, + input.get_float_ptr(), + output.get_float_ptr(), + num_classes, + stream); + if (is_last_op) { + checkCUDA(hipMemcpyAsync(output_grad.get_float_ptr(), + output.get_float_ptr(), + output.domain.get_volume() * sizeof(float), + hipMemcpyDeviceToDevice, + stream)); + } + } else if (m->output_type[0] == DT_HALF) { + Internal::inference_kernel(m, + bc, + input.get_half_ptr(), + output.get_half_ptr(), + num_classes, + stream); + if (is_last_op) { + checkCUDA(hipMemcpyAsync(output_grad.get_half_ptr(), + output.get_half_ptr(), + output.domain.get_volume() * sizeof(half), + hipMemcpyDeviceToDevice, + stream)); + } + } else { + assert(false && "Unsupported data type"); + } + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + // print_tensor(acc_input.ptr, acc_input.rect.volume(), + // "[Softmax:forward:input]"); 
print_tensor(acc_output.ptr, + // acc_output.rect.volume(), "[Softmax:forward:output]"); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + log_measure.debug( + "%s [Softmax] inference time = %.2fms\n", m->op_name, elapsed); + } +} + +void peft_bwd_kernel_wrapper(SoftmaxMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + int num_classes = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + if (m->output_type[0] == DT_FLOAT) { + Internal::peft_bwd_kernel(m, + bc, + input_grad.get_float_ptr(), + output_grad.get_float_ptr(), + num_classes, + stream); + } else if (m->output_type[0] == DT_HALF) { + Internal::peft_bwd_kernel(m, + bc, + input_grad.get_half_ptr(), + output_grad.get_half_ptr(), + num_classes, + stream); + } else { + assert(false && "Unsupported data type"); + } + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + // print_tensor(acc_input.ptr, acc_input.rect.volume(), + // "[Softmax:forward:input]"); print_tensor(acc_output.ptr, + // acc_output.rect.volume(), "[Softmax:forward:output]"); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + log_measure.debug( + "%s [Softmax] inference time = %.2fms\n", m->op_name, elapsed); + } +} namespace Internal { template @@ -138,7 +247,8 @@ void forward_kernel(SoftmaxMeta const *m, } template -void backward_kernel(DT *input_grad_ptr, +void backward_kernel(SoftmaxMeta const *m, + DT *input_grad_ptr, DT const *output_grad_ptr, size_t num_elements, hipStream_t stream) { @@ -149,6 +259,116 @@ void backward_kernel(DT *input_grad_ptr, stream)); } +template +void inference_kernel(SoftmaxMeta const *m, + BatchConfig const *bc, + DT const *input_ptr, + DT *output_ptr, + int num_classes, + hipStream_t stream) { + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + + float alpha = 1.0f, beta = 0.0f; + miopenDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); + checkCUDNN(miopenSet4dTensorDescriptor(m->outputTensor, + cudnn_data_type, + bc->num_active_tokens(), + num_classes, + 1, + 1)); + checkCUDNN(miopenSoftmaxForward_V2(m->handle.dnn, + &alpha, + m->outputTensor, + input_ptr, + &beta, + m->outputTensor, + output_ptr, + MIOPEN_SOFTMAX_ACCURATE, + MIOPEN_SOFTMAX_MODE_CHANNEL)); +} + +template +__global__ void sparse_categorical_crossentropy_loss_peft_backward( + DT *input_grad, + DT const *output_grad, + BatchConfig::TokenId const *token_ids, + int num_tokens, + int num_classes) { + CUDA_KERNEL_LOOP(i, num_tokens * num_classes) { + int class_idx = i % num_classes; + int token_idx = i / num_classes; + input_grad[i] = output_grad[i]; + if (class_idx == token_ids[token_idx]) { + input_grad[i] = input_grad[i] - (DT)1.0f; + } + } +} + +template +void peft_bwd_kernel(SoftmaxMeta const *m, + BatchConfig const *bc, + DT *input_grad_ptr, + DT const *output_grad_ptr, + int num_classes, + hipStream_t stream) { + BatchConfig::TokenId token_ids[BatchConfig::MAX_NUM_TOKENS]; + int tokens_previous_requests = 0; + for (int i = 0; i < 
bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (!bc->requestsInfo[i].peft_bwd) { + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_bwd_tokens = bc->requestsInfo[i].num_tokens_in_batch - 1; + // shift labels by 1 position to the left (ignore first token label) + for (int j = 0; j < num_bwd_tokens; j++) { + token_ids[j] = bc->tokensInfo[j + tokens_previous_requests + 1].token_id; + } + + DT scale_factor = 1.0 / (bc->requestsInfo[i].num_tokens_in_batch - 1); + // ignore last token + checkCUDA(hipMemsetAsync(input_grad_ptr + + (tokens_previous_requests + + bc->requestsInfo[i].num_tokens_in_batch - 1) * + num_classes, + 0, + num_classes * sizeof(DT), + stream)); + checkCUDA(hipMemcpyAsync(m->handle.workSpace, + token_ids, + sizeof(BatchConfig::TokenId) * num_bwd_tokens, + hipMemcpyHostToDevice, + stream)); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(sparse_categorical_crossentropy_loss_peft_backward
), + GET_BLOCKS(num_bwd_tokens * num_classes), + CUDA_NUM_THREADS, + 0, + stream, + input_grad_ptr + tokens_previous_requests * num_classes, + output_grad_ptr + tokens_previous_requests * num_classes, + static_cast(m->handle.workSpace), + num_bwd_tokens, + num_classes); + // scale + hipLaunchKernelGGL(HIP_KERNEL_NAME(scale_kernel
), + GET_BLOCKS(num_bwd_tokens * num_classes), + CUDA_NUM_THREADS, + 0, + stream, + input_grad_ptr + tokens_previous_requests * num_classes, + num_bwd_tokens * num_classes, + DT(0.0), + scale_factor); + + tokens_previous_requests += num_bwd_tokens + 1; + } + assert(tokens_previous_requests == bc->num_active_tokens()); +} + } // namespace Internal } // namespace Softmax } // namespace Kernels diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index e47006cc9d..16f1219bf6 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -24,7 +24,7 @@ using Legion::Domain; SoftmaxMeta::SoftmaxMeta(FFHandler handler, Softmax const *softmax, Domain const &input_domain) - : OpMeta(handler) { + : OpMeta(handler, softmax) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( inputTensor, input_domain, softmax->data_type)); @@ -40,10 +40,9 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, namespace Kernels { namespace Softmax { -template void forward_kernel_wrapper(SoftmaxMeta const *m, - DT const *input_ptr, - DT *output_ptr) { + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); cudaEvent_t t_start, t_end; @@ -52,7 +51,15 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } - Internal::forward_kernel(m, input_ptr, output_ptr, stream); + if (m->output_type[0] == DT_FLOAT) { + Internal::forward_kernel( + m, input.get_float_ptr(), output.get_float_ptr(), stream); + } else if (m->output_type[0] == DT_HALF) { + Internal::forward_kernel( + m, input.get_half_ptr(), output.get_half_ptr(), stream); + } else { + assert(false && "Unsupported data type"); + } if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -68,11 +75,9 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, } } -template void backward_kernel_wrapper(SoftmaxMeta const *m, - DT *input_grad_ptr, - DT const *output_grad_ptr, - size_t num_elements) { + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -82,8 +87,22 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } - Internal::backward_kernel( - input_grad_ptr, output_grad_ptr, num_elements, stream); + assert(input_grad.domain == output_grad.domain); + if (m->output_type[0] == DT_FLOAT) { + Internal::backward_kernel(m, + input_grad.get_float_ptr(), + output_grad.get_float_ptr(), + output_grad.domain.get_volume(), + stream); + } else if (m->output_type[0] == DT_HALF) { + Internal::backward_kernel(m, + input_grad.get_half_ptr(), + output_grad.get_half_ptr(), + output_grad.domain.get_volume(), + stream); + } else { + assert(false && "Unsupported data type"); + } if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -99,21 +118,113 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, } } -template void forward_kernel_wrapper(SoftmaxMeta const *m, - float const *input_ptr, - float *output_ptr); -template void forward_kernel_wrapper(SoftmaxMeta const *m, - half const *input_ptr, - half *output_ptr); - -template void backward_kernel_wrapper(SoftmaxMeta const *m, - float *input_grad_ptr, - float const *output_grad_ptr, - size_t num_elements); -template void backward_kernel_wrapper(SoftmaxMeta const *m, - half *input_grad_ptr, - 
half const *output_grad_ptr, - size_t num_elements); +void inference_kernel_wrapper(SoftmaxMeta const *m, + BatchConfig const *bc, + bool is_last_op, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorW const &output_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + int num_classes = output.domain.hi()[0] - output.domain.lo()[0] + 1; + if (m->output_type[0] == DT_FLOAT) { + Internal::inference_kernel(m, + bc, + input.get_float_ptr(), + output.get_float_ptr(), + num_classes, + stream); + if (is_last_op) { + checkCUDA(cudaMemcpyAsync(output_grad.get_float_ptr(), + output.get_float_ptr(), + output.domain.get_volume() * sizeof(float), + cudaMemcpyDeviceToDevice, + stream)); + } + } else if (m->output_type[0] == DT_HALF) { + Internal::inference_kernel(m, + bc, + input.get_half_ptr(), + output.get_half_ptr(), + num_classes, + stream); + if (is_last_op) { + checkCUDA(cudaMemcpyAsync(output_grad.get_half_ptr(), + output.get_half_ptr(), + output.domain.get_volume() * sizeof(half), + cudaMemcpyDeviceToDevice, + stream)); + } + } else { + assert(false && "Unsupported data type"); + } + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + // print_tensor(acc_input.ptr, acc_input.rect.volume(), + // "[Softmax:forward:input]"); print_tensor(acc_output.ptr, + // acc_output.rect.volume(), "[Softmax:forward:output]"); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + log_measure.debug( + "%s [Softmax] inference time = %.2fms\n", m->op_name, elapsed); + } +} + +void peft_bwd_kernel_wrapper(SoftmaxMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + int num_classes = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + if (m->output_type[0] == DT_FLOAT) { + Internal::peft_bwd_kernel(m, + bc, + input_grad.get_float_ptr(), + output_grad.get_float_ptr(), + num_classes, + stream); + } else if (m->output_type[0] == DT_HALF) { + Internal::peft_bwd_kernel(m, + bc, + input_grad.get_half_ptr(), + output_grad.get_half_ptr(), + num_classes, + stream); + } else { + assert(false && "Unsupported data type"); + } + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + // print_tensor(acc_input.ptr, acc_input.rect.volume(), + // "[Softmax:forward:input]"); print_tensor(acc_output.ptr, + // acc_output.rect.volume(), "[Softmax:forward:output]"); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + log_measure.debug( + "%s [Softmax] inference time = %.2fms\n", m->op_name, elapsed); + } +} + namespace Internal { template void forward_kernel(SoftmaxMeta const *m, @@ -135,7 +246,8 @@ void forward_kernel(SoftmaxMeta const *m, } template -void backward_kernel(DT *input_grad_ptr, +void backward_kernel(SoftmaxMeta const *m, + DT *input_grad_ptr, DT const *output_grad_ptr, size_t num_elements, cudaStream_t stream) { @@ -146,6 +258,115 @@ void 
backward_kernel(DT *input_grad_ptr, stream)); } +template +void inference_kernel(SoftmaxMeta const *m, + BatchConfig const *bc, + DT const *input_ptr, + DT *output_ptr, + int num_classes, + cudaStream_t stream) { + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + + float alpha = 1.0f, beta = 0.0f; + cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); + checkCUDNN(cudnnSetTensor4dDescriptor(m->outputTensor, + CUDNN_TENSOR_NCHW, + cudnn_data_type, + bc->num_active_tokens(), + num_classes, + 1, + 1)); + checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + m->outputTensor, + input_ptr, + &beta, + m->outputTensor, + output_ptr)); +} + +template +__global__ void sparse_categorical_crossentropy_loss_peft_backward( + DT *input_grad, + DT const *output_grad, + BatchConfig::TokenId const *token_ids, + int num_tokens, + int num_classes) { + CUDA_KERNEL_LOOP(i, num_tokens * num_classes) { + int class_idx = i % num_classes; + int token_idx = i / num_classes; + input_grad[i] = output_grad[i]; + if (class_idx == token_ids[token_idx]) { + input_grad[i] = input_grad[i] - (DT)1.0f; + } + } +} + +template +void peft_bwd_kernel(SoftmaxMeta const *m, + BatchConfig const *bc, + DT *input_grad_ptr, + DT const *output_grad_ptr, + int num_classes, + cudaStream_t stream) { + BatchConfig::TokenId token_ids[BatchConfig::MAX_NUM_TOKENS]; + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (!bc->requestsInfo[i].peft_bwd) { + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_bwd_tokens = bc->requestsInfo[i].num_tokens_in_batch - 1; + // shift labels by 1 position to the left (ignore first token label) + for (int j = 0; j < num_bwd_tokens; j++) { + token_ids[j] = bc->tokensInfo[j + tokens_previous_requests + 1].token_id; + } + + DT scale_factor = 1.0 / (bc->requestsInfo[i].num_tokens_in_batch - 1); + // ignore last token + checkCUDA(cudaMemsetAsync( + input_grad_ptr + (tokens_previous_requests + + bc->requestsInfo[i].num_tokens_in_batch - 1) * + num_classes, + 0, + num_classes * sizeof(DT), + stream)); + checkCUDA(cudaMemcpyAsync(m->handle.workSpace, + token_ids, + sizeof(BatchConfig::TokenId) * num_bwd_tokens, + cudaMemcpyHostToDevice, + stream)); + sparse_categorical_crossentropy_loss_peft_backward<<< + GET_BLOCKS(num_bwd_tokens * num_classes), + CUDA_NUM_THREADS, + 0, + stream>>>( + input_grad_ptr + tokens_previous_requests * num_classes, + output_grad_ptr + tokens_previous_requests * num_classes, + static_cast(m->handle.workSpace), + num_bwd_tokens, + num_classes); + // scale + scale_kernel<<>>(input_grad_ptr + + tokens_previous_requests * num_classes, + num_bwd_tokens * num_classes, + DT(0.0), + scale_factor); + + tokens_previous_requests += num_bwd_tokens + 1; + } + assert(tokens_previous_requests == bc->num_active_tokens()); +} + } // namespace Internal } // namespace Softmax } // namespace Kernels diff --git a/src/ops/kernels/transpose_kernels.cpp b/src/ops/kernels/transpose_kernels.cpp index 49a7d827f5..199e1cd0c1 100644 --- a/src/ops/kernels/transpose_kernels.cpp +++ b/src/ops/kernels/transpose_kernels.cpp @@ -14,6 +14,7 @@ */ #include "flexflow/ops/kernels/transpose_kernels.h" +#include "flexflow/ops/transpose.h" #include "flexflow/utils/hip_helper.h" #include @@ -22,6 +23,9 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Domain; 
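For reference, the softmax peft_bwd path above implements the gradient of sparse categorical cross-entropy taken directly on the softmax probabilities: for each backward token, dL/dz_c = p_c - 1[c == label], with labels shifted left by one position and the row scaled by 1/(num_tokens - 1), while the last token of each request (which has no next-token label) gets a zeroed gradient row. A minimal host-side sketch of the same computation, written against plain arrays rather than the FlexFlow accessors and device kernels (function name and arguments are illustrative only), is:

#include <vector>

// Illustrative reference only: probs holds the softmax outputs for num_tokens
// tokens (row-major, num_classes entries per token); labels holds the token
// ids of the request. Gradients are produced for the first num_tokens - 1
// tokens; the last token has no next-token label.
std::vector<float> peft_softmax_grad_reference(std::vector<float> const &probs,
                                               std::vector<int> const &labels,
                                               int num_tokens,
                                               int num_classes) {
  std::vector<float> grad(probs.size(), 0.0f);
  int const num_bwd_tokens = num_tokens - 1;
  float const scale = 1.0f / num_bwd_tokens;
  for (int t = 0; t < num_bwd_tokens; t++) {
    int const target = labels[t + 1]; // shift labels left by one position
    for (int c = 0; c < num_classes; c++) {
      float const g = probs[t * num_classes + c] - (c == target ? 1.0f : 0.0f);
      grad[t * num_classes + c] = g * scale;
    }
  }
  return grad; // the last token's row stays zero, matching the memset in the kernels
}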
+TransposeMeta::TransposeMeta(FFHandler handler, Transpose const *transpose) + : OpMeta(handler, transpose) {} + struct TransposeStrides { int num_dim; int in_strides[MAX_TENSOR_DIM], out_strides[MAX_TENSOR_DIM], diff --git a/src/ops/kernels/transpose_kernels.cu b/src/ops/kernels/transpose_kernels.cu index b401ff0ba1..18a6e405af 100644 --- a/src/ops/kernels/transpose_kernels.cu +++ b/src/ops/kernels/transpose_kernels.cu @@ -14,6 +14,7 @@ */ #include "flexflow/ops/kernels/transpose_kernels.h" +#include "flexflow/ops/transpose.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { @@ -21,6 +22,9 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Domain; +TransposeMeta::TransposeMeta(FFHandler handler, Transpose const *transpose) + : OpMeta(handler, transpose) {} + struct TransposeStrides { int num_dim; int in_strides[MAX_TENSOR_DIM], out_strides[MAX_TENSOR_DIM], diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index b19f400eb2..3161987d60 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -14,6 +14,7 @@ */ #include "flexflow/ops/layer_norm.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/model.h" #include "flexflow/utils/hash_utils.h" #include "legion/legion_utilities.h" @@ -56,7 +57,7 @@ LayerNormParams LayerNorm::get_params() const { params.elementwise_affine = this->elementwise_affine; params.eps = this->eps; params.use_bias = this->use_bias; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -561,7 +562,7 @@ void LayerNorm::inference_task(Task const *task, assert(regions.size() == 2); } - LayerNorm::forward_kernel_wrapper(m, in, out, gamma, beta); + LayerNorm::inference_kernel_wrapper(m, bc, in, out, gamma, beta); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); @@ -645,6 +646,104 @@ void LayerNorm::forward_task(Task const *task, LayerNorm::forward_kernel_wrapper(m, in, out, gamma, beta); } +Legion::FutureMap + LayerNorm::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + /* std::cout << "LayerNorm op machine_view: " << *(MachineView const *)mv + << std::endl; */ + IndexLauncher launcher(LAYERNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + // regions[0](I): output_grad + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + // regions[1](I/O): input_grad + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(2, FID_DATA); + if (elementwise_affine) { + // regions[2](I): gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(3, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): output_grad + regions[1](I/O): input_grad + regions[2](I): gamma +*/ +void LayerNorm::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + LayerNormMeta const *m = *((LayerNormMeta **)task->local_args); + assert(task->regions.size() == regions.size()); + + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorR gamma; + GenericTensorAccessorW gamma_grad, beta_grad; + + Domain out_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + Domain in_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + + if (m->elementwise_affine) { + assert(m->use_bias == (regions.size() == 3)); + gamma = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); + Domain gamma_domain = runtime->get_index_space_domain( + ctx, task->regions[2].region.get_index_space()); + + assert(gamma_domain.get_volume() == m->effective_num_elements); + } else { + assert(regions.size() == 2); + } + LayerNorm::peft_bwd_kernel_wrapper(m, output_grad, input_grad, gamma); +} + void LayerNorm::backward(FFModel const &ff) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; @@ -722,55 +821,60 @@ void LayerNorm::backward_task(Task const *task, Runtime *runtime) { LayerNormMeta const *m = *((LayerNormMeta **)task->local_args); assert(task->regions.size() == regions.size()); - float const *in_ptr = NULL, *out_grad_ptr = NULL, *gamma_ptr = NULL; - float *in_grad_ptr = NULL, *gamma_grad_ptr = NULL, *beta_grad_ptr = NULL; + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW input_grad = 
helperGetGenericTensorAccessorRW( + m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorR gamma; + GenericTensorAccessorW gamma_grad, beta_grad; Domain out_grad_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - out_grad_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); Domain in_domain = runtime->get_index_space_domain( ctx, task->regions[1].region.get_index_space()); - in_ptr = helperGetTensorPointerRO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); Domain in_grad_domain = runtime->get_index_space_domain( ctx, task->regions[2].region.get_index_space()); - in_grad_ptr = helperGetTensorPointerRW( - regions[2], task->regions[2], FID_DATA, ctx, runtime); assert(in_domain == out_grad_domain); assert(in_domain.get_volume() == m->effective_num_elements * m->effective_batch_size); + if (m->elementwise_affine) { assert(m->use_bias == (regions.size() == 6)); + gamma = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); + gamma_grad = helperGetGenericTensorAccessorRW(m->output_type[0], + regions[4], + task->regions[4], + FID_DATA, + ctx, + runtime); Domain gamma_domain = runtime->get_index_space_domain( ctx, task->regions[3].region.get_index_space()); - gamma_ptr = helperGetTensorPointerRO( - regions[3], task->regions[3], FID_DATA, ctx, runtime); Domain gamma_grad_domain = runtime->get_index_space_domain( ctx, task->regions[4].region.get_index_space()); - gamma_grad_ptr = helperGetTensorPointerRW( - regions[4], task->regions[4], FID_DATA, ctx, runtime); if (m->use_bias) { Domain beta_grad_domain = runtime->get_index_space_domain( ctx, task->regions[5].region.get_index_space()); - beta_grad_ptr = helperGetTensorPointerRW( - regions[5], task->regions[5], FID_DATA, ctx, runtime); + beta_grad = helperGetGenericTensorAccessorRW(m->output_type[0], + regions[5], + task->regions[5], + FID_DATA, + ctx, + runtime); assert(gamma_domain == beta_grad_domain); } - assert(gamma_domain == gamma_grad_domain); - assert(gamma_domain.get_volume() == m->effective_num_elements); } else { assert(regions.size() == 3); } - - LayerNorm::backward_kernel_wrapper(m, - out_grad_ptr, - in_ptr, - in_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr); + LayerNorm::backward_kernel_wrapper( + m, output_grad, input, input_grad, gamma, gamma_grad, beta_grad); } bool LayerNorm::measure_operator_cost(Simulator *sim, @@ -785,7 +889,8 @@ bool LayerNorm::measure_operator_cost(Simulator *sim, } Domain input_domain = sub_input.get_domain(); Domain output_domain = sub_output.get_domain(); - LayerNormMeta *m = sim->layernorm_meta; + MemoryAllocator gpu_mem_allocator(sim->memory); + LayerNormMeta *m = new LayerNormMeta(sim->handler, this, gpu_mem_allocator); sim->free_all(); float *in_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); @@ -821,16 +926,24 @@ bool LayerNorm::measure_operator_cost(Simulator *sim, if (sim->computationMode == COMP_MODE_TRAINING) { float *in_grad_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); + GenericTensorAccessorW in_grad_acc( + inputs[0]->data_type, input_domain, in_grad_ptr); assert(in_grad_ptr != NULL); cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); float *out_grad_ptr = NULL; out_grad_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); + GenericTensorAccessorR out_grad_acc( + outputs[0]->data_type, output_domain, out_grad_ptr); 
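The LayerNorm backward task above hands the output gradient, saved input, gamma, and the gradient accessors to backward_kernel_wrapper; the CUDA/HIP kernels further below reduce each row to two sums (ds = sum(dy * gamma * x), db = sum(dy * gamma)) and fold them into per-row scale/bias terms. The sketch below is a single-row host reference that is mathematically consistent with that fused formulation; the plain std::vector signature is hypothetical and not a FlexFlow API.

#include <cstddef>
#include <vector>

// Illustrative single-row reference for LayerNorm backward (one device block
// handles one row of N elements). Inputs: upstream gradient dy, saved input x,
// gamma, and the per-row mean/rstd from the forward pass. Outputs: dx for this
// row; dgamma/dbeta are accumulated across rows.
void layer_norm_backward_row(std::vector<float> const &dy,
                             std::vector<float> const &x,
                             std::vector<float> const &gamma,
                             float mean,
                             float rstd,
                             std::vector<float> &dx,
                             std::vector<float> &dgamma,
                             std::vector<float> &dbeta) {
  std::size_t const N = x.size();
  // Per-row sums (what ComputeInternalGradientsCUDAKernel reduces on-device).
  float ds = 0.0f, db = 0.0f;
  for (std::size_t j = 0; j < N; j++) {
    ds += dy[j] * gamma[j] * x[j];
    db += dy[j] * gamma[j];
  }
  // Fused per-row scalars (the scale/bias terms of
  // ComputeGradientFusedParamsCUDAKernel).
  float const s = 1.0f / static_cast<float>(N);
  float const b = (db * mean - ds) * rstd * rstd * rstd * s;
  float const c = -b * mean - db * rstd * s;
  for (std::size_t j = 0; j < N; j++) {
    float const xhat = (x[j] - mean) * rstd;
    dx[j] = rstd * dy[j] * gamma[j] + b * x[j] + c;
    dgamma[j] += dy[j] * xhat; // reduced over the batch dimension
    dbeta[j] += dy[j];
  }
}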
assert(out_grad_ptr != NULL); cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); float *gamma_grad_ptr = NULL, *beta_grad_ptr = NULL; + GenericTensorAccessorW gamma_grad_acc( + outputs[0]->data_type, output_domain, gamma_grad_ptr); + GenericTensorAccessorW beta_grad_acc( + outputs[0]->data_type, output_domain, beta_grad_ptr); out_of_memory = (in_grad_ptr == NULL) || (out_grad_ptr == NULL) || (((gamma_grad_ptr == NULL) || (beta_grad_ptr == NULL)) && @@ -842,13 +955,13 @@ bool LayerNorm::measure_operator_cost(Simulator *sim, } backward = [=] { - backward_kernel_wrapper(m, - out_grad_ptr, - in_ptr, - in_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr); + backward_kernel_wrapper(m, + out_grad_acc, + input1_acc, + in_grad_acc, + gamma_acc, + gamma_grad_acc, + beta_grad_acc); }; } diff --git a/src/ops/layer_norm.cpp b/src/ops/layer_norm.cpp index 07dbdb3dfb..27d314e21e 100644 --- a/src/ops/layer_norm.cpp +++ b/src/ops/layer_norm.cpp @@ -14,6 +14,7 @@ */ #include "flexflow/ops/layer_norm.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/utils/hip_helper.h" #include @@ -27,21 +28,37 @@ constexpr int kColwiseReduceTileSize = 32; LayerNormMeta::LayerNormMeta(FFHandler handle, LayerNorm const *ln, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ln) { elementwise_affine = ln->elementwise_affine; + use_bias = ln->use_bias; effective_batch_size = ln->effective_batch_size; effective_num_elements = ln->effective_num_elements; - use_bias = ln->use_bias; + profiling = ln->profiling; + inference_debugging = ln->inference_debugging; eps = ln->eps; - checkCUDA(hipMalloc(&mean_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&rstd_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&ds_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&db_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&scale_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&bias_ptr, sizeof(float) * effective_batch_size)); + DataType data_type = ln->data_type; + size_t totalSize = effective_batch_size * data_type_size(data_type) * 6; + gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + mean_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + rstd_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + ds_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + db_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + scale_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + bias_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + allocated_peft_buffer_size = 0; } -LayerNormMeta::~LayerNormMeta(void) {} +LayerNormMeta::~LayerNormMeta(void) { + if (reserveInst != Realm::RegionInstance::NO_INST) { + reserveInst.destroy(); + } +} template __device__ __forceinline__ T WARP_SHFL_DOWN(T value, @@ -74,7 +91,7 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { shared[wid] = val; } __syncthreads(); - val = (threadIdx.x < blockDim.x / C10_WARP_SIZE) ? shared[lid] : 0; + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? 
shared[lid] : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -82,8 +99,14 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { } template -__global__ void RowwiseMomentsCUDAKernel( - int64_t N, float eps, T const *X, T *mean, T *rstd) { +__global__ void LayerNormFusedForwardKernel(int64_t N, + float eps, + T const *X, + T *mean, + T *rstd, + T const *gamma, + T const *beta, + T *Y) { __shared__ float m_shared[C10_WARP_SIZE]; __shared__ float v_shared[C10_WARP_SIZE]; const int64_t i = blockIdx.x; @@ -103,18 +126,10 @@ __global__ void RowwiseMomentsCUDAKernel( mean[i] = static_cast(sum1); rstd[i] = static_cast(rsqrt(sum2 + eps)); } -} -template -__global__ void LayerNormForwardCUDAKernel(int64_t N, - T const *X, - T const *mean, - T const *rstd, - T const *gamma, - T const *beta, - T *Y) { + __syncthreads(); + using T_ACC = T; - const int64_t i = blockIdx.x; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const T_ACC gamma_v = @@ -135,28 +150,19 @@ void LayerNorm::forward_kernel(LayerNormMeta const *m, T const *gamma_ptr, T const *beta_ptr, hipStream_t stream) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(RowwiseMomentsCUDAKernel), - m->effective_batch_size, - kCUDABlockReduceNumThreads, - 0, - stream, - m->effective_num_elements, - m->eps, - in_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr)); - hipLaunchKernelGGL(HIP_KERNEL_NAME(LayerNormForwardCUDAKernel), - m->effective_batch_size, - kCUDANumThreads, - 0, - stream, - m->effective_num_elements, - in_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_ptr, - beta_ptr, - out_ptr); + + LayerNormFusedForwardKernel + <<effective_batch_size, + std::min(CUDA_NUM_THREADS, (int)m->effective_num_elements), + 0, + stream>>>(m->effective_num_elements, + m->eps, + in_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + beta_ptr, + out_ptr); } /*static*/ @@ -167,24 +173,154 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, GenericTensorAccessorR const &beta) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); + + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } if (m->input_type[0] == DT_FLOAT) { - LayerNorm::forward_kernel(m, - input.get_float_ptr(), - output.get_float_ptr(), - gamma.get_float_ptr(), - m->use_bias ? beta.get_float_ptr() - : nullptr, - stream); + LayerNorm::forward_kernel( + m, + input.get_float_ptr(), + output.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_float_ptr() : nullptr, + stream); } else if (m->input_type[0] == DT_HALF) { - LayerNorm::forward_kernel(m, - input.get_half_ptr(), - output.get_half_ptr(), - gamma.get_half_ptr(), - m->use_bias ? beta.get_half_ptr() : nullptr, - stream); + LayerNorm::forward_kernel( + m, + input.get_half_ptr(), + output.get_half_ptr(), + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? 
beta.get_half_ptr() : nullptr, + stream); } else { assert(false && "unsupport datatype in layernorm"); } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[LayerNorm] forward time (CF) = %.9fms\n", elapsed); + // print_tensor(in_ptr, 32, "[LayerNorm:forward:input]"); + // print_tensor(out_ptr, 32, "[LayerNorm:forward:output]"); + } +} + +/*static*/ +void LayerNorm::inference_kernel_wrapper(LayerNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + input.get_float_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + input.get_half_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + if (m->input_type[0] == DT_FLOAT) { + LayerNorm::forward_kernel( + m, + input.get_float_ptr(), + output.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_float_ptr() : nullptr, + stream); + } else if (m->input_type[0] == DT_HALF) { + LayerNorm::forward_kernel( + m, + input.get_half_ptr(), + output.get_half_ptr(), + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? 
beta.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "unsupport datatype in layernorm"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[LayerNorm] forward time (CF) = %.9fms\n", elapsed); + // print_tensor(in_ptr, 32, "[LayerNorm:forward:input]"); + // print_tensor(out_ptr, 32, "[LayerNorm:forward:output]"); + } } template @@ -224,7 +360,7 @@ __global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, using T_ACC = T; const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; if (index < M) { - const T_ACC s = T_ACC(1) / static_cast(N); + const T_ACC s = T_ACC(1) / static_cast((int)N); const T_ACC a = (db[index] * static_cast(mean[index]) - ds[index]) * static_cast(rstd[index]) * static_cast(rstd[index]) * @@ -235,27 +371,6 @@ __global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, } } -template -__global__ void LayerNormBackwardCUDAKenrel(int64_t N, - T const *dY, - T const *X, - T const *gamma, - T const *a, - T const *b, - T const *c, - T *dX) { - using T_ACC = T; - const int64_t i = blockIdx.x; - for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; - const T_ACC gamma_v = - gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); - dX[index] = - static_cast(a[i]) * static_cast(dY[index]) * gamma_v + - b[i] * static_cast(X[index]) + c[i]; - } -} - template __global__ void GammaBetaBackwardSimpleCUDAKernel(int64_t M, int64_t N, @@ -452,116 +567,148 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, hipStream_t stream) { const int64_t M = m->effective_batch_size; const int64_t N = m->effective_num_elements; - hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeInternalGradientsCUDAKernel), - M, - kCUDABlockReduceNumThreads, - 0, - stream, - N, - output_grad_ptr, - input_ptr, - gamma_ptr, - static_cast(m->ds_ptr), - static_cast(m->db_ptr)); + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + input_ptr, + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; - hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeGradientFusedParamsCUDAKernel), - B, - kCUDANumThreads, - 0, - stream, - M, - N, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - static_cast(m->ds_ptr), - static_cast(m->db_ptr), - static_cast(m->scale_ptr), - static_cast(m->bias_ptr)); - + ComputeGradientFusedParamsCUDAKernel + <<>>(M, + N, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); int const warp_size = C10_WARP_SIZE; int const num_threads = 128; const dim3 blocks(M); int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + input_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + N); - hipLaunchKernelGGL(HIP_KERNEL_NAME(layer_norm_grad_input_kernel), - blocks, - num_threads, - nshared, - stream, - output_grad_ptr, - input_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_ptr, - input_grad_ptr, - N); if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { if (M < 512) { // For small batch size, do colwise reduce directly const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; - 
hipLaunchKernelGGL(HIP_KERNEL_NAME(GammaBetaBackwardSimpleCUDAKernel), - B, - kCUDANumThreads, - 0, - stream, - M, - N, - output_grad_ptr, - input_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_grad_ptr, - beta_grad_ptr); + GammaBetaBackwardSimpleCUDAKernel + <<>>(M, + N, + output_grad_ptr, + input_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); } else { const int64_t B = (N + kColwiseReduceTileSize - 1) / kColwiseReduceTileSize; constexpr int kThreadX = kColwiseReduceTileSize; constexpr int kThreadY = kColwiseReduceTileSize / 2; - hipLaunchKernelGGL(HIP_KERNEL_NAME(GammaBetaBackwardCUDAKernel), - B, - dim3(kThreadX, kThreadY), - 0, - stream, - M, - N, - output_grad_ptr, - input_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_grad_ptr, - beta_grad_ptr); + GammaBetaBackwardCUDAKernel + <<>>( + M, + N, + output_grad_ptr, + input_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); } } } /*static*/ template -void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, - T const *output_grad_ptr, - T const *input_ptr, - T *input_grad_ptr, - T const *gamma_ptr, - T *gamma_grad_ptr, - T *beta_grad_ptr) { +void LayerNorm::peft_bwd_kernel(LayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T const *gamma_ptr, + hipStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + static_cast(m->input_activation), + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + N); +} + +/*static*/ +void LayerNorm::peft_bwd_kernel_wrapper( + LayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &gamma) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - LayerNorm::backward_kernel(m, - output_grad_ptr, - input_ptr, - input_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr, - stream); + if (m->output_type[0] == DT_FLOAT) { + LayerNorm::peft_bwd_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + gamma.get_float_ptr(), + stream); + } else { + assert(m->output_type[0] == DT_HALF); + LayerNorm::peft_bwd_kernel(m, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + gamma.get_half_ptr(), + stream); + } } -template void - LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, - float const *output_grad_ptr, - float const *input_ptr, - float *input_grad_ptr, - float const *gamma_ptr, - float *gamma_grad_ptr, - float *beta_grad_ptr); +/*static*/ +void LayerNorm::backward_kernel_wrapper( + LayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + if (m->output_type[0] == DT_FLOAT) { + LayerNorm::backward_kernel(m, + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), + gamma.get_float_ptr(), + gamma_grad.get_float_ptr(), + beta_grad.get_float_ptr(), + stream); + } else if (m->output_type[0] == DT_HALF) { + 
LayerNorm::backward_kernel(m, + output_grad.get_half_ptr(), + input.get_half_ptr(), + input_grad.get_half_ptr(), + gamma.get_half_ptr(), + gamma_grad.get_half_ptr(), + beta_grad.get_half_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } +} -}; // namespace FlexFlow +} // namespace FlexFlow diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index 44979c48fe..0801d11617 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -27,7 +27,7 @@ constexpr int kColwiseReduceTileSize = 32; LayerNormMeta::LayerNormMeta(FFHandler handle, LayerNorm const *ln, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ln) { elementwise_affine = ln->elementwise_affine; use_bias = ln->use_bias; effective_batch_size = ln->effective_batch_size; @@ -50,6 +50,7 @@ LayerNormMeta::LayerNormMeta(FFHandler handle, data_type_size(data_type) * effective_batch_size); bias_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * effective_batch_size); + allocated_peft_buffer_size = 0; } LayerNormMeta::~LayerNormMeta(void) { @@ -96,73 +97,6 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { return val; } -template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { - int const lid = threadIdx.x % C10_WARP_SIZE; - int const wid = threadIdx.x / C10_WARP_SIZE; - val = WarpReduceSum(val); - __syncthreads(); - if (lid == 0) { - shared[wid] = val; - } - __syncthreads(); - val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) - ? shared[lid] - : T(0); - if (wid == 0) { - val = WarpReduceSum(val); - } - return val; -} - -#ifdef DEADCODE -template -__global__ void RowwiseMomentsCUDAKernel( - int64_t N, float eps, T const *X, T *mean, T *rstd) { - __shared__ float m_shared[C10_WARP_SIZE]; - __shared__ float v_shared[C10_WARP_SIZE]; - const int64_t i = blockIdx.x; - float sum1 = 0.0f; - float sum2 = 0.0f; - for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; - sum1 += static_cast(X[index]); - sum2 += static_cast(X[index]) * static_cast(X[index]); - } - sum1 = BlockReduceSum(sum1, m_shared); - sum2 = BlockReduceSum(sum2, v_shared); - if (threadIdx.x == 0) { - float const scale = float(1) / static_cast(N); - sum1 *= scale; - sum2 = max(sum2 * scale - sum1 * sum1, float(0)); - mean[i] = static_cast(sum1); - rstd[i] = static_cast(rsqrt(sum2 + eps)); - } -} - -template -__global__ void LayerNormForwardCUDAKernel(int64_t N, - T const *X, - T const *mean, - T const *rstd, - T const *gamma, - T const *beta, - T *Y) { - using T_ACC = T; - const int64_t i = blockIdx.x; - for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; - const T_ACC gamma_v = - gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); - const T_ACC beta_v = - beta == nullptr ? 
T_ACC(0) : static_cast(beta[j]); - Y[index] = (static_cast(X[index]) - static_cast(mean[i])) * - static_cast(rstd[i]) * gamma_v + - beta_v; - } -} -#endif - template __global__ void LayerNormFusedForwardKernel(int64_t N, float eps, @@ -177,18 +111,13 @@ __global__ void LayerNormFusedForwardKernel(int64_t N, const int64_t i = blockIdx.x; float sum1 = 0.0f; float sum2 = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; sum1 += static_cast(X[index]); sum2 += static_cast(X[index]) * static_cast(X[index]); } - if (threadIdx.x < kCUDABlockReduceNumThreads) { - sum1 = BlockReduceSum( - sum1, m_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - sum2 = BlockReduceSum( - sum2, v_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - } + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); if (threadIdx.x == 0) { float const scale = float(1) / static_cast(N); sum1 *= scale; @@ -200,7 +129,7 @@ __global__ void LayerNormFusedForwardKernel(int64_t N, __syncthreads(); using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const T_ACC gamma_v = gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); @@ -221,25 +150,18 @@ void LayerNorm::forward_kernel(LayerNormMeta const *m, T const *beta_ptr, cudaStream_t stream) { - std::pair kernel1_parallelism = - std::make_pair(m->effective_batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->effective_batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); - LayerNormFusedForwardKernel - <<>>(m->effective_num_elements, - m->eps, - in_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_ptr, - beta_ptr, - out_ptr); + <<effective_batch_size, + std::min(CUDA_NUM_THREADS, (int)m->effective_num_elements), + 0, + stream>>>(m->effective_num_elements, + m->eps, + in_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + beta_ptr, + out_ptr); } /*static*/ @@ -290,6 +212,116 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, } } +/*static*/ +void LayerNorm::inference_kernel_wrapper(LayerNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == 
PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + input.get_float_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + input.get_half_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + if (m->input_type[0] == DT_FLOAT) { + LayerNorm::forward_kernel( + m, + input.get_float_ptr(), + output.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_float_ptr() : nullptr, + stream); + } else if (m->input_type[0] == DT_HALF) { + LayerNorm::forward_kernel( + m, + input.get_half_ptr(), + output.get_half_ptr(), + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "unsupport datatype in layernorm"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[LayerNorm] forward time (CF) = %.9fms\n", elapsed); + // print_tensor(in_ptr, 32, "[LayerNorm:forward:input]"); + // print_tensor(out_ptr, 32, "[LayerNorm:forward:output]"); + } +} + template __global__ void ComputeInternalGradientsCUDAKernel( int64_t N, T const *dY, T const *X, T const *gamma, T *ds, T *db) { @@ -327,7 +359,7 @@ __global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, using T_ACC = T; const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; if (index < M) { - const T_ACC s = T_ACC(1) / static_cast(N); + const T_ACC s = T_ACC(1) / static_cast((int)N); const T_ACC a = (db[index] * static_cast(mean[index]) - ds[index]) * static_cast(rstd[index]) * static_cast(rstd[index]) * @@ -338,27 +370,6 @@ __global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, } } -template -__global__ void LayerNormBackwardCUDAKenrel(int64_t N, - T const *dY, - T const *X, - T const *gamma, - T const *a, - T const *b, - T const *c, - T *dX) { - using T_ACC = T; - const int64_t i = blockIdx.x; - for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; - const T_ACC gamma_v = - gamma == nullptr ? 
T_ACC(1) : static_cast(gamma[j]); - dX[index] = - static_cast(a[i]) * static_cast(dY[index]) * gamma_v + - b[i] * static_cast(X[index]) + c[i]; - } -} - template __global__ void GammaBetaBackwardSimpleCUDAKernel(int64_t M, int64_t N, @@ -620,44 +631,83 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, /*static*/ template -void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, - T const *output_grad_ptr, - T const *input_ptr, - T *input_grad_ptr, - T const *gamma_ptr, - T *gamma_grad_ptr, - T *beta_grad_ptr) { +void LayerNorm::peft_bwd_kernel(LayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T const *gamma_ptr, + cudaStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + static_cast(m->input_activation), + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + N); +} + +/*static*/ +void LayerNorm::peft_bwd_kernel_wrapper( + LayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &gamma) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); if (m->output_type[0] == DT_FLOAT) { - LayerNorm::backward_kernel(m, - output_grad_ptr, - input_ptr, - input_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr, - stream); + LayerNorm::peft_bwd_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + gamma.get_float_ptr(), + stream); + } else { + assert(m->output_type[0] == DT_HALF); + LayerNorm::peft_bwd_kernel(m, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + gamma.get_half_ptr(), + stream); } - // }else if(m->output_type[0] == DT_HALF){ - // LayerNorm::backward_kernel(m, - // output_grad_ptr, - // input_ptr, - // input_grad_ptr, - // gamma_ptr, - // gamma_grad_ptr, - // beta_grad_ptr, - // stream); - // } } -template void - LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, - float const *output_grad_ptr, - float const *input_ptr, - float *input_grad_ptr, - float const *gamma_ptr, - float *gamma_grad_ptr, - float *beta_grad_ptr); +/*static*/ +void LayerNorm::backward_kernel_wrapper( + LayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + if (m->output_type[0] == DT_FLOAT) { + LayerNorm::backward_kernel(m, + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), + gamma.get_float_ptr(), + gamma_grad.get_float_ptr(), + beta_grad.get_float_ptr(), + stream); + } else if (m->output_type[0] == DT_HALF) { + LayerNorm::backward_kernel(m, + output_grad.get_half_ptr(), + input.get_half_ptr(), + input_grad.get_half_ptr(), + gamma.get_half_ptr(), + gamma_grad.get_half_ptr(), + beta_grad.get_half_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } +} -}; // namespace FlexFlow +} // namespace FlexFlow diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 44b56d623e..20ad762b62 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -498,7 +498,7 @@ OpMeta *Linear::init_task_with_dim(Task const *task, 
m->add_bias_only_once = linear->add_bias_only_once; m->profiling = linear->profiling; m->inference_debugging = linear->inference_debugging; - m->trainableInputs[0] = linear->trainableInputs[0]; + m->trainable_inputs[0] = linear->trainable_inputs[0]; m->weight_ptr_type = m->input_type[0]; m->quantization_type = linear->quantization_type; m->offload = linear->offload; @@ -632,8 +632,11 @@ void Linear::inference_task(Task const *task, m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + assert((weight.domain.hi()[0] - weight.domain.lo()[0] + 1) == in_dim); + assert((weight.domain.hi()[1] - weight.domain.lo()[1] + 1) == out_dim); + assert(weight.domain.get_volume() == in_dim * out_dim); - int batch_size = bc->num_active_tokens(); + int batch_size = bc->num_active_infr_tokens(); GenericTensorAccessorR bias; if (m->use_bias && !(m->add_bias_only_once && task->index_point.point_data[0] != 0)) { @@ -645,14 +648,15 @@ void Linear::inference_task(Task const *task, runtime); assert(bias.domain.get_volume() == static_cast(out_dim)); } - forward_kernel_wrapper(m, - input.ptr, - output.ptr, - weight.ptr, - bias.ptr, - in_dim, - out_dim, - batch_size); + inference_kernel_wrapper(m, + bc, + input.ptr, + output.ptr, + weight.ptr, + bias.ptr, + in_dim, + out_dim, + batch_size); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; @@ -664,6 +668,119 @@ void Linear::inference_task(Task const *task, } Linear::save_inference_tensors_to_file( m, shard_id, bc, {input}, weights_accessors, {output}); + printf("\tin=[%i,%i].T @ w=[%i,%i] -> out=[%i,%i]\n", + in_dim, + bc->num_tokens, + in_dim, + out_dim, + out_dim, + bc->num_tokens); + } +} + +FutureMap Linear::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + /* std::cout << "Linear op machine_view: " << *(MachineView const *)mv + << std::endl; */ + IndexLauncher launcher(LINEAR_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region, + ff.cpu_offload ? 
MAP_TO_ZC_MEMORY : 0)); + launcher.add_field(2, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +void Linear::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + Domain input_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + LinearMeta *m = *((LinearMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + assert(regions.size() == 3); + assert(task->regions.size() == 3); + if (m->quantization_type == DT_NONE) { + assert(m->input_type[0] == m->weight_type[0]); + } + assert(m->input_type[0] == m->output_type[0]); + + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output_grad = helperGetGenericTensorAccessorRW( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + + int num_infr_tokens = bc->num_active_infr_tokens(); + int num_peft_tokens = bc->num_active_peft_tokens(); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + Linear::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {weight}, {output_grad}, false, true); + printf("\tw=[%i,%i] @ out_grad=[%i,%i] -> in_grad[%i,%i]\n", + in_dim, + out_dim, + out_dim, + num_peft_tokens, + in_dim, + num_peft_tokens); + } + peft_bwd_kernel_wrapper(m, + input_grad.ptr, + output_grad.ptr, + weight.ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + Linear::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {weight}, {output_grad}, false); } } @@ -782,7 +899,7 @@ void Linear::backward(FFModel const &ff) { launcher.add_field(rid++, FID_DATA); // regions[1](I/O): replica_grad assert(replica == NULL); - if (trainableInputs[0]) { + if (trainable_inputs[0]) { launcher.add_region_requirement( RegionRequirement(inputs[0]->part_grad, 0 /*projection id*/, @@ -878,17 +995,17 @@ void Linear::backward_task_with_dim(Task const *task, Runtime *runtime) { // Linear* linear = (Linear*) task->args; LinearMeta const *m = *((LinearMeta **)task->local_args); - assert(regions.size() == (5 + static_cast(m->trainableInputs[0]) + + assert(regions.size() == (5 + static_cast(m->trainable_inputs[0]) + static_cast(m->use_bias))); assert(task->regions.size() == - (5 + static_cast(m->trainableInputs[0]) + + (5 + static_cast(m->trainable_inputs[0]) + static_cast(m->use_bias))); DT *input_grad = nullptr; size_t rid = 0; TensorAccessorR acc_input( regions[rid], task->regions[rid], FID_DATA, ctx, runtime); rid++; - if (m->trainableInputs[0]) { + if (m->trainable_inputs[0]) { Domain domain = runtime->get_index_space_domain( ctx, task->regions[rid].region.get_index_space()); if (domain.get_dim() == NDIM + 1) { @@ -1119,7 +1236,10 @@ bool Linear::measure_operator_cost(Simulator *sim, int input_n = sub_input.get_volume() / input_c; int output_c = sub_output.dims[0].size; int output_n = sub_output.get_volume() / 
output_c; - LinearMeta *m = sim->linear_meta; + + MemoryAllocator gpu_mem_allocator(sim->memory); + LinearMeta *m = new LinearMeta( + sim->handler, output_n, this, gpu_mem_allocator, input_c * output_c); m->activation = activation; m->kernel_reg_type = kernel_reg_type; m->kernel_reg_lambda = kernel_reg_lambda; @@ -1164,7 +1284,7 @@ bool Linear::measure_operator_cost(Simulator *sim, }; if (sim->computationMode == COMP_MODE_TRAINING) { void *input_grad_ptr = NULL; - if (trainableInputs[0]) { + if (trainable_inputs[0]) { input_grad_ptr = sim->allocate(sub_input.get_volume(), inputs[0]->data_type); } else { @@ -1313,7 +1433,7 @@ LinearParams Linear::get_params() const { params.kernel_reg_lambda = this->kernel_reg_lambda; params.quantization_type = this->quantization_type; params.offload = this->offload; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc new file mode 100644 index 0000000000..fde6bc2b28 --- /dev/null +++ b/src/ops/lora_linear.cc @@ -0,0 +1,1316 @@ +#include "flexflow/ops/lora_linear.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/layer.h" +#include "flexflow/model.h" +#include "flexflow/ops/kernels/lora_linear_kernels.h" +#include "flexflow/utils/hash_utils.h" +#include "flexflow/utils/peft_weight_allocator.h" +#include "legion/legion_utilities.h" +#include +#include +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include "flexflow/utils/cuda_helper.h" +#else +#include "flexflow/utils/hip_helper.h" +#endif + +namespace FlexFlow { + +// declare Legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::Future; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::InlineLauncher; +using Legion::Machine; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; + +using namespace FlexFlow::Kernels::LoraLinear; + +bool check_lora_layer_match(Layer *potential_target, + std::string target_module_name) { + if (potential_target->op_type == OP_LINEAR && + potential_target->name != nullptr && strlen(potential_target->name) > 0) { + std::string s(potential_target->name); + if (s.find(target_module_name) != std::string::npos && + s.find("lora") == std::string::npos) { + return true; + } + } + return false; +} + +PEFTModelID *FFModel::add_lora_layer(LoraLinearConfig const peft_config) { + assert(config.enable_peft && + "Cannot add a LoRA layer if PEFT mode is not enabled"); + if (peft_config.target_modules.size() == 0) { + printf("PEFT config does not contain any target module\n"); + std::cout << peft_config << std::endl; + assert(false); + } + PEFTModelID *peft_model_id = new PEFTModelID(peft_model_global_guid++); + peft_configs[*peft_model_id] = peft_config; + + for (std::string target_module_name : peft_config.target_modules) { + assert(target_module_name.length() > 0 && + "LoRA target module name is empty"); + // find target layer + for (auto it = layers.begin(); it != layers.end(); ++it) { + Layer *target_module = *it; + bool match = check_lora_layer_match(target_module, target_module_name); + if (!match) { + continue; + } + + if (base_layer_to_peft_layer.find(target_module) != + base_layer_to_peft_layer.end()) { + // lora linear layer already added, no need to add again + Layer *peft_layer 
= base_layer_to_peft_layer[target_module]; + peft_layer_to_peft_id[peft_layer].push_back(*peft_model_id); + } else { + Tensor const input = target_module->inputs[0]; + Tensor const output = target_module->outputs[0]; + assert(input->data_type == output->data_type); + std::string name_ = target_module->name + ? std::string(target_module->name) + : std::string(""); + size_t last_underscore = name_.length() - 1; + for (int i = name_.length() - 1; i > 0; i--) { + if (!(std::isdigit(target_module->name[i]) || + target_module->name[i] == '_')) { + break; + } else if (target_module->name[i] == '_') { + last_underscore = i; + } + } + name_.erase(last_underscore); + + name_ += ".lora"; + std::cout << "Adding layer " << name_ << std::endl; + Layer *peft_layer = new Layer(this, + OP_LORA, + output->data_type, + name_.c_str(), + 2 /*inputs*/, + 0 /*weights*/, + 1 /*outputs*/, + input, + output); + // fix LoRA layer's transformer layer ID and model ID + peft_layer->layer_guid.transformer_layer_id = + target_module->layer_guid.transformer_layer_id; + peft_layer->layer_guid.model_id = target_module->layer_guid.model_id; + { + int numdims = output->num_dims; + int dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdims; i++) { + dims[i] = output->dims[i]; + } + peft_layer->outputs[0] = + create_tensor_legion_ordering(numdims, + dims, + output->data_type, + peft_layer, + 0, + true /*create_grad*/); + } + it = layers.insert(it + 1, peft_layer); + ++it; + base_layer_to_peft_layer[target_module] = peft_layer; + peft_layer_to_peft_id[peft_layer] = std::vector(); + peft_layer_to_peft_id[peft_layer].push_back(*peft_model_id); + } + } + } + + // save finetuned lora model configs to file + if (peft_config.trainable) { + std::string finetuned_model_folder = join_path({ + peft_config.cache_folder, + "finetuned_models", + peft_config.peft_model_id, + }); + fs::remove_all(finetuned_model_folder); + std::string finetuned_model_config_folder = join_path({ + finetuned_model_folder, + "config", + }); + fs::create_directories(finetuned_model_config_folder); + std::string lora_linear_config_filepath = join_path({ + finetuned_model_config_folder, + "ff_config.json", + }); + serialize_to_json_file(peft_config, lora_linear_config_filepath); + std::string optimizer_config_filepath = join_path({ + finetuned_model_config_folder, + "ff_optimizer_config.json", + }); + if (typeid(*peft_config.optimizer_config) == + typeid(LoraSGDOptimizerConfig)) { + LoraSGDOptimizerConfig const *sgd_config = + static_cast( + peft_config.optimizer_config); + serialize_to_json_file(*sgd_config, optimizer_config_filepath); + } else if (typeid(*peft_config.optimizer_config) == + typeid(LoraAdamOptimizerConfig)) { + LoraAdamOptimizerConfig const *adam_config = + static_cast( + peft_config.optimizer_config); + serialize_to_json_file(*adam_config, optimizer_config_filepath); + } else { + assert(false && "Optimizer not supported"); + } + } + + return peft_model_id; +} + +Op *LoraLinear::create_operator_from_layer( + FFModel &model, + Layer const *layer, + std::vector const &inputs) { + std::unordered_map _peft_configs; + std::vector const &peft_ids = + model.peft_layer_to_peft_id[(Layer *)layer]; + for (int i = 0; i < peft_ids.size(); i++) { + _peft_configs.emplace( + std::make_pair(peft_ids[i], model.peft_configs[peft_ids[i]])); + } + return new LoraLinear(model, + layer->layer_guid, + layer->op_type, + inputs[0], + inputs[1], + _peft_configs, + layer->name); +} + +LoraLinear::LoraLinear(FFModel &model, + LoraLinear const &other, + ParallelTensor const input, 
+ ParallelTensor const output) + : LoraLinear(model, + other.layer_guid, + other.op_type, + input, + output, + other.peft_configs, + other.name) {} + +LoraLinear::LoraLinear(FFModel &model, + Params const ¶ms, + Input const &inputs, + char const *name) + : LoraLinear(model, + params.layer_guid, + params.type, + inputs.first, + inputs.second, + params.peft_configs, + params.name) {} + +LoraLinear::LoraLinear( + FFModel &model, + LayerID const &_layer_guid, + OperatorType _op_type, + ParallelTensor const _input, + ParallelTensor const _output, + std::unordered_map const &_peft_configs, + char const *name) + : Op(model, + _op_type, + _output->data_type, + name, + 2 /*inputs*/, + 0 /*weights*/, + false, + 1 /*outputs*/, + _input, + _output) { + assert(_input->data_type == _output->data_type); + // overwrite layer_guid + layer_guid = _layer_guid; + data_type = _output->data_type; + + ParallelTensorShape input_shape = this->inputs[0]->get_shape(); + LoraLinearParams params = this->get_params(); + + // Create output tensor + { + int numdim = inputs[1]->num_dims; + ParallelDim dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdim; i++) { + dims[i] = inputs[1]->dims[i]; + } + outputs[0] = model.create_parallel_tensor_legion_ordering( + numdim, dims, inputs[1]->data_type, this); + } + for (auto const &kv : _peft_configs) { + peft_configs.insert(kv); + } + // assert(check_output_input_weight_parallel_dims(allocate_weights)); +} + +void LoraLinear::init(FFModel const &ff) { + assert(false && "LoraLinear does not support normal init"); +} + +void LoraLinear::init_inference( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + assert(batch_inputs.size() == 2); + assert(batch_outputs.size() == 1); + // Assert that the output and the second input are mapped to the same + // region/part + assert(batch_outputs[0]->region == batch_inputs[1]->region); + assert(batch_outputs[0]->part == batch_inputs[1]->part); + // assert(check_output_input_weight_same_machine_view()); + // output is considered as an input to allow in-place optimization + ParallelTensor output_tensor = batch_outputs[0]; + parallel_is = output_tensor->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? 
mv : &output_tensor->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, output_tensor); + IndexLauncher launcher(LORA_LINEAR_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(LoraLinear)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(1, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, output_tensor); +} + +template +void load_peft_from_file(DT *ptr, + size_t num_rows, + size_t num_columns, + int num_shards, + int shard_id, + std::string filepath) { + std::ifstream in(filepath, std::ios::in | std::ios::binary); + if (!in.good()) { + printf("Could not open file: %s\n", filepath.c_str()); + } + assert(in.good() && "incorrect weight file path"); + + // HuggingFace dims (serialized in row-major order) + // lora_A: [rank, intermediate_dim] + // lora_B: [hidden_dim, rank] + // FlexFlow dims (serialized in column-major order) + // lora_A: [intermediate_dim, rank] + // lora_B: [rank, out_dim] + // Tensor parallelism: shard lora_A along intermediate_dim, replicate lora_B + assert(num_rows % num_shards == 0); + size_t chunk_size = num_rows / num_shards; + size_t offset = (num_shards > 1) ? shard_id * chunk_size : 0; + + // Allocate memory for the weight shard + std::vector
host_array(chunk_size * num_columns); + // Read the chunk + size_t total_size_read = 0; + for (int i = 0; i < num_columns; ++i) { + in.seekg((i * num_rows + offset) * sizeof(DT)); + in.read(reinterpret_cast(host_array.data() + i * chunk_size), + chunk_size * sizeof(DT)); + total_size_read += in.gcount(); + } + // Check weight shard size + size_t expected_data_size = chunk_size * num_columns * sizeof(DT); + if (total_size_read != expected_data_size) { + printf("load weight data error: expected %lu bytes, got: %lu bytes, data " + "size: %lu\n", + expected_data_size, + total_size_read, + sizeof(DT)); + assert(false); + } + assert(host_array.size() == chunk_size * num_columns); + // Copy weight to device memory + copy_tensor_host_to_dev(ptr, host_array.data(), chunk_size * num_columns); + in.close(); +} + +/* + regions[0](O): output + regions[1](I): kernel + regions[2](I): bias +*/ +OpMeta *LoraLinear::init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + LoraLinear const *lora = (LoraLinear *)task->args; + FFHandler handle = *((FFHandler const *)task->local_args); + GenericTensorAccessorR input = + helperGetGenericTensorAccessorRO(lora->inputs[0]->data_type, + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW output = + helperGetGenericTensorAccessorRW(lora->outputs[0]->data_type, + regions[1], + task->regions[1], + FID_DATA, + ctx, + runtime); + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + int batch_size = output.domain.get_volume() / out_dim; + assert(input.domain.get_volume() == in_dim * batch_size); + assert(output.domain.get_volume() == out_dim * batch_size); + + LoraLinearMeta *m = new LoraLinearMeta(handle, lora); + m->trainable_inputs[0] = lora->trainable_inputs[0]; + std::strcpy(m->op_name, lora->name); + m->layer_guid = lora->layer_guid; + + int num_shards = lora->inputs[0]->dims[0].degree; + int shard_id = task->index_point.point_data[0]; + int num_dims = lora->inputs[0]->num_dims; + assert(in_dim == lora->inputs[0]->dims[0].size / num_shards); + assert(out_dim == + lora->inputs[1]->dims[0].size / lora->inputs[1]->dims[0].degree); + + DataType dt = m->input_type[0]; + assert(dt == m->input_type[1]); + assert(dt == m->output_type[0]); + assert(dt == lora->inputs[0]->data_type); + assert(dt == lora->inputs[1]->data_type); + assert(dt == lora->outputs[0]->data_type); + + // get layer name + assert(lora->name != nullptr && + "Layer name is not set, cannot determine weights location"); + std::string lora_layername = std::string(lora->name); + std::string searchString = "lora"; + size_t found = lora_layername.find(searchString); + if (found == std::string::npos) { + std::cout << "LoraLinear layer name not in the right format (does not " + "contain word 'lora')" + << std::endl; + assert(false); + } + std::string lora_layername_substr = + lora_layername.substr(0, found + searchString.length()); + + for (auto const &kv : lora->peft_configs) { + PEFTModelID const &model_id = kv.first; + LoraLinearConfig const &lora_config = kv.second; + + int rank = lora_config.rank; + + int w0_num_elements = rank * in_dim; + int w1_num_elements = rank * out_dim; + // values below represent total weight sizes before sharding. Lora B is not + // sharded. 
+ int lora_A_num_rows = in_dim * num_shards; + int lora_A_num_cols = rank; + int lora_B_num_rows = rank; + int lora_B_num_cols = out_dim; + int lora_A_num_shards = num_shards; + int lora_B_num_shards = 1; + + LoraLinearWeight weight; + weight.in_dim = in_dim; + weight.out_dim = out_dim; + weight.rank = rank; + weight.num_shards = num_shards; + PEFTWeightAllocator *allocator = m->handle.peft_weight_allocator; + weight.w0_ptr = allocator->allocate_local_weights_untyped( + model_id, w0_num_elements * data_type_size(dt)); + weight.w1_ptr = allocator->allocate_local_weights_untyped( + model_id, w1_num_elements * data_type_size(dt)); + + if (!lora_config.init_lora_weights) { + // load weights from file + std::string weights_folder_filepath = join_path({ + lora_config.cache_folder, + "weights", + lora_config.peft_model_id, + dt == DT_FLOAT ? "full-precision" : "half-precision", + }); + std::string w0_filepath = join_path( + {weights_folder_filepath, lora_layername_substr + "_A.weight"}); + std::string w1_filepath = join_path( + {weights_folder_filepath, lora_layername_substr + "_B.weight"}); + if (dt == DT_FLOAT) { + std::cout << "Loading LORA weight " + << lora_layername_substr + "_A.weight" + << ", num_rows: " << lora_A_num_rows + << ", num_cols: " << lora_A_num_cols + << ", num_shards: " << lora_A_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((float *)weight.w0_ptr, + lora_A_num_rows, + lora_A_num_cols, + lora_A_num_shards, + shard_id, + w0_filepath); + std::cout << "Loading LORA weight " + << lora_layername_substr + "_B.weight" + << ", num_rows: " << lora_B_num_rows + << ", num_cols: " << lora_B_num_cols + << ", num_shards: " << lora_B_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((float *)weight.w1_ptr, + lora_B_num_rows, + lora_B_num_cols, + lora_B_num_shards, + shard_id, + w1_filepath); + } else if (dt == DT_HALF) { + std::cout << "Loading LORA weight " + << lora_layername_substr + "_A.weight" + << ", num_rows: " << lora_A_num_rows + << ", num_cols: " << lora_A_num_cols + << ", num_shards: " << lora_A_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((half *)weight.w0_ptr, + lora_A_num_rows, + lora_A_num_cols, + lora_A_num_shards, + shard_id, + w0_filepath); + std::cout << "Loading LORA weight " + << lora_layername_substr + "_B.weight" + << ", num_rows: " << lora_B_num_rows + << ", num_cols: " << lora_B_num_cols + << ", num_shards: " << lora_B_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((half *)weight.w1_ptr, + lora_B_num_rows, + lora_B_num_cols, + lora_B_num_shards, + shard_id, + w1_filepath); + } else { + assert(false && "Data type not supported"); + } + } else { + // initialize weights + int seed = 0; + init_kernel_wrapper(m, seed); + } + + // allocate space for gradients if the LoRA layer is trainable + if (lora_config.trainable) { + // Ensure we have an optimizer + assert(lora_config.optimizer_config != nullptr && "Optimizer not set"); + assert(typeid(*lora_config.optimizer_config) != + typeid(LoraOptimizerConfig) && + "Optimizer config is not a subclass of LoraOptimizerConfig"); + if (lora->inputs[0]->dims[num_dims - 1].degree == 1) { + // Input is partitioned (no replication) + // w0_grad is local weight gradients + weight.w0_grad_ptr = allocator->allocate_local_weights_untyped( + model_id, w0_num_elements * data_type_size(dt)); + // w1_grad is sync weight gradients + weight.w1_grad_ptr = allocator->allocate_sync_weights_untyped( + model_id, 
w1_num_elements * data_type_size(dt)); + } else { + // Input is replicated + // w0_grad is sync weight gradients + weight.w0_grad_ptr = allocator->allocate_sync_weights_untyped( + model_id, w0_num_elements * data_type_size(dt)); + // w1_grad is local weight gradients + weight.w1_grad_ptr = allocator->allocate_local_weights_untyped( + model_id, w1_num_elements * data_type_size(dt)); + } + // allocate space for v_values if needed by optimizer + if (typeid(*lora_config.optimizer_config) == + typeid(LoraSGDOptimizerConfig)) { + LoraSGDOptimizerConfig const *sgd_config = + static_cast( + lora_config.optimizer_config); + if (sgd_config->momentum > 0.0f) { + if (lora->inputs[0]->dims[num_dims - 1].degree == 1) { + weight.w0_v_values_ptr = allocator->allocate_local_weights_untyped( + model_id, w0_num_elements * data_type_size(dt)); + weight.w1_v_values_ptr = allocator->allocate_sync_weights_untyped( + model_id, w1_num_elements * data_type_size(dt)); + } else { + weight.w0_v_values_ptr = allocator->allocate_sync_weights_untyped( + model_id, w0_num_elements * data_type_size(dt)); + weight.w1_v_values_ptr = allocator->allocate_local_weights_untyped( + model_id, w1_num_elements * data_type_size(dt)); + } + } + } else if (typeid(*lora_config.optimizer_config) == + typeid(LoraAdamOptimizerConfig)) { + assert(false && "Adam optim not yet implemented"); + } else { + assert(false && "Optimizer not supported"); + } + } + assert(m->model_state.find(model_id) == m->model_state.end()); + m->model_state[model_id].weights = weight; + m->model_state[model_id].optimizer_config = lora_config.optimizer_config; + m->model_state[model_id].lora_alpha = lora_config.lora_alpha; + m->model_state[model_id].cache_folder = lora_config.cache_folder; + m->model_state[model_id].peft_model_id = lora_config.peft_model_id; + } + return m; +} + +void LoraLinear::forward(FFModel const &ff) { + assert(false && "LoraLinear does not support normal init"); +} + +FutureMap + LoraLinear::inference(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + assert(batch_inputs.size() == 2); + assert(batch_outputs.size() == 1); + // Assert that the output and the second input are mapped to the same + // region/part + assert(batch_outputs[0]->region == batch_inputs[1]->region); + assert(batch_outputs[0]->part == batch_inputs[1]->part); + // assert(check_output_input_weight_same_machine_view()); + // output is considered as an input to allow in-place optimization + ParallelTensor output_tensor = batch_outputs[0]; + parallel_is = output_tensor->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? 
mv : &output_tensor->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_inference(ff, argmap, output_tensor); + IndexLauncher launcher(LORA_LINEAR_INF_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +void LoraLinear::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + LoraLinearMeta *m = *((LoraLinearMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_tokens() == 0) { + return; + } + assert(regions.size() == 2); + assert(task->regions.size() == regions.size()); + assert(m->input_type[0] == m->output_type[0]); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorRW( + m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); + // int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + // int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + + // int num_infr_tokens = bc->num_active_infr_tokens(); + // int num_peft_tokens = bc->num_active_peft_tokens(); + inference_kernel_wrapper(m, bc, input, output); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + + // get layer name + std::string lora_layername = std::string(m->op_name); + std::string searchString = "lora"; + size_t found = lora_layername.find(searchString); + if (found == std::string::npos) { + std::cout << "LoraLinear layer name not in the right format (does not " + "contain word 'lora')" + << std::endl; + assert(false); + } + std::string lora_layername_substr = + lora_layername.substr(0, found + searchString.length()); + // print layer name + std::cout << "INF " << lora_layername_substr << std::endl; + + // build output filepath + fs::path dst_filepath = get_dst_folder("fwd", m->decoding_step, shard_id); + if (m->layer_guid.model_id > 0) { + assert(false && "Model ID > 0 not supported yet"); + } + std::string layername = "layers." + + std::to_string(m->layer_guid.transformer_layer_id) + + "." 
+ lora_layername_substr; + dst_filepath /= layername; + + // save batch config, if passed + if (bc != nullptr) { + bc->save_to_file(dst_filepath.string() + ".batch_config"); + } + + std::string filename = dst_filepath.string() + ".input_0"; + if (input.data_type == DT_FLOAT) { + save_tensor( + input.get_float_ptr(), input.domain.get_volume(), filename.c_str()); + } else if (input.data_type == DT_HALF) { + save_tensor( + input.get_half_ptr(), input.domain.get_volume(), filename.c_str()); + } else { + assert(false); + } + + int rank, num_tokens; + for (auto it = m->model_state.begin(); it != m->model_state.end(); ++it) { + PEFTModelID peft_model_id = it->first; + LoraLinearWeight weight = m->model_state[peft_model_id].weights; + rank = weight.rank; + num_tokens = input.domain.get_volume() / weight.in_dim; + fs::path dst_filepath_weights = + get_dst_folder("weights", m->decoding_step, shard_id) / layername; + std::string filenameA = + dst_filepath_weights.string() + ".weight_A.original"; + std::string filenameB = + dst_filepath_weights.string() + ".weight_B.original"; + if (m->input_type[0] == DT_FLOAT) { + save_tensor((float *)weight.w0_ptr, + weight.rank * weight.in_dim, + filenameA.c_str()); + save_tensor((float *)weight.w1_ptr, + weight.rank * weight.out_dim, + filenameB.c_str()); + } else if (m->input_type[0] == DT_HALF) { + save_tensor((half *)weight.w0_ptr, + weight.rank * weight.in_dim, + filenameA.c_str()); + save_tensor((half *)weight.w1_ptr, + weight.rank * weight.out_dim, + filenameB.c_str()); + } else { + assert(false && "Data type not supported"); + } + } + + filename = dst_filepath.string() + ".output_0"; + if (output.data_type == DT_FLOAT) { + save_tensor( + output.get_float_ptr(), output.domain.get_volume(), filename.c_str()); + } else if (output.data_type == DT_HALF) { + save_tensor( + output.get_half_ptr(), output.domain.get_volume(), filename.c_str()); + } else { + assert(false); + } + + if (bc->num_active_peft_tokens() > 0) { + // input activation (intermediate) + filename = dst_filepath.string() + ".low_rank_activation"; + if (output.data_type == DT_FLOAT) { + save_tensor((float *)m->low_rank_activation, + rank * num_tokens, + filename.c_str()); + } else if (output.data_type == DT_HALF) { + save_tensor((half *)m->low_rank_activation, + rank * num_tokens, + filename.c_str()); + } else { + assert(false); + } + } + m->decoding_step++; + } +} + +FutureMap LoraLinear::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(batch_inputs.size() == 2); + assert(batch_outputs.size() == 1); + // Assert that the output and the second input are mapped to the same + // region/part + assert(batch_outputs[0]->region == batch_inputs[1]->region); + assert(batch_outputs[0]->part == batch_inputs[1]->part); + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + ParallelTensor output_tensor = batch_outputs[0]; + parallel_is = output_tensor->parallel_is; + MachineView const *view = mv ? 
mv : &output_tensor->machine_view; + set_argumentmap_for_inference(ff, argmap, output_tensor); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(LORA_LINEAR_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[1]->part_grad, + 0 /*projection id*/, + reset_input_grads[1] ? WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region_grad)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +void lora_inference_debugging(LoraLinearMeta *m, + BatchConfig const *bc, + GenericTensorAccessorW input_grad, + GenericTensorAccessorR output_grad, + int shard_id) { + // get layer name + std::string lora_layername = std::string(m->op_name); + std::string searchString = "lora"; + size_t found = lora_layername.find(searchString); + if (found == std::string::npos) { + std::cout << "LoraLinear layer name not in the right format (does not " + "contain word 'lora')" + << std::endl; + assert(false); + } + std::string lora_layername_substr = + lora_layername.substr(0, found + searchString.length()); + // print layer name + std::cout << "BWD " << lora_layername_substr << std::endl; + + // build output filepath + fs::path dst_filepath = get_dst_folder("bwd", m->bwd_step, shard_id); + if (m->layer_guid.model_id > 0) { + assert(false && "Model ID > 0 not supported yet"); + } + std::string layername = "layers." + + std::to_string(m->layer_guid.transformer_layer_id) + + "." 
+ lora_layername_substr; + dst_filepath /= layername; + + // save batch config, if passed + if (bc != nullptr) { + bc->save_to_file(dst_filepath.string() + ".batch_config"); + } + + // weights, weights gradients + fs::path dst_filepath_weights = + get_dst_folder("weights", m->bwd_step, shard_id) / layername; + assert(m->model_state.size() >= 1 && "Model state empty!"); + for (auto it = m->model_state.begin(); it != m->model_state.end(); ++it) { + PEFTModelID peft_model_id = it->first; + LoraLinearWeight weight = m->model_state[peft_model_id].weights; + std::string filename_weight_A = + dst_filepath_weights.string() + ".weight_A.finetuned"; + std::string filename_weight_B = + dst_filepath_weights.string() + ".weight_B.finetuned"; + std::string filename_grad_A = + dst_filepath_weights.string() + ".weight_A.gradient"; + std::string filename_grad_B = + dst_filepath_weights.string() + ".weight_B.gradient"; + if (m->input_type[0] == DT_FLOAT) { + // weight A + save_tensor((float *)weight.w0_ptr, + weight.rank * weight.in_dim, + filename_weight_A.c_str()); + // weight grad A + save_tensor((float *)weight.w0_grad_ptr, + weight.rank * weight.in_dim, + filename_grad_A.c_str()); + // weight B + save_tensor((float *)weight.w1_ptr, + weight.rank * weight.out_dim, + filename_weight_B.c_str()); + // weight grad B + save_tensor((float *)weight.w1_grad_ptr, + weight.rank * weight.out_dim, + filename_grad_B.c_str()); + } else if (m->input_type[0] == DT_HALF) { + // weight A + save_tensor((half *)weight.w0_ptr, + weight.rank * weight.in_dim, + filename_weight_A.c_str()); + // weight grad A + save_tensor((half *)weight.w0_grad_ptr, + weight.rank * weight.in_dim, + filename_grad_A.c_str()); + // weight B + save_tensor((half *)weight.w1_ptr, + weight.rank * weight.out_dim, + filename_weight_B.c_str()); + // weight grad B + save_tensor((half *)weight.w1_grad_ptr, + weight.rank * weight.out_dim, + filename_grad_B.c_str()); + } else { + assert(false && "Data type not supported"); + } + } + + std::string filename = dst_filepath.string() + ".input_gradient_0"; + if (input_grad.data_type == DT_FLOAT) { + save_tensor(input_grad.get_float_ptr(), + input_grad.domain.get_volume(), + filename.c_str()); + } else if (input_grad.data_type == DT_HALF) { + save_tensor(input_grad.get_half_ptr(), + input_grad.domain.get_volume(), + filename.c_str()); + } else { + assert(false); + } + + filename = dst_filepath.string() + ".output_gradient_0"; + if (output_grad.data_type == DT_FLOAT) { + save_tensor(output_grad.get_float_ptr(), + output_grad.domain.get_volume(), + filename.c_str()); + } else if (output_grad.data_type == DT_HALF) { + save_tensor(output_grad.get_half_ptr(), + output_grad.domain.get_volume(), + filename.c_str()); + } else { + assert(false); + } + m->bwd_step++; +} + +template +void save_peft_to_file(DT const *weight_ptr, + size_t size, + std::string filepath) { + std::ofstream out(filepath, std::ios::binary); + // Check if the file was opened successfully + if (!out || !out.is_open() || !out.good()) { + printf("Could not open file: %s\n", filepath.c_str()); + } + assert(out && out.is_open() && out.good() && + "can't write to lora weight file path"); + std::vector
host_array(size); + copy_tensor_dev_to_host(weight_ptr, host_array.data(), size); + + size_t target_data_size = sizeof(DT) * size; + out.write((char *)host_array.data(), target_data_size); + + size_t out_written_size = out.tellp(); + if (out_written_size != target_data_size) { + printf("save weight data error: %lu, %lu, %lu\n", + out_written_size, + target_data_size, + sizeof(DT)); + assert(false); + } + out.close(); +} + +void save_peft_weights_if_needed(LoraLinearMeta *m, + BatchConfig const *bc, + int in_dim, + int out_dim, + int shard_id) { + std::string lora_layername = std::string(m->op_name); + std::string searchString = "lora"; + size_t found = lora_layername.find(searchString); + if (found == std::string::npos) { + std::cout << "LoraLinear layer name not in the right format (does not " + "contain word 'lora')" + << std::endl; + assert(false); + } + std::string lora_layername_substr = + lora_layername.substr(0, found + searchString.length()); + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + // Skip PEFT forward-only requests + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + if (bc->requestsInfo[i].optimizer_tasks.save_updated_weights) { + assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != + m->model_state.end()); + std::string weight_export_folder = join_path({ + m->model_state[bc->requestsInfo[i].peft_model_id].cache_folder, + "finetuned_models", + m->model_state[bc->requestsInfo[i].peft_model_id].peft_model_id, + "weights", + "shard_" + std::to_string(shard_id), + }); + fs::create_directories(weight_export_folder); + + int rank = m->model_state[bc->requestsInfo[i].peft_model_id].weights.rank; + int w0_num_elements = rank * in_dim; + int w1_num_elements = rank * out_dim; + std::string w0_filepath = join_path( + {weight_export_folder, lora_layername_substr + "_A.weight"}); + std::string w1_filepath = join_path( + {weight_export_folder, lora_layername_substr + "_B.weight"}); + if (m->input_type[0] == DT_FLOAT) { + save_peft_to_file( + (float *)m->model_state[bc->requestsInfo[i].peft_model_id] + .weights.w0_ptr, + w0_num_elements, + w0_filepath); + if (shard_id == 0) { + save_peft_to_file( + (float *)m->model_state[bc->requestsInfo[i].peft_model_id] + .weights.w1_ptr, + w1_num_elements, + w1_filepath); + } + } else if (m->input_type[0] == DT_HALF) { + save_peft_to_file( + (half *)m->model_state[bc->requestsInfo[i].peft_model_id] + .weights.w0_ptr, + w0_num_elements, + w0_filepath); + if (shard_id == 0) { + save_peft_to_file( + (half *)m->model_state[bc->requestsInfo[i].peft_model_id] + .weights.w1_ptr, + w1_num_elements, + w1_filepath); + } + } else { + assert(false && "Data type not supported"); + } + } + } +} + +void LoraLinear::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + Domain input_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + LoraLinearMeta *m = *((LoraLinearMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + assert(regions.size() == 2); + assert(task->regions.size() == regions.size()); + assert(m->input_type[0] == m->output_type[0]); + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + + GenericTensorAccessorW input_grad = 
helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + // int num_infr_tokens = bc->num_active_infr_tokens(); + // int num_peft_tokens = bc->num_active_peft_tokens(); + peft_bwd_kernel_wrapper(m, bc, input_grad, output_grad); + + save_peft_weights_if_needed(m, bc, in_dim, out_dim, shard_id); + + if (m->inference_debugging) { + lora_inference_debugging(m, bc, input_grad, output_grad, shard_id); + } +} + +void LoraLinear::backward(FFModel const &ff) { + assert(false && "LoraLinear does not support normal backward"); +} + +void LoraLinear::print_layer(FFModel const &ff) {} + +void LoraLinear::map_output_tensors(FFModel &ff) { + assert(numOutputs == 1); + assert(numInputs == 2); + assert(outputs[0]->get_volume() == inputs[1]->get_volume()); + outputs[0]->parallel_is = inputs[1]->parallel_is; + outputs[0]->region = inputs[1]->region; + outputs[0]->part = inputs[1]->part; + outputs[0]->region_grad = inputs[1]->region_grad; + outputs[0]->part_grad = inputs[1]->part_grad; +} + +bool LoraLinear::measure_operator_cost(Simulator *sim, + MachineView const &mv, + CostMetrics &cost_metrics) const { + return false; +} + +bool operator==(LoraLinearParams const &lhs, LoraLinearParams const &rhs) { + if (lhs.layer_guid == rhs.layer_guid && lhs.type == rhs.type && + lhs.peft_configs.size() == rhs.peft_configs.size()) { + for (auto const &kv : lhs.peft_configs) { + auto it = rhs.peft_configs.find(kv.first); + if (it == rhs.peft_configs.end() || !(it->second == kv.second)) { + return false; + } + } + return true; + } + return false; +} + +fs::path create_unique_temp_directory() { + std::srand(static_cast(std::time(nullptr))); + + fs::path temp_dir = fs::temp_directory_path(); + fs::path unique_path; + + do { + std::string unique_name = "flexflow_tmp_" + std::to_string(std::rand()); + unique_path = temp_dir / unique_name; + } while (fs::exists(unique_path)); + + fs::create_directory(unique_path); + return unique_path; +} + +void serialize_string(Legion::Serializer &sez, + std::string string_to_serialize) { + sez.serialize(string_to_serialize.length()); + sez.serialize(string_to_serialize.c_str(), string_to_serialize.length()); +} + +std::string deserialize_string(Legion::Deserializer &dez) { + size_t string_size; + char buffer[4096] = {0}; + dez.deserialize(string_size); + dez.deserialize(buffer, string_size); + return std::string(buffer); +} + +void LoraLinear::serialize(Legion::Serializer &sez) const { + sez.serialize(this->layer_guid.id); + sez.serialize(this->layer_guid.transformer_layer_id); + sez.serialize(this->layer_guid.model_id); + sez.serialize(this->op_type); + sez.serialize(this->peft_configs.size()); + for (auto const &kv : this->peft_configs) { + // Serialize PEFTModelID + sez.serialize(kv.first.id); + + // Serialize LoraLinearConfig and OptimizerConfig to tmp folder + // 1. Create tmp dir and serialize it + fs::path unique_temp_dir = create_unique_temp_directory(); + serialize_string(sez, unique_temp_dir.string()); + // 2. 
Dump LoraLinearConfig to json file in tmp dir + std::string lora_config_filename = std::string("lora_linear_config_") + + std::to_string(kv.first.id) + + std::string(".json"); + fs::path lora_config_json_filepath = unique_temp_dir / lora_config_filename; + serialize_to_json_file(kv.second, lora_config_json_filepath); + // 3. Dump optimizer to json file in tmp dir, and serialize optimizer type + std::string optimizer_filename = std::string("optimizer_config_") + + std::to_string(kv.first.id) + + std::string(".json"); + fs::path optim_config_filepath = unique_temp_dir / optimizer_filename; + assert((kv.second.trainable) == (kv.second.optimizer_config != nullptr)); + if (kv.second.trainable) { + if (typeid(*kv.second.optimizer_config) == + typeid(LoraSGDOptimizerConfig)) { + sez.serialize(OPTIMIZER_TYPE_SGD); + LoraSGDOptimizerConfig const *sgd_config = + static_cast( + kv.second.optimizer_config); + serialize_to_json_file(*sgd_config, optim_config_filepath); + } else if (typeid(*kv.second.optimizer_config) == + typeid(LoraAdamOptimizerConfig)) { + sez.serialize(OPTIMIZER_TYPE_ADAM); + LoraAdamOptimizerConfig const *adam_config = + static_cast( + kv.second.optimizer_config); + serialize_to_json_file(*adam_config, optim_config_filepath); + } else { + assert(false && "Optimizer type not yet supported"); + } + } + } + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); +} + +/* static */ +using PCG::Node; +Node LoraLinear::deserialize(FFModel &ff, + Legion::Deserializer &dez, + ParallelTensor inputs[], + int num_inputs) { + assert(num_inputs == 2); + size_t id, transformer_layer_id, deserialized_model_id; + OperatorType op_type; + size_t num_pefts; + size_t name_len; + char name[MAX_OPNAME] = {0}; + + LoraLinearParams params; + + dez.deserialize(id); + dez.deserialize(transformer_layer_id); + dez.deserialize(deserialized_model_id); + dez.deserialize(op_type); + dez.deserialize(num_pefts); + for (int i = 0; i < num_pefts; i++) { + // Deserialize PEFTModelID + size_t pid; + dez.deserialize(pid); + PEFTModelID peft_model_id(pid); + // Deserialize tmp folder containing LoraLinearConfig and optimizer config + fs::path unique_temp_dir = fs::path(deserialize_string(dez)); + // 1. Deserialize LoraLinearConfig + std::string lora_config_filename = std::string("lora_linear_config_") + + std::to_string(pid) + + std::string(".json"); + fs::path lora_config_json_filepath = unique_temp_dir / lora_config_filename; + std::unique_ptr lora_linear_config = + deserialize_from_json_file(lora_config_json_filepath); + // 2. 
Deserialize optimizer if needed + if (lora_linear_config->trainable) { + std::string optimizer_filename = std::string("optimizer_config_") + + std::to_string(pid) + + std::string(".json"); + fs::path optim_config_filepath = unique_temp_dir / optimizer_filename; + OptimizerType type_; + dez.deserialize(type_); + if (type_ == OPTIMIZER_TYPE_SGD) { + std::unique_ptr sgd_optimizer_config = + deserialize_from_json_file( + optim_config_filepath); + lora_linear_config->optimizer_config = + dynamic_cast(sgd_optimizer_config.release()); + } else if (type_ == OPTIMIZER_TYPE_ADAM) { + std::unique_ptr adam_optimizer_config = + deserialize_from_json_file( + optim_config_filepath); + lora_linear_config->optimizer_config = + dynamic_cast( + adam_optimizer_config.release()); + } else { + printf("Optimizer type: %d\n", type_); + assert(false && "Optimizer type not yet supported"); + } + } + try { + fs::remove_all(unique_temp_dir); + } catch (fs::filesystem_error const &e) { + std::cerr << "Error removing tmp directory: " << e.what() << std::endl; + } + params.peft_configs.emplace( + std::make_pair(peft_model_id, *lora_linear_config)); + } + dez.deserialize(name_len); + dez.deserialize(name, name_len); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); + + params.layer_guid = layer_guid; + params.type = op_type; + strcpy(params.name, name); + return ff.get_or_create_node({inputs[0], inputs[1]}, params); +} + +Op *LoraLinear::materialize(FFModel &ff, + ParallelTensor inputs[], + int num_inputs) const { + LoraLinearParams params = get_params(); + return new LoraLinear(ff, params, {inputs[0], inputs[1]}, this->name); +} + +LoraLinearParams LoraLinear::get_params() const { + LoraLinearParams params; + params.layer_guid = this->layer_guid; + params.type = this->op_type; + if (strlen(this->name) < MAX_OPNAME) { + strcpy(params.name, this->name); + } + params.peft_configs = this->peft_configs; + return params; +} + +bool LoraLinearParams::is_valid( + std::pair const &input_shape) + const { + return true; +} + +}; // namespace FlexFlow + +namespace std { +size_t hash::operator()( + FlexFlow::LoraLinearParams const ¶ms) const { + size_t key = 0; + hash_combine(key, params.layer_guid.id); + hash_combine(key, params.layer_guid.transformer_layer_id); + hash_combine(key, params.layer_guid.model_id); + for (auto const &kv : params.peft_configs) { + hash_combine(key, kv.first.id); + hash_combine(key, kv.second.rank); + hash_combine(key, kv.second.trainable); + hash_combine(key, kv.second.cache_folder); + hash_combine(key, kv.second.peft_model_id); + hash_combine(key, kv.second.lora_alpha); + hash_combine(key, kv.second.lora_dropout); + hash_combine(key, kv.second.target_modules); + hash_combine(key, kv.second.init_lora_weights); + } + return key; +} +}; // namespace std diff --git a/src/ops/lora_linear_params.cc b/src/ops/lora_linear_params.cc new file mode 100644 index 0000000000..6e0c60e057 --- /dev/null +++ b/src/ops/lora_linear_params.cc @@ -0,0 +1,221 @@ +#include "flexflow/ops/lora_linear_params.h" +#include +#include +#include +using json = nlohmann::json; + +namespace FlexFlow { + +// ---------------- Optimizer configs ---------------- +// --------------------------------------------------- + +// empty optimizer +LoraOptimizerConfig::LoraOptimizerConfig() {} + +// SGD optimizer +LoraSGDOptimizerConfig::LoraSGDOptimizerConfig() + : lr(0.001f), momentum(0.0f), nesterov(false), weight_decay(0.0f) {} + +LoraSGDOptimizerConfig::LoraSGDOptimizerConfig(double lr_, + double momentum_, + bool nesterov_, 
+ bool weight_decay_) + : lr(lr_), momentum(momentum_), nesterov(nesterov_), + weight_decay(weight_decay_) {} + +std::ostream &operator<<(std::ostream &os, LoraSGDOptimizerConfig const &llc) { + os << "SGD Optimizer (lr=" << llc.lr << ",momentum=" << llc.momentum + << ",nesterov=" << llc.nesterov << ",weight_decay=" << llc.weight_decay + << ")"; + return os; +} + +// Adam optimizer +LoraAdamOptimizerConfig::LoraAdamOptimizerConfig() + : alpha(0.001f), beta1(0.9f), beta2(0.999f), weight_decay(0.0f), + epsilon(1e-8) {} + +LoraAdamOptimizerConfig::LoraAdamOptimizerConfig(double alpha_, + double beta1_, + double beta2_, + double weight_decay_, + double epsilon_) + : alpha(alpha_), beta1(beta1_), beta2(beta2_), weight_decay(weight_decay_), + epsilon(epsilon_) {} + +std::ostream &operator<<(std::ostream &os, LoraAdamOptimizerConfig const &llc) { + os << "SGD Optimizer (alpha=" << llc.alpha << ",beta1=" << llc.beta1 + << ",beta2=" << llc.beta2 << ",weight_decay=" << llc.weight_decay + << ",epsilon=" << llc.epsilon << ")"; + return os; +} + +// Serialization helpers +template +void serialize_to_json_file(T const &obj, fs::path const &filepath) { + json j = obj; + std::ofstream file(filepath); + file << j.dump(4); +} + +template +std::unique_ptr deserialize_from_json_file(fs::path const &filepath) { + std::ifstream file(filepath); + json j; + file >> j; + return std::make_unique(j.get()); +} + +template void + serialize_to_json_file(LoraLinearConfig const &obj, + fs::path const &filepath); +template void serialize_to_json_file( + LoraSGDOptimizerConfig const &obj, fs::path const &filepath); +template void serialize_to_json_file( + LoraAdamOptimizerConfig const &obj, fs::path const &filepath); +template std::unique_ptr + deserialize_from_json_file(fs::path const &filepath); +template std::unique_ptr + deserialize_from_json_file( + fs::path const &filepath); +template std::unique_ptr + deserialize_from_json_file( + fs::path const &filepath); + +// ------------------ LoRA configs ------------------- +// --------------------------------------------------- +const LoraLinearConfig LoraLinearConfig::EmptyConfig = LoraLinearConfig("", ""); + +LoraLinearConfig::LoraLinearConfig( + std::string const &cache_folder_, + std::string const &peft_model_id_, + bool trainable_, + LoraOptimizerConfig *optimizer_config_, + bool init_lora_weights_, + std::string const &base_model_name_or_path_, + std::string const &precision_, + int rank_, + float lora_alpha_, + float lora_dropout_, + std::vector const &target_modules_) + : cache_folder(cache_folder_), peft_model_id(peft_model_id_), rank(rank_), + lora_alpha(lora_alpha_), lora_dropout(lora_dropout_), + trainable(trainable_), optimizer_config(optimizer_config_), + init_lora_weights(init_lora_weights_), + base_model_name_or_path(base_model_name_or_path_), precision(precision_), + target_modules(target_modules_) { + + if (peft_model_id.empty()) { + return; + } + assert(!cache_folder.empty() && + "cache_folder must be provided when using PEFT"); + if (trainable) { + assert(optimizer_config != nullptr && + "optimizer_config must be provided when using PEFT"); + assert( + !base_model_name_or_path.empty() && + "base_model_name_or_path must be provided when training a PEFT model"); + assert(!precision.empty() && + "precision must be provided when training a PEFT model"); + } else { + assert(init_lora_weights == false && + "init_lora_weights must be false when LORA not trainable"); + assert(optimizer_config == nullptr && + "optimizer_config must be nullptr when not trainable"); 
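+    // note: in the non-trainable (inference-only) path the adapter weights
+    // are expected to come from an existing PEFT repository, so no optimizer
+    // state is attached and the HF-style config.json is loaded below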
+ } + // if we are not initializing LORA from scratch, load the configs from + // existing repository + if (!init_lora_weights) { + std::string peft_inference_config_file_path = + join_path({cache_folder, "configs", peft_model_id, "config.json"}); + std::ifstream config_file(peft_inference_config_file_path); + if (config_file.is_open()) { + try { + json model_config; + config_file >> model_config; + rank = model_config["r"]; + lora_alpha = float(model_config["lora_alpha"]); + lora_dropout = model_config["lora_dropout"]; + for (auto &s : model_config["target_modules"]) { + target_modules.push_back(s); + } + // do not load the base_model_name_or_path from the HF config because we + // may be applying LoRA to another model + } catch (json::exception const &e) { + std::cerr << "Error parsing PEFT config from JSON file: " << e.what() + << std::endl; + assert(false); + } + } else { + std::cerr << "Error opening JSON file " << peft_inference_config_file_path + << std::endl; + assert(false); + } + } + assert(rank > 0 && "rank must be greater than 0"); + assert(lora_alpha > 0.0f && "lora_alpha must be greater than 0.0"); + assert(lora_dropout >= 0.0f && lora_dropout <= 1.0f && + "lora_dropout must be in [0.0, 1.0]"); + assert(target_modules.size() > 0 && "target_modules must not be left empty"); +} + +// constructor used to support unordered_map +LoraLinearConfig::LoraLinearConfig() : LoraLinearConfig("", "") {} + +bool operator==(LoraLinearConfig const &lhs, LoraLinearConfig const &rhs) { + if (lhs.cache_folder == rhs.cache_folder && + lhs.peft_model_id == rhs.peft_model_id && lhs.rank == rhs.rank && + lhs.lora_alpha == rhs.lora_alpha && + lhs.lora_dropout == rhs.lora_dropout && + lhs.target_modules.size() == rhs.target_modules.size() && + lhs.trainable == rhs.trainable && + lhs.init_lora_weights == rhs.init_lora_weights && + lhs.optimizer_config == rhs.optimizer_config && + lhs.base_model_name_or_path == rhs.base_model_name_or_path && + lhs.precision == rhs.precision) { + for (int i = 0; i < lhs.target_modules.size(); i++) { + if (lhs.target_modules[i] != rhs.target_modules[i]) { + return false; + } + } + return true; + } + return false; +} + +std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc) { + os << "LoraLinearConfig: "; + os << "cache_folder: " << llc.cache_folder << ", "; + os << "peft_model_id: " << llc.peft_model_id << ", "; + os << "rank: " << llc.rank << ", "; + os << "lora_alpha: " << llc.lora_alpha << ", "; + os << "lora_dropout: " << llc.lora_dropout << ", "; + os << "target_modules: ["; + for (int i = 0; i < llc.target_modules.size(); i++) { + os << llc.target_modules[i]; + if (i < llc.target_modules.size() - 1) { + os << ", "; + } + } + os << "], "; + os << "trainable: " << llc.trainable << ", "; + if (llc.optimizer_config != nullptr) { + os << "optimizer_config: "; + if (typeid(*llc.optimizer_config) == typeid(LoraSGDOptimizerConfig)) { + os << *static_cast(llc.optimizer_config); + } else if (typeid(*llc.optimizer_config) == + typeid(LoraAdamOptimizerConfig)) { + os << *static_cast(llc.optimizer_config); + } else { + os << "Unknown optimizer config type"; + } + std::cout << std::endl; + } + os << "init_lora_weights: " << llc.init_lora_weights << std::endl; + os << "base_model_name_or_path: " << llc.base_model_name_or_path << std::endl; + os << "precision: " << llc.precision << std::endl; + return os; +} + +}; // namespace FlexFlow diff --git a/src/ops/mean.cc b/src/ops/mean.cc index b2ec94fdf8..0d41276735 100644 --- a/src/ops/mean.cc +++ b/src/ops/mean.cc @@ 
-87,8 +87,7 @@ OpMeta *Mean::init_task(Task const *task, Context ctx, Runtime *runtime) { FFHandler handler = *((FFHandler const *)task->local_args); - OpMeta *m = new OpMeta(handler); - return m; + return nullptr; } void Mean::forward(FFModel const &ff) {} diff --git a/src/ops/noop.cc b/src/ops/noop.cc index da2d4922e3..45bd76d59d 100644 --- a/src/ops/noop.cc +++ b/src/ops/noop.cc @@ -90,8 +90,9 @@ OpMeta *NoOp::init_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { + NoOp *no_op = (NoOp *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - OpMeta *m = new OpMeta(handle); + OpMeta *m = new OpMeta(handle, no_op); return m; } @@ -167,7 +168,7 @@ void NoOp::init_inference(FFModel const &ff, set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); IndexLauncher launcher(NOOP_INIT_TASK_ID, parallel_is, - TaskArgument(NULL, 0), + TaskArgument(this, sizeof(NoOp)), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -244,7 +245,7 @@ void NoOp::init(FFModel const &ff) { set_argumentmap_for_init(ff, argmap); IndexLauncher launcher(NOOP_INIT_TASK_ID, parallel_is, - TaskArgument(NULL, 0), + TaskArgument(this, sizeof(NoOp)), argmap, Predicate::TRUE_PRED, false /*must*/, diff --git a/src/ops/pool_2d.cc b/src/ops/pool_2d.cc index 4621ab5909..c8b194afa9 100644 --- a/src/ops/pool_2d.cc +++ b/src/ops/pool_2d.cc @@ -315,7 +315,7 @@ OpMeta *Pool2D::init_task(Task const *task, assert(task->regions.size() == 2); Pool2D const *pool = (Pool2D *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - Pool2DMeta *m = new Pool2DMeta(handle); + Pool2DMeta *m = new Pool2DMeta(handle, pool); m->profiling = pool->profiling; m->inference_debugging = pool->inference_debugging; std::strcpy(m->op_name, pool->name); @@ -545,7 +545,7 @@ bool Pool2D::measure_operator_cost(Simulator *sim, int output_n = sub_output.dims[3].size; int pad_h = ((output_h - 1) * stride_h + kernel_h - input_h + 1) / 2; int pad_w = ((output_w - 1) * stride_w + kernel_w - input_w + 1) / 2; - Pool2DMeta *m = sim->pool2d_meta; + Pool2DMeta *m = new Pool2DMeta(sim->handler, this); init_kernel(m, input_w, diff --git a/src/ops/reduce.cc b/src/ops/reduce.cc index 454a35caf4..1c0566e9ca 100644 --- a/src/ops/reduce.cc +++ b/src/ops/reduce.cc @@ -41,7 +41,7 @@ ReduceParams Reduce::get_params() const { } params.keepdims = keepdims; params.layer_guid = this->layer_guid; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; diff --git a/src/ops/reduce.cpp b/src/ops/reduce.cpp index c062955ed6..fe122b13eb 100644 --- a/src/ops/reduce.cpp +++ b/src/ops/reduce.cpp @@ -25,7 +25,7 @@ using Legion::Domain; ReduceMeta::ReduceMeta(FFHandler handler, Reduce const *rd, Domain const &input_domain) - : OpMeta(handler) { + : OpMeta(handler, rd) { checkCUDNN(miopenCreateReduceTensorDescriptor(&reduceDesc)); checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); diff --git a/src/ops/reduce.cu b/src/ops/reduce.cu index 65efd90e9b..1352787a12 100644 --- a/src/ops/reduce.cu +++ b/src/ops/reduce.cu @@ -24,7 +24,7 @@ using Legion::Domain; ReduceMeta::ReduceMeta(FFHandler handler, Reduce const *rd, Domain const &input_domain) - : OpMeta(handler) { + : OpMeta(handler, rd) { checkCUDNN(cudnnCreateReduceTensorDescriptor(&reduceDesc)); checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); diff --git a/src/ops/reshape.cc 
b/src/ops/reshape.cc index 49f99e2cb5..4e7fd2eb96 100644 --- a/src/ops/reshape.cc +++ b/src/ops/reshape.cc @@ -180,7 +180,7 @@ OpMeta *Reshape::init_task(Task const *task, Runtime *runtime) { Reshape const *reshape = (Reshape *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - ReshapeMeta *m = new ReshapeMeta(handle); + ReshapeMeta *m = new ReshapeMeta(handle, reshape); std::strcpy(m->op_name, reshape->name); m->layer_guid = reshape->layer_guid; m->data_type = reshape->outputs[0]->data_type; @@ -296,7 +296,7 @@ ReshapeParams Reshape::get_params() const { ReshapeParams params; params.shape = shape_vec; params.layer_guid = this->layer_guid; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index 8dd670eea3..2a30d12d6d 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -44,7 +44,8 @@ bool operator==(ResidualLayerNormParams const &lhs, return lhs.layer_guid == rhs.layer_guid && lhs.axes == rhs.axes && lhs.elementwise_affine == rhs.elementwise_affine && lhs.use_bias == rhs.use_bias && - lhs.use_two_residuals == rhs.use_two_residuals; + lhs.use_two_residuals == rhs.use_two_residuals && + lhs.inplace_residual == rhs.inplace_residual; } bool ResidualLayerNormParams::is_valid( @@ -63,7 +64,8 @@ ResidualLayerNormParams ResidualLayerNorm::get_params() const { params.eps = this->eps; params.use_bias = this->use_bias; params.use_two_residuals = this->use_two_residuals; - if (this->name != nullptr) { + params.inplace_residual = this->inplace_residual; + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -78,6 +80,7 @@ void FFModel::residual_layer_norm(const Tensor input, bool elementwise_affine, float eps, bool use_bias, + bool inplace_residual, DataType data_type, char const *name) { // In PyTorch, axes must be the sizes of the last axes.size() dimensions of @@ -117,7 +120,6 @@ void FFModel::residual_layer_norm(const Tensor input, } int num_weights = elementwise_affine ? (use_bias ? 2 : 1) : 0; - Layer *ln = nullptr; Tensor casted_input = (data_type != input->data_type) ? cast(input, data_type, "type cast for residual_layer_norm") @@ -133,20 +135,20 @@ void FFModel::residual_layer_norm(const Tensor input, ? 
cast(residual2, data_type, "type cast for residual2_layer_norm") : residual2; } - ln = new Layer(this, - OP_RESIDUAL_LAYERNORM, - data_type, - name, - 2 + use_two_residuals /*inputs*/, - num_weights, - 2 /*outputs*/, - casted_input, - casted_residual1, - casted_residual2); + Layer *ln = new Layer(this, + OP_RESIDUAL_LAYERNORM, + data_type, + name, + 2 + use_two_residuals /*inputs*/, + num_weights, + 2 /*outputs*/, + casted_input, + casted_residual1, + casted_residual2); ln->outputs[0] = create_tensor_legion_ordering( - input->num_dims, input->dims, data_type, ln, 0, false /*create_grad*/); + input->num_dims, input->dims, data_type, ln, 0, true /*create_grad*/); ln->outputs[1] = create_tensor_legion_ordering( - input->num_dims, input->dims, data_type, ln, 1, false /*create_grad*/); + input->num_dims, input->dims, data_type, ln, 1, true /*create_grad*/); { int numdims = axes.size(); int dims[numdims]; @@ -179,6 +181,7 @@ void FFModel::residual_layer_norm(const Tensor input, ln->add_int_vector_property("axes", axes); ln->add_float_property("eps", eps); ln->add_int_property("use_two_residuals", use_two_residuals); + ln->add_int_property("inplace_residual", inplace_residual); layers.push_back(ln); outputs[0] = ln->outputs[0]; outputs[1] = ln->outputs[1]; @@ -199,6 +202,9 @@ Op *ResidualLayerNorm::create_operator_from_layer( layer->get_float_property("eps", eps); layer->get_int_property("use_two_residuals", value); bool use_two_residuals = (bool)value; + layer->get_int_property("inplace_residual", value); + bool inplace_residual = (bool)value; + return new ResidualLayerNorm(model, layer->layer_guid, inputs[0], @@ -209,6 +215,7 @@ Op *ResidualLayerNorm::create_operator_from_layer( elementwise_affine, use_bias, eps, + inplace_residual, false, // allocate_weights layer->name); } @@ -230,6 +237,7 @@ ResidualLayerNorm::ResidualLayerNorm( params.elementwise_affine, params.use_bias, params.eps, + params.inplace_residual, allocate_weights, params.name) {} @@ -243,6 +251,7 @@ ResidualLayerNorm::ResidualLayerNorm(FFModel &model, bool _elementwise_affine, bool _use_bias, float _eps, + bool _inplace_residual, bool allocate_weights, char const *name) : Op(model, @@ -256,7 +265,8 @@ ResidualLayerNorm::ResidualLayerNorm(FFModel &model, _residual1, _use_two_residuals ? 
_residual2 : nullptr), elementwise_affine(_elementwise_affine), eps(_eps), axes(_axes), - use_bias(_use_bias), use_two_residuals(_use_two_residuals) { + use_bias(_use_bias), use_two_residuals(_use_two_residuals), + inplace_residual(_inplace_residual) { // overwrite layer_guid layer_guid = _layer_guid; outputs[0] = model.create_parallel_tensor_legion_ordering( @@ -326,6 +336,22 @@ ResidualLayerNorm::ResidualLayerNorm(FFModel &model, } } +void ResidualLayerNorm::map_output_tensors(FFModel &ff) { + assert(numOutputs == 2); + assert(outputs[0]->get_volume() == inputs[0]->get_volume()); + if (inplace_residual) { + outputs[0]->parallel_is = inputs[0]->parallel_is; + outputs[0]->region = inputs[0]->region; + outputs[0]->part = inputs[0]->part; + outputs[0]->region_grad = inputs[0]->region_grad; + outputs[0]->part_grad = inputs[0]->part_grad; + // map output 1 to new region + ff.map_tensor(outputs[1], this); + } else { + Op::map_output_tensors(ff); + } +} + void ResidualLayerNorm::init_inference( FFModel const &ff, std::vector const &batch_inputs, @@ -347,13 +373,19 @@ void ResidualLayerNorm::init_inference( false /*must*/, 0 /*mapper_id*/, machine_view_hash); + if (inplace_residual) { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } int field_id = 0; // input - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); + // added: input + residual(s) + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? READ_WRITE : READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); launcher.add_field(field_id++, FID_DATA); // residual1 launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, @@ -371,13 +403,15 @@ void ResidualLayerNorm::init_inference( batch_inputs[2]->region)); launcher.add_field(field_id++, FID_DATA); } - // added: input + residual(s) - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(field_id++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(field_id++, FID_DATA); + } // layer norm output launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, @@ -422,13 +456,17 @@ void ResidualLayerNorm::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + assert(outputs[0]->part == inputs[0]->part); + assert(outputs[0]->region == inputs[0]->region); int field_id = 0; // input - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); + // added: input + residual(s) + launcher.add_region_requirement( + RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? 
READ_WRITE : READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); launcher.add_field(field_id++, FID_DATA); // residual1 launcher.add_region_requirement(RegionRequirement(inputs[1]->part, @@ -439,20 +477,21 @@ void ResidualLayerNorm::init(FFModel const &ff) { launcher.add_field(field_id++, FID_DATA); // residual2 if (use_two_residuals) { - launcher.add_region_requirement(RegionRequirement(inputs[1]->part, + launcher.add_region_requirement(RegionRequirement(inputs[2]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - inputs[1]->region)); + inputs[2]->region)); + launcher.add_field(field_id++, FID_DATA); + } + if (!inplace_residual) { + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); launcher.add_field(field_id++, FID_DATA); } - // added: input + residual(s) - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(field_id++, FID_DATA); // layer norm output launcher.add_region_requirement(RegionRequirement(outputs[1]->part, 0 /*projection id*/, @@ -516,7 +555,323 @@ void ResidualLayerNorm::forward(FFModel const &ff) { } void ResidualLayerNorm::backward(FFModel const &ff) { - assert(false); + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_backward(ff, argmap); + IndexLauncher launcher(RESIDUAL_LAYERNORM_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + int field_id = 0; + // output_grad + launcher.add_region_requirement(RegionRequirement(outputs[1]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // added output + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(field_id++, FID_DATA); + // input grad + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // residual grad 1 + launcher.add_region_requirement(RegionRequirement(inputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + if (use_two_residuals) { + // residual grad 2 + launcher.add_region_requirement(RegionRequirement(inputs[2]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[2]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + } + if (elementwise_affine) { + // gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(field_id++, FID_DATA); + // gamma_grad + launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[0]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + if (use_bias) { + // beta_grad + launcher.add_region_requirement( + RegionRequirement(weights[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + } + } + runtime->execute_index_space(ctx, launcher); +} + +void ResidualLayerNorm::backward_task( + Task const *task, 
+ std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == regions.size()); + ResidualLayerNormMeta const *m = + *((ResidualLayerNormMeta **)task->local_args); + assert(regions.size() == + 4 + m->use_two_residuals + + (m->elementwise_affine ? (m->use_bias ? 3 : 2) : 0)); + + int region_idx = 0, task_region_idx = 0; + + GenericTensorAccessorR output_grad = + helperGetGenericTensorAccessorRO(m->output_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR added_output = + helperGetGenericTensorAccessorRO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW input_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual1_grad = + helperGetGenericTensorAccessorRW(m->input_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual2_grad; + if (m->use_two_residuals) { + residual2_grad = + helperGetGenericTensorAccessorRW(m->input_type[2], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + GenericTensorAccessorR gamma; + GenericTensorAccessorW gamma_grad, beta_grad; + if (m->elementwise_affine) { + assert(m->use_bias == (regions.size() == 6)); + gamma = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + gamma_grad = + helperGetGenericTensorAccessorRW(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + if (m->use_bias) { + beta_grad = + helperGetGenericTensorAccessorRW(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + } + ResidualLayerNorm::backward_kernel_wrapper(m, + output_grad, + added_output, + input_grad, + residual1_grad, + residual2_grad, + gamma, + gamma_grad, + beta_grad); +} + +Legion::FutureMap ResidualLayerNorm::peft_bwd( + FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + int field_id = 0; + // output_grad + launcher.add_region_requirement( + RegionRequirement(batch_outputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // input grad + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + reset_input_grads[0] ? 
WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // residual grad 1 + launcher.add_region_requirement( + RegionRequirement(batch_inputs[1]->part_grad, + 0 /*projection id*/, + reset_input_grads[1] ? WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + if (use_two_residuals) { + // residual grad 2 + launcher.add_region_requirement( + RegionRequirement(batch_inputs[2]->part_grad, + 0 /*projection id*/, + reset_input_grads[2] ? WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[2]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + } + if (elementwise_affine) { + // gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(field_id++, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} + +void ResidualLayerNorm::peft_bwd_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + assert(task->regions.size() == regions.size()); + ResidualLayerNormMeta *m = *((ResidualLayerNormMeta **)task->local_args); + assert(regions.size() == 3 + m->use_two_residuals + m->elementwise_affine); + + int region_idx = 0, task_region_idx = 0; + + GenericTensorAccessorR output_grad = + helperGetGenericTensorAccessorRO(m->output_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW input_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual1_grad = + helperGetGenericTensorAccessorRW(m->input_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual2_grad; + if (m->use_two_residuals) { + GenericTensorAccessorW residual2_grad = + helperGetGenericTensorAccessorRW(m->input_type[2], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + GenericTensorAccessorR gamma; + if (m->elementwise_affine) { + gamma = helperGetGenericTensorAccessorRO(m->weight_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + ResidualLayerNorm::peft_bwd_kernel_wrapper( + m, output_grad, input_grad, residual1_grad, residual2_grad, gamma); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + std::vector input_accessors; + input_accessors.push_back(input_grad); + input_accessors.push_back(residual1_grad); + if (m->use_two_residuals) { + input_accessors.push_back(residual2_grad); + } + std::vector weights_accessors; + if (m->elementwise_affine) { + weights_accessors.push_back(gamma); + } + ResidualLayerNorm::save_inference_tensors_to_file(m, + shard_id, + bc, + input_accessors, + weights_accessors, + {output_grad}, + false); + } } Op *ResidualLayerNorm::materialize(FFModel &ff, @@ -554,13 +909,19 @@ FutureMap ResidualLayerNorm::inference( 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); + if (inplace_residual) { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } int field_id = 0; // input 
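+  // note: with inplace_residual the added output (batch_outputs[0]) aliases
+  // batch_inputs[0] (asserted above), so the input region is opened
+  // READ_WRITE and no separate requirement is added for the added output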
- launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); + // added: input + residual(s) + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? READ_WRITE : READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); launcher.add_field(field_id++, FID_DATA); // residual1 launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, @@ -578,13 +939,15 @@ FutureMap ResidualLayerNorm::inference( batch_inputs[2]->region)); launcher.add_field(field_id++, FID_DATA); } - // added: input + residual(s) - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(field_id++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(field_id++, FID_DATA); + } // layer norm output launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, @@ -620,14 +983,13 @@ void ResidualLayerNorm::inference_task( assert(task->regions.size() == regions.size()); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + ResidualLayerNormMeta *m = *((ResidualLayerNormMeta **)task->local_args); if (bc->num_tokens == 0) { return; } - ResidualLayerNormMeta *m = *((ResidualLayerNormMeta **)task->local_args); - assert(regions.size() == - 4 + m->use_two_residuals + + 3 + m->use_two_residuals + (m->elementwise_affine ? (m->use_bias ? 2 : 1) : 0)); int region_idx = 0, task_region_idx = 0; @@ -655,13 +1017,23 @@ void ResidualLayerNorm::inference_task( ctx, runtime); } - GenericTensorAccessorW added_output = - helperGetGenericTensorAccessorWO(m->output_type[0], - regions[region_idx++], - task->regions[task_region_idx++], - FID_DATA, - ctx, - runtime); + GenericTensorAccessorW added_output; + if (m->inplace_residual) { + added_output = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + } else { + added_output = + helperGetGenericTensorAccessorWO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(m->output_type[1], regions[region_idx++], @@ -699,8 +1071,14 @@ void ResidualLayerNorm::inference_task( assert(in_domain.get_volume() == residual2_domain.get_volume()); assert(residual2_domain == in_domain); } - Domain added_out_domain = runtime->get_index_space_domain( - ctx, task->regions[task_region_idx++].region.get_index_space()); + Domain added_out_domain; + if (m->inplace_residual) { + added_out_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + } else { + added_out_domain = runtime->get_index_space_domain( + ctx, task->regions[task_region_idx++].region.get_index_space()); + } Domain out_domain = runtime->get_index_space_domain( ctx, task->regions[task_region_idx++].region.get_index_space()); Domain gamma_domain, beta_domain; @@ -734,13 +1112,13 @@ void ResidualLayerNorm::inference_task( m->effective_num_elements * m->effective_batch_size); ResidualLayerNorm::inference_kernel_wrapper( - m, input, residual1, residual2, added_output, output, gamma, beta); + m, bc, input, residual1, residual2, added_output, output, 
gamma, beta); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; std::vector input_accessors; - input_accessors.push_back(input); + // input_accessors.push_back(input); input_accessors.push_back(residual1); if (m->use_two_residuals) { input_accessors.push_back(residual2); @@ -779,6 +1157,7 @@ void ResidualLayerNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->eps); sez.serialize(this->use_bias); sez.serialize(this->use_two_residuals); + sez.serialize(this->inplace_residual); sez.serialize(strlen(this->name)); sez.serialize(this->name, strlen(this->name)); } @@ -794,6 +1173,7 @@ Node ResidualLayerNorm::deserialize(FFModel &ff, bool elementwise_affine; bool use_bias; bool use_two_residuals; + bool inplace_residual; float eps; size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); @@ -810,6 +1190,7 @@ Node ResidualLayerNorm::deserialize(FFModel &ff, dez.deserialize(eps); dez.deserialize(use_bias); dez.deserialize(use_two_residuals); + dez.deserialize(inplace_residual); size_t name_len; char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); @@ -827,6 +1208,7 @@ Node ResidualLayerNorm::deserialize(FFModel &ff, params.eps = eps; params.use_bias = use_bias; params.use_two_residuals = use_two_residuals; + params.inplace_residual = inplace_residual; strcpy(params.name, name); if (use_two_residuals) { return ff.get_or_create_node( @@ -853,6 +1235,7 @@ size_t hash::operator()( hash_combine(key, params.elementwise_affine); hash_combine(key, params.use_bias); hash_combine(key, params.use_two_residuals); + hash_combine(key, params.inplace_residual); return key; } }; // namespace std diff --git a/src/ops/residual_layer_norm.cpp b/src/ops/residual_layer_norm.cpp index f1b7a537b0..582e0752ef 100644 --- a/src/ops/residual_layer_norm.cpp +++ b/src/ops/residual_layer_norm.cpp @@ -23,11 +23,12 @@ namespace FlexFlow { #define C10_WARP_SIZE 32 constexpr int kCUDABlockReduceNumThreads = 512; constexpr int kCUDANumThreads = 256; +constexpr int kColwiseReduceTileSize = 32; ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, ResidualLayerNorm const *ln, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ln) { elementwise_affine = ln->elementwise_affine; use_bias = ln->use_bias; use_two_residuals = ln->use_two_residuals; @@ -36,6 +37,7 @@ ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, profiling = ln->profiling; inference_debugging = ln->inference_debugging; eps = ln->eps; + inplace_residual = ln->inplace_residual; DataType data_type = ln->data_type; size_t totalSize = effective_batch_size * data_type_size(data_type) * 3; gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); @@ -45,6 +47,7 @@ ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, data_type_size(data_type) * effective_batch_size); bias_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * effective_batch_size); + allocated_peft_buffer_size = 0; } ResidualLayerNormMeta::~ResidualLayerNormMeta(void) { @@ -75,7 +78,7 @@ __inline__ __device__ T WarpReduceSum(T val) { } template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { +__inline__ __device__ T BlockReduceSum(T val, T *shared) { int const lid = threadIdx.x % C10_WARP_SIZE; int const wid = threadIdx.x / C10_WARP_SIZE; val = WarpReduceSum(val); @@ -84,9 +87,7 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { shared[wid] = val; } 
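+  // at this point lane 0 of every warp has written its warp-level partial
+  // sum into shared[]; after the barrier, warp 0 reduces those per-warp
+  // values into the final block-wide sum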
__syncthreads(); - val = (threadIdx.x < min(blockDim.x, max_num_threads) / C10_WARP_SIZE) - ? shared[lid] - : 0; + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? shared[lid] : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -110,8 +111,7 @@ __global__ void ResidualLayerNormKernel(int64_t N, const int64_t i = blockIdx.x; float sum1 = 0.0f; float sum2 = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const T residual2_val = (residual2_ptr == nullptr) ? T(0) @@ -120,12 +120,10 @@ __global__ void ResidualLayerNormKernel(int64_t N, sum1 += static_cast(X[index]); sum2 += static_cast(X[index]) * static_cast(X[index]); } - if (threadIdx.x < kCUDABlockReduceNumThreads) { - sum1 = BlockReduceSum( - sum1, m_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - sum2 = BlockReduceSum( - sum2, v_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - } + + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); + if (threadIdx.x == 0) { float const scale = float(1) / static_cast(N); sum1 *= scale; @@ -137,7 +135,7 @@ __global__ void ResidualLayerNormKernel(int64_t N, __syncthreads(); using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const T_ACC gamma_v = gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); @@ -161,19 +159,9 @@ void ResidualLayerNorm::inference_kernel(ResidualLayerNormMeta const *m, T const *beta_ptr, hipStream_t stream) { - std::pair kernel1_parallelism = - std::make_pair(m->effective_batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->effective_batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); - hipLaunchKernelGGL(HIP_KERNEL_NAME(ResidualLayerNormKernel), - num_blocks, - num_threads, + m->effective_batch_size, + std::min(CUDA_NUM_THREADS, (int)m->effective_num_elements), 0, stream, m->effective_num_elements, @@ -188,10 +176,41 @@ void ResidualLayerNorm::inference_kernel(ResidualLayerNormMeta const *m, beta_ptr, output_ptr); } +template +void save_inference_tensors(ResidualLayerNormMeta const *m) { + if (m->inference_debugging) { + // save stuff here + std::string op_name_without_uid = + ResidualLayerNorm::get_op_name_without_uid(m); + char const *folder_path = "./inference_tensors/"; + std::string base_filepath = std::string(folder_path); + if (m->layer_guid.model_id > 0) { + base_filepath += "model_" + std::to_string(m->layer_guid.model_id) + "_"; + } + base_filepath += "fwd_step_" + std::to_string(m->decoding_step); + base_filepath += "_layers_" + + std::to_string(m->layer_guid.transformer_layer_id) + "_" + + op_name_without_uid + "_shard_" + std::to_string(0); + + std::string filename1 = base_filepath + "_mean"; + save_tensor(static_cast(m->mean_ptr), + m->effective_batch_size, + filename1.c_str()); + std::string filename2 = base_filepath + "_rstd"; + save_tensor(static_cast(m->rstd_ptr), + m->effective_batch_size, + filename2.c_str()); + std::string filename3 = base_filepath + "_input_activation"; + save_tensor(static_cast(m->input_activation), + m->effective_batch_size * m->effective_num_elements, + filename3.c_str()); + } +} /*static*/ void 
ResidualLayerNorm::inference_kernel_wrapper( - ResidualLayerNormMeta const *m, + ResidualLayerNormMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input, GenericTensorAccessorR const &residual1, GenericTensorAccessorR const &residual2, @@ -208,12 +227,13 @@ void ResidualLayerNorm::inference_kernel_wrapper( checkCUDA(hipEventCreate(&t_end)); checkCUDA(hipEventRecord(t_start, stream)); } + if (m->input_type[0] == DT_FLOAT) { ResidualLayerNorm::inference_kernel( m, input.get_float_ptr(), residual1.get_float_ptr(), - residual2.get_float_ptr(), + m->use_two_residuals ? residual2.get_float_ptr() : nullptr, added_output.get_float_ptr(), output.get_float_ptr(), m->elementwise_affine ? gamma.get_float_ptr() : nullptr, @@ -224,7 +244,7 @@ void ResidualLayerNorm::inference_kernel_wrapper( m, input.get_half_ptr(), residual1.get_half_ptr(), - residual2.get_half_ptr(), + m->use_two_residuals ? residual2.get_half_ptr() : nullptr, added_output.get_half_ptr(), output.get_half_ptr(), m->elementwise_affine ? gamma.get_half_ptr() : nullptr, @@ -234,6 +254,76 @@ void ResidualLayerNorm::inference_kernel_wrapper( assert(false && "unsupport datatype in layernorm"); } + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + added_output.get_float_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + added_output.get_half_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + if (m->inference_debugging) { + if (m->input_type[0] == DT_FLOAT) { + save_inference_tensors(m); + } else if (m->input_type[0] == DT_HALF) { + save_inference_tensors(m); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + if (m->profiling) { checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); @@ -245,4 +335,551 @@ 
void ResidualLayerNorm::inference_kernel_wrapper( } } +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T *ds, T *db) { + using T_ACC = T; + __shared__ T_ACC ds_shared[C10_WARP_SIZE]; + __shared__ T_ACC db_shared[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + const T_ACC gamma_v = + gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); + sum1 += + static_cast(dY[index]) * static_cast(X[index]) * gamma_v; + sum2 += static_cast(dY[index]) * gamma_v; + } + sum1 = BlockReduceSum(sum1, ds_shared); + sum2 = BlockReduceSum(sum2, db_shared); + if (threadIdx.x == 0) { + ds[i] = sum1; + db[i] = sum2; + } +} + +template +__global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, + int64_t N, + T const *mean, + T const *rstd, + T const *ds, + T const *db, + T *c1, + T *c2) { + using T_ACC = T; + const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < M) { + const T_ACC s = T_ACC(1) / static_cast((int)N); + const T_ACC a = (db[index] * static_cast(mean[index]) - ds[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * s; + c1[index] = a; + c2[index] = -(a * static_cast(mean[index]) + + db[index] * static_cast(rstd[index]) * s); + } +} + +template +__global__ void GammaBetaBackwardSimpleCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index]) * + (static_cast(X[index]) - + static_cast(mean[i])) * + static_cast(rstd[i]); + sum2 += db == nullptr ? T_ACC(0) : static_cast(dY[index]); + } + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } +} + +template +__global__ void GammaBetaBackwardCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + __shared__ T_ACC g_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + __shared__ T_ACC b_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + T_ACC dg_sum1 = 0; + T_ACC dg_sum2 = 0; + T_ACC db_sum1 = 0; + T_ACC db_sum2 = 0; + if (j < N) { + for (int64_t i = threadIdx.y; i < M; i += blockDim.y * 2) { + const int64_t i1 = i; + const int64_t i2 = i + blockDim.y; + const int64_t index1 = i1 * N + j; + const int64_t index2 = i2 * N + j; + dg_sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index1]) * + (static_cast(X[index1]) - + static_cast(mean[i1])) * + static_cast(rstd[i1]); + db_sum1 += db == nullptr ? T_ACC(0) : static_cast(dY[index1]); + if (i2 < M) { + dg_sum2 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index2]) * + (static_cast(X[index2]) - + static_cast(mean[i2])) * + static_cast(rstd[i2]); + db_sum2 += db == nullptr ? 
T_ACC(0) : static_cast(dY[index2]); + } + } + } + g_shared[threadIdx.y][threadIdx.x] = dg_sum1; + g_shared[threadIdx.y + blockDim.y][threadIdx.x] = dg_sum2; + b_shared[threadIdx.y][threadIdx.x] = db_sum1; + b_shared[threadIdx.y + blockDim.y][threadIdx.x] = db_sum2; + __syncthreads(); + T_ACC sum1 = g_shared[threadIdx.x][threadIdx.y]; + T_ACC sum2 = b_shared[threadIdx.x][threadIdx.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } + sum1 = g_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum2 = b_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y + blockDim.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } +} + +template +__device__ __inline__ void compute_gI(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual1, + T *dX_residual2, + bool reset_input_grad, + bool reset_residual_grad1, + bool reset_residual_grad2, + int const N, + T *buf) { + auto const i1 = blockIdx.x; + const T mean_val = mean[i1]; + const T rstd_val = rstd[i1]; + T stats_x1{0}, stats_x2{0}; + constexpr int unroll = 4; + auto l = unroll * threadIdx.x; + T const *X_i = X + i1 * N; + T const *dY_i = dY + i1 * N; + T *dX_i = dX + i1 * N; + T *dX_residual1_i = dX_residual1 + i1 * N; + T *dX_residual2_i = + (dX_residual2 != nullptr) ? dX_residual2 + i1 * N : nullptr; + // vectorized reads don't improve perf, so use regular unrolling + + for (; l + unroll - 1 < N; l += blockDim.x * unroll) { +#pragma unroll + for (int k = 0; k < unroll; k++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l + k]) : T(1); + const T c_h = static_cast(X_i[l + k]); + const T c_loss = static_cast(dY_i[l + k]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + } + for (; l < N; l++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + const T c_h = static_cast(X_i[l]); + const T c_loss = static_cast(dY_i[l]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + + stats_x1 = BlockReduceSum(stats_x1, buf); + stats_x2 = BlockReduceSum(stats_x2, buf); + if (threadIdx.x == 0) { + buf[0] = stats_x1; + buf[1] = stats_x2; + } + __syncthreads(); + stats_x1 = buf[0]; + stats_x2 = buf[1]; + T fH = N; + T term1 = (T(1) / fH) * rstd_val; + + for (int l = threadIdx.x; l < N; l += blockDim.x) { + const T x = X_i[l]; + const T dy = dY_i[l]; + T gamma_val = (gamma != nullptr) ? 
static_cast(gamma[l]) : T(1); + T f_grad_input = fH * gamma_val * dy; + f_grad_input -= (x - mean_val) * rstd_val * stats_x2; + f_grad_input -= stats_x1; + f_grad_input *= term1; + if (reset_input_grad) { + dX_i[l] = f_grad_input; + } else { + dX_i[l] += f_grad_input; + } + if (reset_residual_grad1) { + dX_residual1_i[l] = f_grad_input; + } else { + dX_residual1_i[l] += f_grad_input; + } + if (dX_residual2 != nullptr) { + if (reset_residual_grad2) { + dX_residual2_i[l] = f_grad_input; + } else { + dX_residual2_i[l] += f_grad_input; + } + } + } +} + +template +__global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual1, + T *dX_residual2, + bool reset_input_grad, + bool reset_residual_grad1, + bool reset_residual_grad2, + int const N) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T *buf = reinterpret_cast(&s_data1); + compute_gI(dY, + X, + mean, + rstd, + gamma, + dX, + dX_residual1, + dX_residual2, + reset_input_grad, + reset_residual_grad1, + reset_residual_grad2, + N, + buf); +} + +/*static*/ +template +void backward_kernel(ResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T const *added_output_ptr, + T *input_grad_ptr, + T *residual1_grad_ptr, + T *residual2_grad_ptr, + T const *gamma_ptr, + T *gamma_grad_ptr, + T *beta_grad_ptr, + hipStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeInternalGradientsCUDAKernel), + M, + kCUDABlockReduceNumThreads, + 0, + stream, + N, + output_grad_ptr, + added_output_ptr, + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); + const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; + hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeGradientFusedParamsCUDAKernel), + B, + kCUDANumThreads, + 0, + stream, + M, + N, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + hipLaunchKernelGGL(HIP_KERNEL_NAME(layer_norm_grad_input_kernel), + blocks, + num_threads, + nshared, + stream, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual1_grad_ptr, + residual2_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + m->reset_input_grads[2], + N); + + if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { + if (M < 512) { + // For small batch size, do colwise reduce directly + const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; + hipLaunchKernelGGL(HIP_KERNEL_NAME(GammaBetaBackwardSimpleCUDAKernel), + B, + kCUDANumThreads, + 0, + stream, + M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } else { + const int64_t B = + (N + kColwiseReduceTileSize - 1) / kColwiseReduceTileSize; + constexpr int kThreadX = kColwiseReduceTileSize; + constexpr int kThreadY = kColwiseReduceTileSize / 2; + hipLaunchKernelGGL(HIP_KERNEL_NAME(GammaBetaBackwardCUDAKernel), + B, + dim3(kThreadX, kThreadY), + 0, + stream, + M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + 
beta_grad_ptr); + } + } +} + +/*static*/ +void ResidualLayerNorm::backward_kernel_wrapper( + ResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &added_output, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorW const &residual1_grad, + GenericTensorAccessorW const &residual2_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + if (m->output_type[0] == DT_FLOAT) { + backward_kernel( + m, + output_grad.get_float_ptr(), + added_output.get_float_ptr(), + input_grad.get_float_ptr(), + residual1_grad.get_float_ptr(), + m->use_two_residuals ? residual2_grad.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_float_ptr() + : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + backward_kernel( + m, + output_grad.get_half_ptr(), + added_output.get_half_ptr(), + input_grad.get_half_ptr(), + residual1_grad.get_half_ptr(), + m->use_two_residuals ? residual2_grad.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_half_ptr() + : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[ResidualLayerNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +/*static*/ +template +void peft_bwd_kernel(ResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T *residual1_grad_ptr, + T *residual2_grad_ptr, + T const *gamma_ptr, + hipStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + + if (m->inference_debugging) { + // save stuff here + std::string op_name_without_uid = + ResidualLayerNorm::get_op_name_without_uid(m); + char const *folder_path = "./inference_tensors/"; + std::string base_filepath = std::string(folder_path); + if (m->layer_guid.model_id > 0) { + base_filepath += "model_" + std::to_string(m->layer_guid.model_id) + "_"; + } + base_filepath += "bwd_step_" + std::to_string(m->bwd_step); + base_filepath += "_layers_" + + std::to_string(m->layer_guid.transformer_layer_id) + "_" + + op_name_without_uid + "_shard_" + std::to_string(0); + + std::string filename1 = base_filepath + "_mean"; + save_tensor(static_cast(m->mean_ptr), + m->effective_batch_size, + filename1.c_str()); + std::string filename2 = base_filepath + "_rstd"; + save_tensor(static_cast(m->rstd_ptr), + m->effective_batch_size, + filename2.c_str()); + std::string filename3 = base_filepath + "_input_activation"; + save_tensor(static_cast(m->input_activation), + m->effective_batch_size * m->effective_num_elements, + filename3.c_str()); + } + + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = 
(num_threads / warp_size) * sizeof(T); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(layer_norm_grad_input_kernel), + blocks, + num_threads, + nshared, + stream, + output_grad_ptr, + static_cast(m->input_activation), + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual1_grad_ptr, + residual2_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + m->reset_input_grads[2], + N); +} + +/*static*/ +void ResidualLayerNorm::peft_bwd_kernel_wrapper( + ResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorW const &residual1_grad, + GenericTensorAccessorW const &residual2_grad, + GenericTensorAccessorR const &gamma) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + if (m->output_type[0] == DT_FLOAT) { + peft_bwd_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + residual1_grad.get_float_ptr(), + m->use_two_residuals ? residual2_grad.get_float_ptr() + : nullptr, + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + peft_bwd_kernel(m, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + residual1_grad.get_half_ptr(), + m->use_two_residuals ? residual2_grad.get_half_ptr() + : nullptr, + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[ResidualLayerNorm] peft_bwd time (CF) = %.2fms\n", elapsed); + } +} + }; // namespace FlexFlow diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index e5ebdce6ed..8cdf87a92c 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -22,11 +22,12 @@ namespace FlexFlow { #define C10_WARP_SIZE 32 constexpr int kCUDABlockReduceNumThreads = 512; constexpr int kCUDANumThreads = 256; +constexpr int kColwiseReduceTileSize = 32; ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, ResidualLayerNorm const *ln, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ln) { elementwise_affine = ln->elementwise_affine; use_bias = ln->use_bias; use_two_residuals = ln->use_two_residuals; @@ -35,6 +36,7 @@ ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, profiling = ln->profiling; inference_debugging = ln->inference_debugging; eps = ln->eps; + inplace_residual = ln->inplace_residual; DataType data_type = ln->data_type; size_t totalSize = effective_batch_size * data_type_size(data_type) * 3; gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); @@ -44,6 +46,7 @@ ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, data_type_size(data_type) * effective_batch_size); bias_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * effective_batch_size); + allocated_peft_buffer_size = 0; } ResidualLayerNormMeta::~ResidualLayerNormMeta(void) { @@ -74,7 +77,7 @@ __inline__ __device__ T WarpReduceSum(T val) { } template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int 
max_num_threads) { +__inline__ __device__ T BlockReduceSum(T val, T *shared) { int const lid = threadIdx.x % C10_WARP_SIZE; int const wid = threadIdx.x / C10_WARP_SIZE; val = WarpReduceSum(val); @@ -83,9 +86,7 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { shared[wid] = val; } __syncthreads(); - val = (threadIdx.x < min(blockDim.x, max_num_threads) / C10_WARP_SIZE) - ? shared[lid] - : 0; + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? shared[lid] : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -109,8 +110,7 @@ __global__ void ResidualLayerNormKernel(int64_t N, const int64_t i = blockIdx.x; float sum1 = 0.0f; float sum2 = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const T residual2_val = (residual2_ptr == nullptr) ? T(0) @@ -119,12 +119,10 @@ __global__ void ResidualLayerNormKernel(int64_t N, sum1 += static_cast(X[index]); sum2 += static_cast(X[index]) * static_cast(X[index]); } - if (threadIdx.x < kCUDABlockReduceNumThreads) { - sum1 = BlockReduceSum( - sum1, m_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - sum2 = BlockReduceSum( - sum2, v_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - } + + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); + if (threadIdx.x == 0) { float const scale = float(1) / static_cast(N); sum1 *= scale; @@ -136,7 +134,7 @@ __global__ void ResidualLayerNormKernel(int64_t N, __syncthreads(); using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const T_ACC gamma_v = gamma == nullptr ? 
T_ACC(1) : static_cast(gamma[j]); @@ -160,33 +158,57 @@ void ResidualLayerNorm::inference_kernel(ResidualLayerNormMeta const *m, T const *beta_ptr, cudaStream_t stream) { - std::pair kernel1_parallelism = - std::make_pair(m->effective_batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->effective_batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); - ResidualLayerNormKernel - <<>>(m->effective_num_elements, - m->eps, - input_ptr, - residual1_ptr, - residual2_ptr, - added_output_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_ptr, - beta_ptr, - output_ptr); + <<effective_batch_size, + std::min(CUDA_NUM_THREADS, (int)m->effective_num_elements), + 0, + stream>>>(m->effective_num_elements, + m->eps, + input_ptr, + residual1_ptr, + residual2_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + beta_ptr, + output_ptr); +} +template +void save_inference_tensors(ResidualLayerNormMeta const *m) { + if (m->inference_debugging) { + // save stuff here + std::string op_name_without_uid = + ResidualLayerNorm::get_op_name_without_uid(m); + char const *folder_path = "./inference_tensors/"; + std::string base_filepath = std::string(folder_path); + if (m->layer_guid.model_id > 0) { + base_filepath += "model_" + std::to_string(m->layer_guid.model_id) + "_"; + } + base_filepath += "fwd_step_" + std::to_string(m->decoding_step); + base_filepath += "_layers_" + + std::to_string(m->layer_guid.transformer_layer_id) + "_" + + op_name_without_uid + "_shard_" + std::to_string(0); + + std::string filename1 = base_filepath + "_mean"; + save_tensor(static_cast(m->mean_ptr), + m->effective_batch_size, + filename1.c_str()); + std::string filename2 = base_filepath + "_rstd"; + save_tensor(static_cast(m->rstd_ptr), + m->effective_batch_size, + filename2.c_str()); + std::string filename3 = base_filepath + "_input_activation"; + save_tensor(static_cast(m->input_activation), + m->effective_batch_size * m->effective_num_elements, + filename3.c_str()); + } } /*static*/ void ResidualLayerNorm::inference_kernel_wrapper( - ResidualLayerNormMeta const *m, + ResidualLayerNormMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input, GenericTensorAccessorR const &residual1, GenericTensorAccessorR const &residual2, @@ -203,6 +225,7 @@ void ResidualLayerNorm::inference_kernel_wrapper( cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } + if (m->input_type[0] == DT_FLOAT) { ResidualLayerNorm::inference_kernel( m, @@ -229,6 +252,76 @@ void ResidualLayerNorm::inference_kernel_wrapper( assert(false && "unsupport datatype in layernorm"); } + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = 
bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + added_output.get_float_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + added_output.get_half_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + if (m->inference_debugging) { + if (m->input_type[0] == DT_FLOAT) { + save_inference_tensors(m); + } else if (m->input_type[0] == DT_HALF) { + save_inference_tensors(m); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -240,4 +333,529 @@ void ResidualLayerNorm::inference_kernel_wrapper( } } +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T *ds, T *db) { + using T_ACC = T; + __shared__ T_ACC ds_shared[C10_WARP_SIZE]; + __shared__ T_ACC db_shared[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + const T_ACC gamma_v = + gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); + sum1 += + static_cast(dY[index]) * static_cast(X[index]) * gamma_v; + sum2 += static_cast(dY[index]) * gamma_v; + } + sum1 = BlockReduceSum(sum1, ds_shared); + sum2 = BlockReduceSum(sum2, db_shared); + if (threadIdx.x == 0) { + ds[i] = sum1; + db[i] = sum2; + } +} + +template +__global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, + int64_t N, + T const *mean, + T const *rstd, + T const *ds, + T const *db, + T *c1, + T *c2) { + using T_ACC = T; + const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < M) { + const T_ACC s = T_ACC(1) / static_cast((int)N); + const T_ACC a = (db[index] * static_cast(mean[index]) - ds[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * s; + c1[index] = a; + c2[index] = -(a * static_cast(mean[index]) + + db[index] * static_cast(rstd[index]) * s); + } +} + +template +__global__ void GammaBetaBackwardSimpleCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dg == nullptr ? 
T_ACC(0) + : static_cast(dY[index]) * + (static_cast(X[index]) - + static_cast(mean[i])) * + static_cast(rstd[i]); + sum2 += db == nullptr ? T_ACC(0) : static_cast(dY[index]); + } + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } +} + +template +__global__ void GammaBetaBackwardCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + __shared__ T_ACC g_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + __shared__ T_ACC b_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + T_ACC dg_sum1 = 0; + T_ACC dg_sum2 = 0; + T_ACC db_sum1 = 0; + T_ACC db_sum2 = 0; + if (j < N) { + for (int64_t i = threadIdx.y; i < M; i += blockDim.y * 2) { + const int64_t i1 = i; + const int64_t i2 = i + blockDim.y; + const int64_t index1 = i1 * N + j; + const int64_t index2 = i2 * N + j; + dg_sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index1]) * + (static_cast(X[index1]) - + static_cast(mean[i1])) * + static_cast(rstd[i1]); + db_sum1 += db == nullptr ? T_ACC(0) : static_cast(dY[index1]); + if (i2 < M) { + dg_sum2 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index2]) * + (static_cast(X[index2]) - + static_cast(mean[i2])) * + static_cast(rstd[i2]); + db_sum2 += db == nullptr ? T_ACC(0) : static_cast(dY[index2]); + } + } + } + g_shared[threadIdx.y][threadIdx.x] = dg_sum1; + g_shared[threadIdx.y + blockDim.y][threadIdx.x] = dg_sum2; + b_shared[threadIdx.y][threadIdx.x] = db_sum1; + b_shared[threadIdx.y + blockDim.y][threadIdx.x] = db_sum2; + __syncthreads(); + T_ACC sum1 = g_shared[threadIdx.x][threadIdx.y]; + T_ACC sum2 = b_shared[threadIdx.x][threadIdx.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } + sum1 = g_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum2 = b_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y + blockDim.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } +} + +template +__device__ __inline__ void compute_gI(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual1, + T *dX_residual2, + bool reset_input_grad, + bool reset_residual_grad1, + bool reset_residual_grad2, + int const N, + T *buf) { + auto const i1 = blockIdx.x; + const T mean_val = mean[i1]; + const T rstd_val = rstd[i1]; + T stats_x1{0}, stats_x2{0}; + constexpr int unroll = 4; + auto l = unroll * threadIdx.x; + T const *X_i = X + i1 * N; + T const *dY_i = dY + i1 * N; + T *dX_i = dX + i1 * N; + T *dX_residual1_i = dX_residual1 + i1 * N; + T *dX_residual2_i = + (dX_residual2 != nullptr) ? dX_residual2 + i1 * N : nullptr; + // vectorized reads don't improve perf, so use regular unrolling + + for (; l + unroll - 1 < N; l += blockDim.x * unroll) { +#pragma unroll + for (int k = 0; k < unroll; k++) { + T gamma_val = (gamma != nullptr) ? 
static_cast(gamma[l + k]) : T(1); + const T c_h = static_cast(X_i[l + k]); + const T c_loss = static_cast(dY_i[l + k]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + } + for (; l < N; l++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + const T c_h = static_cast(X_i[l]); + const T c_loss = static_cast(dY_i[l]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + + stats_x1 = BlockReduceSum(stats_x1, buf); + stats_x2 = BlockReduceSum(stats_x2, buf); + if (threadIdx.x == 0) { + buf[0] = stats_x1; + buf[1] = stats_x2; + } + __syncthreads(); + stats_x1 = buf[0]; + stats_x2 = buf[1]; + T fH = N; + T term1 = (T(1) / fH) * rstd_val; + + for (int l = threadIdx.x; l < N; l += blockDim.x) { + const T x = X_i[l]; + const T dy = dY_i[l]; + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + T f_grad_input = fH * gamma_val * dy; + f_grad_input -= (x - mean_val) * rstd_val * stats_x2; + f_grad_input -= stats_x1; + f_grad_input *= term1; + if (reset_input_grad) { + dX_i[l] = f_grad_input; + } else { + dX_i[l] += f_grad_input; + } + if (reset_residual_grad1) { + dX_residual1_i[l] = f_grad_input; + } else { + dX_residual1_i[l] += f_grad_input; + } + if (dX_residual2 != nullptr) { + if (reset_residual_grad2) { + dX_residual2_i[l] = f_grad_input; + } else { + dX_residual2_i[l] += f_grad_input; + } + } + } +} + +template +__global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual1, + T *dX_residual2, + bool reset_input_grad, + bool reset_residual_grad1, + bool reset_residual_grad2, + int const N) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T *buf = reinterpret_cast(&s_data1); + compute_gI(dY, + X, + mean, + rstd, + gamma, + dX, + dX_residual1, + dX_residual2, + reset_input_grad, + reset_residual_grad1, + reset_residual_grad2, + N, + buf); +} + +/*static*/ +template +void backward_kernel(ResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T const *added_output_ptr, + T *input_grad_ptr, + T *residual1_grad_ptr, + T *residual2_grad_ptr, + T const *gamma_ptr, + T *gamma_grad_ptr, + T *beta_grad_ptr, + cudaStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + added_output_ptr, + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); + const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; + ComputeGradientFusedParamsCUDAKernel + <<>>(M, + N, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual1_grad_ptr, + residual2_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + m->reset_input_grads[2], + N); + + if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { + if (M < 512) { + // For small batch size, do colwise reduce directly + const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; + 
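      // Two reduction strategies for the gamma/beta gradients: with fewer
      // than 512 rows, the simple kernel below assigns one thread per column
      // and walks all M rows serially; for larger M,
      // GammaBetaBackwardCUDAKernel tiles the reduction instead, staging
      // 32 x (32 + 1) partial sums in shared memory (the +1 padding avoids
      // bank conflicts) and finishing each column with warp shuffles after
      // an in-shared-memory transpose.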
GammaBetaBackwardSimpleCUDAKernel + <<>>(M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } else { + const int64_t B = + (N + kColwiseReduceTileSize - 1) / kColwiseReduceTileSize; + constexpr int kThreadX = kColwiseReduceTileSize; + constexpr int kThreadY = kColwiseReduceTileSize / 2; + GammaBetaBackwardCUDAKernel + <<>>( + M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } + } +} + +/*static*/ +void ResidualLayerNorm::backward_kernel_wrapper( + ResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &added_output, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorW const &residual1_grad, + GenericTensorAccessorW const &residual2_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + if (m->output_type[0] == DT_FLOAT) { + backward_kernel( + m, + output_grad.get_float_ptr(), + added_output.get_float_ptr(), + input_grad.get_float_ptr(), + residual1_grad.get_float_ptr(), + m->use_two_residuals ? residual2_grad.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_float_ptr() + : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + backward_kernel( + m, + output_grad.get_half_ptr(), + added_output.get_half_ptr(), + input_grad.get_half_ptr(), + residual1_grad.get_half_ptr(), + m->use_two_residuals ? residual2_grad.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? 
beta_grad.get_half_ptr() + : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[ResidualLayerNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +/*static*/ +template +void peft_bwd_kernel(ResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T *residual1_grad_ptr, + T *residual2_grad_ptr, + T const *gamma_ptr, + cudaStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + + if (m->inference_debugging) { + // save stuff here + std::string op_name_without_uid = + ResidualLayerNorm::get_op_name_without_uid(m); + char const *folder_path = "./inference_tensors/"; + std::string base_filepath = std::string(folder_path); + if (m->layer_guid.model_id > 0) { + base_filepath += "model_" + std::to_string(m->layer_guid.model_id) + "_"; + } + base_filepath += "bwd_step_" + std::to_string(m->bwd_step); + base_filepath += "_layers_" + + std::to_string(m->layer_guid.transformer_layer_id) + "_" + + op_name_without_uid + "_shard_" + std::to_string(0); + + std::string filename1 = base_filepath + "_mean"; + save_tensor(static_cast(m->mean_ptr), + m->effective_batch_size, + filename1.c_str()); + std::string filename2 = base_filepath + "_rstd"; + save_tensor(static_cast(m->rstd_ptr), + m->effective_batch_size, + filename2.c_str()); + std::string filename3 = base_filepath + "_input_activation"; + save_tensor(static_cast(m->input_activation), + m->effective_batch_size * m->effective_num_elements, + filename3.c_str()); + } + + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + static_cast(m->input_activation), + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual1_grad_ptr, + residual2_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + m->reset_input_grads[2], + N); +} + +/*static*/ +void ResidualLayerNorm::peft_bwd_kernel_wrapper( + ResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorW const &residual1_grad, + GenericTensorAccessorW const &residual2_grad, + GenericTensorAccessorR const &gamma) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + if (m->output_type[0] == DT_FLOAT) { + peft_bwd_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + residual1_grad.get_float_ptr(), + m->use_two_residuals ? residual2_grad.get_float_ptr() + : nullptr, + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + peft_bwd_kernel(m, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + residual1_grad.get_half_ptr(), + m->use_two_residuals ? residual2_grad.get_half_ptr() + : nullptr, + m->elementwise_affine ? 
gamma.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[ResidualLayerNorm] peft_bwd time (CF) = %.2fms\n", elapsed); + } +} + }; // namespace FlexFlow diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index b3ee7179d0..744902f908 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -42,7 +42,8 @@ using namespace FlexFlow::Kernels::ResidualRMSNorm; bool operator==(ResidualRMSNormParams const &lhs, ResidualRMSNormParams const &rhs) { - return lhs.layer_guid == rhs.layer_guid && lhs.eps == rhs.eps; + return lhs.layer_guid == rhs.layer_guid && lhs.eps == rhs.eps && + lhs.dim == rhs.dim && lhs.inplace_residual == rhs.inplace_residual; } bool ResidualRMSNormParams::is_valid( @@ -55,7 +56,8 @@ ResidualRMSNormParams ResidualRMSNorm::get_params() const { params.layer_guid = this->layer_guid; params.eps = this->eps; params.dim = this->dim; - if (this->name != nullptr) { + params.inplace_residual = this->inplace_residual; + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -66,6 +68,7 @@ void FFModel::residual_rms_norm(const Tensor input1, Tensor *outputs, float eps, int dim, + bool inplace_residual, DataType data_type, char const *name) { if (data_type == DT_NONE) { @@ -90,9 +93,9 @@ void FFModel::residual_rms_norm(const Tensor input1, casted_input2); rm->outputs[0] = create_tensor_legion_ordering( - input1->num_dims, input1->dims, data_type, rm, 0, false /*create_grad*/); + input1->num_dims, input1->dims, data_type, rm, 0, true /*create_grad*/); rm->outputs[1] = create_tensor_legion_ordering( - input1->num_dims, input1->dims, data_type, rm, 1, false /*create_grad*/); + input1->num_dims, input1->dims, data_type, rm, 1, true /*create_grad*/); // weights int weight_dims[1] = {dim}; @@ -100,12 +103,13 @@ void FFModel::residual_rms_norm(const Tensor input1, weight_dims, data_type, rm, - true /*create_grad*/, + false /*create_grad*/, nullptr, CHOSEN_SYNC_TYPE); rm->add_float_property("eps", eps); rm->add_int_property("dim", dim); + rm->add_int_property("inplace_residual", inplace_residual); layers.push_back(rm); outputs[0] = rm->outputs[0]; outputs[1] = rm->outputs[1]; @@ -120,6 +124,8 @@ Op *ResidualRMSNorm::create_operator_from_layer( long long value; layer->get_int_property("dim", value); int dim = value; + layer->get_int_property("inplace_residual", value); + bool inplace_residual = (bool)value; return new ResidualRMSNorm(model, layer->layer_guid, @@ -127,6 +133,7 @@ Op *ResidualRMSNorm::create_operator_from_layer( inputs[1], eps, dim, + inplace_residual, false, layer->name); } @@ -143,6 +150,7 @@ ResidualRMSNorm::ResidualRMSNorm( inputs.second, params.eps, params.dim, + params.inplace_residual, allocate_weights, params.name) {} @@ -157,6 +165,7 @@ ResidualRMSNorm::ResidualRMSNorm( inputs.second, other.eps, other.dim, + other.inplace_residual, allocate_weights, other.name) {} ResidualRMSNorm::ResidualRMSNorm(FFModel &model, @@ -165,6 +174,7 @@ ResidualRMSNorm::ResidualRMSNorm(FFModel &model, const ParallelTensor _input2, float _eps, int dim, + bool _inplace_residual, bool allocate_weights, char const *name) : Op(model, @@ -177,6 +187,7 @@ ResidualRMSNorm::ResidualRMSNorm(FFModel &model, _input1, _input2) { eps = _eps; + 
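  // When inplace_residual is set, output 0 (the residual sum) aliases the
  // region and partition of input 0 (see map_output_tensors below), so the
  // sum is written back in place instead of into a separate tensor. The
  // launchers account for this by promoting the first input's privilege from
  // READ_ONLY to READ_WRITE and by dropping the now-redundant region
  // requirement for outputs[0], which is why the tasks expect one fewer
  // region in that case.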
inplace_residual = _inplace_residual; inputs[0] = _input1; inputs[1] = _input2; layer_guid = _layer_guid; @@ -234,6 +245,22 @@ ResidualRMSNorm::ResidualRMSNorm(FFModel &model, } } +void ResidualRMSNorm::map_output_tensors(FFModel &ff) { + assert(numOutputs == 2); + assert(outputs[0]->get_volume() == inputs[0]->get_volume()); + if (inplace_residual) { + outputs[0]->parallel_is = inputs[0]->parallel_is; + outputs[0]->region = inputs[0]->region; + outputs[0]->part = inputs[0]->part; + outputs[0]->region_grad = inputs[0]->region_grad; + outputs[0]->part_grad = inputs[0]->part_grad; + // map output 1 to new region + ff.map_tensor(outputs[1], this); + } else { + Op::map_output_tensors(ff); + } +} + void ResidualRMSNorm::init(FFModel const &ff) { assert(check_output_input_weight_same_parallel_is()); parallel_is = outputs[0]->parallel_is; @@ -249,36 +276,44 @@ void ResidualRMSNorm::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); + if (inplace_residual) { + assert(outputs[0]->part == inputs[0]->part); + assert(outputs[0]->region == inputs[0]->region); + } + int fid = 0; + launcher.add_region_requirement( + RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? READ_WRITE : READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(fid++, FID_DATA); launcher.add_region_requirement(RegionRequirement(inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, inputs[1]->region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + } launcher.add_region_requirement(RegionRequirement(outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, outputs[1]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(fid++, FID_DATA); launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(fid++, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap(ff, fm); @@ -306,36 +341,45 @@ void ResidualRMSNorm::init_inference( false /*must*/, 0 /*mapper_id*/, machine_view_hash); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); + if (inplace_residual) { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } + int fid = 0; + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? 
READ_WRITE : READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(fid++, FID_DATA); launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, batch_inputs[1]->region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + } launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[1]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(fid++, FID_DATA); launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(fid++, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); @@ -383,73 +427,131 @@ FutureMap 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); + if (inplace_residual) { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } + int fid = 0; + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? 
READ_WRITE : READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(fid++, FID_DATA); launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, batch_inputs[1]->region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + } launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[1]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(fid++, FID_DATA); launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, - READ_WRITE, + READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(fid++, FID_DATA); return runtime->execute_index_space(ctx, launcher); } /* - regions[0](I): input1 + regions[0](I/O): input1 / residual output regions[1](I): input2 - regions[2](O): residual output - regions[3](O): output - regions[4](I/O): weight + regions[2](O): output + regions[3](I): weight */ void ResidualRMSNorm::inference_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - assert(task->regions.size() == 5); - assert(regions.size() == 5); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_tokens == 0) { return; } ResidualRMSNormMeta *m = *((ResidualRMSNormMeta **)task->local_args); + assert(task->regions.size() == 5 - m->inplace_residual); + assert(regions.size() == 5 - m->inplace_residual); GenericTensorAccessorR input1 = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorR input2 = helperGetGenericTensorAccessorRO( m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorW residual_output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[1], regions[3], task->regions[3], FID_DATA, ctx, runtime); - GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime); - forward_kernel_wrapper(m, input1, input2, weight, residual_output, output); + + GenericTensorAccessorW residual_output, output; + GenericTensorAccessorR weight; + if (m->inplace_residual) { + // residual_output is mapped to the same region as the input + residual_output = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + output = helperGetGenericTensorAccessorWO(m->output_type[1], + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); + weight = helperGetGenericTensorAccessorRO(m->weight_type[0], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); + } else { + residual_output = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); + output = helperGetGenericTensorAccessorWO(m->output_type[1], + regions[3], + task->regions[3], + 
FID_DATA, + ctx, + runtime); + weight = helperGetGenericTensorAccessorRO(m->weight_type[0], + regions[4], + task->regions[4], + FID_DATA, + ctx, + runtime); + } + + inference_kernel_wrapper( + m, bc, input1, input2, weight, residual_output, output); + if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; - ResidualRMSNorm::save_inference_tensors_to_file( - m, shard_id, bc, {input1, input2}, {weight}, {residual_output, output}); + if (m->inplace_residual) { + ResidualRMSNorm::save_inference_tensors_to_file( + m, shard_id, bc, {input2}, {weight}, {residual_output, output}); + } else { + ResidualRMSNorm::save_inference_tensors_to_file( + m, + shard_id, + bc, + {input1, input2}, + {weight}, + {residual_output, output}); + } } } @@ -459,6 +561,7 @@ void ResidualRMSNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.model_id); sez.serialize(this->eps); sez.serialize(this->dim); + sez.serialize(this->inplace_residual); sez.serialize(strlen(this->name)); sez.serialize(this->name, strlen(this->name)); } @@ -479,6 +582,8 @@ Node ResidualRMSNorm::deserialize(FFModel &ff, LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); dez.deserialize(eps); dez.deserialize(dim); + int inplace_residual; + dez.deserialize(inplace_residual); size_t name_len; char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); @@ -487,13 +592,285 @@ Node ResidualRMSNorm::deserialize(FFModel &ff, params.layer_guid = layer_guid; params.eps = eps; params.dim = dim; + params.inplace_residual = inplace_residual; strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); } void ResidualRMSNorm::backward(FFModel const &ff) { - assert(false); + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_backward(ff, argmap); + IndexLauncher launcher(RESIDUAL_RMSNORM_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + // regions[0](I): RMS output_grad + launcher.add_region_requirement(RegionRequirement(outputs[1]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[1]->region_grad)); + launcher.add_field(0, FID_DATA); + // regions[1](I): residual output / RMS input + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(1, FID_DATA); + // regions[2](I/O): residual input grad 0 + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(2, FID_DATA); + // regions[3](I/O): residual input grad 1 + launcher.add_region_requirement(RegionRequirement(inputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[1]->region_grad)); + launcher.add_field(3, FID_DATA); + // regions[4](I): gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(4, FID_DATA); + // regions[5](I/O): gamma_grad + launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[0]->region_grad)); + launcher.add_field(5, FID_DATA); + + runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): RMS output_grad + regions[1](I): 
Residual output / RMS input + regions[2](I/O): Residual input 0 grad + regions[3](I/O): Residual input 1 grad + regions[4](I): weight + regions[5](I/O): weight_grad +*/ +void ResidualRMSNorm::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == 6); + assert(regions.size() == 6); + ResidualRMSNormMeta const *m = *((ResidualRMSNormMeta **)task->local_args); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW residual_output_rms_input = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[1], + task->regions[1], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual_input0_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual_input1_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime); + GenericTensorAccessorW weight_grad = helperGetGenericTensorAccessorRW( + m->weight_type[0], regions[5], task->regions[5], FID_DATA, ctx, runtime); + backward_kernel_wrapper(m, + output_grad, + residual_output_rms_input, + residual_input0_grad, + residual_input1_grad, + weight, + weight_grad); } + +Legion::FutureMap + ResidualRMSNorm::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(RESIDUAL_RMSNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + int fid = 0; + // residual input grad 0 + launcher.add_region_requirement(RegionRequirement( + batch_inputs[0]->part_grad, + 0 /*projection id*/, + inplace_residual && !reset_input_grads[0] ? READ_WRITE : WRITE_ONLY, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(fid++, FID_DATA); + // residual input grad 1 + launcher.add_region_requirement( + RegionRequirement(batch_inputs[1]->part_grad, + 0 /*projection id*/, + reset_input_grads[1] ? 
WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region_grad)); + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual && !reset_input_grads[0]) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(fid++, FID_DATA); + } + // RMS output_grad + launcher.add_region_requirement( + RegionRequirement(batch_outputs[1]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[1]->region_grad)); + launcher.add_field(fid++, FID_DATA); + // gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(fid++, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): RMS output_grad + regions[1](I/O): Residual input 0 grad + regions[2](I/O): Residual input 1 grad + regions[3](I): weight +*/ +void ResidualRMSNorm::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + ResidualRMSNormMeta *m = *((ResidualRMSNormMeta **)task->local_args); + int expected_regions = + (m->inplace_residual || m->reset_input_grads[0]) ? 4 : 5; + assert(task->regions.size() == expected_regions); + assert(regions.size() == expected_regions); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + + int rid = 0, t_rid = 0; + GenericTensorAccessorW input_grad_0 = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[rid++], + task->regions[t_rid++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW input_grad_1 = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[rid++], + task->regions[t_rid++], + FID_DATA, + ctx, + runtime); + + GenericTensorAccessorR output_grad_0; + if (!m->reset_input_grads[0]) { + if (m->inplace_residual) { + // mapped to input 0 + output_grad_0 = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + } else { + output_grad_0 = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[rid++], + task->regions[t_rid++], + FID_DATA, + ctx, + runtime); + } + } + GenericTensorAccessorR output_grad_1 = + helperGetGenericTensorAccessorRO(m->output_type[0], + regions[rid++], + task->regions[t_rid++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR weight = + helperGetGenericTensorAccessorRO(m->weight_type[0], + regions[rid++], + task->regions[t_rid++], + FID_DATA, + ctx, + runtime); + + peft_bwd_kernel_wrapper( + m, bc, output_grad_0, output_grad_1, input_grad_0, input_grad_1, weight); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + if (!m->reset_input_grads[0]) { + ResidualRMSNorm::save_inference_tensors_to_file( + m, + shard_id, + bc, + {input_grad_0, input_grad_1}, + {weight}, + {output_grad_0, output_grad_1}, + false); + } else { + ResidualRMSNorm::save_inference_tensors_to_file( + m, + shard_id, + bc, + {input_grad_0, input_grad_1}, + {weight}, + {output_grad_1}, + false); + } + } +} + Op *ResidualRMSNorm::materialize(FFModel &ff, ParallelTensor inputs[], int num_inputs) const { @@ -516,6 +893,7 @@ size_t hash::operator()( hash_combine(key, params.eps); hash_combine(key, params.layer_guid.id); hash_combine(key, params.dim); + hash_combine(key, params.inplace_residual); return key; } }; // namespace std diff 
--git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index 79dce65c57..8dadd7dcc3 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -53,7 +53,7 @@ RMSNormParams RMSNorm::get_params() const { params.layer_guid = this->layer_guid; params.eps = this->eps; params.dim = this->dim; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -422,7 +422,7 @@ void RMSNorm::inference_task(Task const *task, m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - forward_kernel_wrapper(m, input, weight, output); + inference_kernel_wrapper(m, bc, input, weight, output); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; @@ -431,6 +431,166 @@ void RMSNorm::inference_task(Task const *task, } } +void RMSNorm::backward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_backward(ff, argmap); + IndexLauncher launcher(RMSNORM_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + // regions[0](I): output_grad + launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + // regions[1](I): input + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(1, FID_DATA); + // regions[2](I/O): input_grad + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(2, FID_DATA); + // regions[3](I): gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(3, FID_DATA); + // regions[4](I/O): gamma_grad + launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[0]->region_grad)); + launcher.add_field(4, FID_DATA); + + runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): output_grad + regions[1](I): input + regions[2](I/O): input_grad + regions[3](I): weight + regions[4](I/O): weight_grad +*/ +void RMSNorm::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == 5); + assert(regions.size() == 5); + RMSNormMeta const *m = *((RMSNormMeta **)task->local_args); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); + GenericTensorAccessorW weight_grad = helperGetGenericTensorAccessorRW( + m->weight_type[0], 
regions[4], task->regions[4], FID_DATA, ctx, runtime); + backward_kernel_wrapper( + m, output_grad, input, input_grad, weight, weight_grad); +} + +Legion::FutureMap + RMSNorm::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(RMSNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + // regions[0](I): output_grad + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + // regions[1](I/O): input_grad + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + // regions[2](I): weight + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(2, FID_DATA); + + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): output_grad + regions[1](I/O): input_grad + regions[2](I): weight +*/ +void RMSNorm::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == 3); + assert(regions.size() == 3); + RMSNormMeta *m = *((RMSNormMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + peft_bwd_kernel_wrapper(m, bc, output_grad, input_grad, weight); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + RMSNorm::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {weight}, {output_grad}, false); + } +} + void RMSNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); @@ -474,11 +634,9 @@ Op *RMSNorm::materialize(FFModel &ff, ParallelTensor inputs[], int num_inputs) const { RMSNormParams params = get_params(); - return new RMSNorm(ff, params, inputs[0], true, this->name); + return new RMSNorm(ff, params, inputs[0], true, params.name); } -void RMSNorm::backward(FFModel const &ff) {} - bool RMSNorm::measure_operator_cost(Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const { diff --git a/src/ops/sampling.cc b/src/ops/sampling.cc index b38c68843b..0358a2cd31 100644 --- a/src/ops/sampling.cc +++ 
b/src/ops/sampling.cc @@ -88,7 +88,7 @@ Op *Sampling::create_operator_from_layer( SamplingParams Sampling::get_params() const { SamplingParams params; params.top_p = this->top_p; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -302,7 +302,7 @@ InferenceResult GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); - int batch_size = bc->num_active_tokens(); + int batch_size = bc->num_active_infr_tokens(); Sampling::forward_kernel_wrapper(m, input, indices, batch_size); if (m->inference_debugging) { @@ -313,7 +313,7 @@ InferenceResult } InferenceResult ir; - download_tensor( + copy_tensor_dev_to_host( indices.get_int32_ptr(), ir.token_ids, batch_size); return ir; } diff --git a/src/ops/sigmoid_silu_multi.cc b/src/ops/sigmoid_silu_multi.cc index 3d1c8d9094..e7c2fea19c 100644 --- a/src/ops/sigmoid_silu_multi.cc +++ b/src/ops/sigmoid_silu_multi.cc @@ -52,7 +52,7 @@ bool SigmoidSiluMultiParams::is_valid( SigmoidSiluMultiParams SigmoidSiluMulti::get_params() const { SigmoidSiluMultiParams params; params.layer_guid = this->layer_guid; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -254,7 +254,188 @@ void SigmoidSiluMulti::forward(FFModel const &ff) { } void SigmoidSiluMulti::backward(FFModel const &ff) { - assert(false); + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_backward(ff, argmap); + IndexLauncher launcher(SIGMOID_SILU_MULTI_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + // output grad + launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + // input 1 + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(1, FID_DATA); + // input 2 + launcher.add_region_requirement(RegionRequirement(inputs[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[1]->region)); + launcher.add_field(2, FID_DATA); + // input 1 grad + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(3, FID_DATA); + // input 2 grad + launcher.add_region_requirement(RegionRequirement(inputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[1]->region_grad)); + launcher.add_field(4, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): output grad + regions[1](I): input 1 + regions[2](I): input 2 + regions[3](I/O): input 1 grad + regions[4](I/O): input 2 grad +*/ +void SigmoidSiluMulti::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + + assert(task->regions.size() == regions.size()); + assert(regions.size() == 5); + + SigmoidSiluMultiMeta *m = *((SigmoidSiluMultiMeta **)task->local_args); + + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR input1 = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[1], task->regions[1], 
FID_DATA, ctx, runtime); + GenericTensorAccessorR input2 = helperGetGenericTensorAccessorRO( + m->input_type[1], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorW input1_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); + GenericTensorAccessorW input2_grad = helperGetGenericTensorAccessorRW( + m->input_type[1], regions[4], task->regions[4], FID_DATA, ctx, runtime); + + SigmoidSiluMulti::backward_kernel_wrapper( + m, output_grad, input1, input2, input1_grad, input2_grad); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + SigmoidSiluMulti::save_inference_tensors_to_file( + m, + shard_id, + nullptr, + {output_grad, input1, input2}, + {}, + {input1_grad, input2_grad}); + } +} + +FutureMap + SigmoidSiluMulti::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(SIGMOID_SILU_MULTI_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + // output grad + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + // input 1 grad + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + // input 2 grad + launcher.add_region_requirement( + RegionRequirement(batch_inputs[1]->part_grad, + 0 /*projection id*/, + reset_input_grads[1] ? 
WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region_grad)); + launcher.add_field(2, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): output grad + regions[3](I/O): input 1 grad + regions[4](I/O): input 2 grad +*/ +void SigmoidSiluMulti::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + + assert(task->regions.size() == regions.size()); + assert(regions.size() == 3); + + SigmoidSiluMultiMeta *m = *((SigmoidSiluMultiMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW input1_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW input2_grad = helperGetGenericTensorAccessorRW( + m->input_type[1], regions[2], task->regions[2], FID_DATA, ctx, runtime); + + SigmoidSiluMulti::peft_bwd_kernel_wrapper( + m, bc, output_grad, input1_grad, input2_grad); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + SigmoidSiluMulti::save_inference_tensors_to_file(m, + shard_id, + nullptr, + {input1_grad, input2_grad}, + {}, + {output_grad}, + false); + } } FutureMap SigmoidSiluMulti::inference( @@ -347,7 +528,7 @@ void SigmoidSiluMulti::inference_task( assert(input1_domain == input2_domain); assert(input1_domain == output_domain); - SigmoidSiluMulti::inference_kernel_wrapper(m, input1, input2, output); + SigmoidSiluMulti::inference_kernel_wrapper(m, bc, input1, input2, output); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; diff --git a/src/ops/sigmoid_silu_multi.cpp b/src/ops/sigmoid_silu_multi.cpp index 7b7f30a288..ceaa1a7788 100644 --- a/src/ops/sigmoid_silu_multi.cpp +++ b/src/ops/sigmoid_silu_multi.cpp @@ -23,7 +23,7 @@ namespace FlexFlow { SigmoidSiluMultiMeta::SigmoidSiluMultiMeta(FFHandler handle, SigmoidSiluMulti const *ssm, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ssm) { profiling = ssm->profiling; inference_debugging = ssm->inference_debugging; } @@ -34,36 +34,56 @@ SigmoidSiluMultiMeta::~SigmoidSiluMultiMeta(void) { } } -__device__ __forceinline__ float sigmoid_float(float x) { - return 1.0 / (1.0 + expf(-x)); -} - -__device__ __forceinline__ half sigmoid_half(half x) { - return (half)1.0 / ((half)1.0 + hexp(-x)); -} - -__global__ void SigmoidSiluMultiKernelFloat(int num_elements, - float const *input1_ptr, - float const *input2_ptr, - float *output_ptr) { +template +__global__ void SigmoidSiluMultiKernel(int num_elements, + T const *input1_ptr, + T const *input2_ptr, + T *output_ptr) { CUDA_KERNEL_LOOP(i, num_elements) { - output_ptr[i] = - input1_ptr[i] * sigmoid_float(input1_ptr[i]) * input2_ptr[i]; + float sigmoid_val = static_cast(input1_ptr[i]); + sigmoid_val = 1.0f / (1.0f + exp(-sigmoid_val)); + output_ptr[i] = input1_ptr[i] * T(sigmoid_val) * input2_ptr[i]; } } -__global__ void SigmoidSiluMultiKernelHalf(int num_elements, - half const *input1_ptr, - half const *input2_ptr, - half *output_ptr) { +template +__global__ void SigmoidSiluMultiBackwardKernel(int num_elements, + T const *output_grad_ptr, + T const *input1_ptr, + T const *input2_ptr, + T 
*input1_grad_ptr, + T *input2_grad_ptr, + bool reset_input_grad1, + bool reset_input_grad2) { CUDA_KERNEL_LOOP(i, num_elements) { - output_ptr[i] = input1_ptr[i] * sigmoid_half(input1_ptr[i]) * input2_ptr[i]; + float sigmoid_val = static_cast(input1_ptr[i]); + sigmoid_val = 1.0f / (1.0f + exp(-sigmoid_val)); + + if (reset_input_grad2) { + input2_grad_ptr[i] = + output_grad_ptr[i] * (input1_ptr[i] * T(sigmoid_val)); + } else { + input2_grad_ptr[i] += + output_grad_ptr[i] * (input1_ptr[i] * T(sigmoid_val)); + } + T ss_grad_val = output_grad_ptr[i] * input2_ptr[i]; + if (reset_input_grad1) { + input1_grad_ptr[i] = ss_grad_val * T(sigmoid_val); + } else { + input1_grad_ptr[i] += ss_grad_val * T(sigmoid_val); + } + T sig_grad = ss_grad_val * input1_ptr[i]; + + float x1_grad_val = static_cast(sig_grad); + x1_grad_val = x1_grad_val * sigmoid_val * (1.0f - sigmoid_val); + input1_grad_ptr[i] += T(x1_grad_val); } } /*static*/ void SigmoidSiluMulti::inference_kernel_wrapper( - SigmoidSiluMultiMeta const *m, + SigmoidSiluMultiMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input1, GenericTensorAccessorR const &input2, GenericTensorAccessorW const &output) { @@ -81,8 +101,84 @@ void SigmoidSiluMulti::inference_kernel_wrapper( checkCUDA(hipEventRecord(t_start, stream)); } + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + // FIXME: use the new approach to computing token offset + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t input_tensor_size = + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim; + size_t activation_size_needed = + 2 * data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(hipMemcpyAsync(m->input_activation, + input1.get_float_ptr() + + tokens_previous_requests * in_dim, + input_tensor_size, + hipMemcpyDeviceToDevice, + stream)); + checkCUDA(hipMemcpyAsync( + (void *)((char *)m->input_activation + input_tensor_size), + input2.get_float_ptr() + tokens_previous_requests * in_dim, + input_tensor_size, + hipMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(hipMemcpyAsync(m->input_activation, + input1.get_half_ptr() + + tokens_previous_requests * in_dim, + input_tensor_size, + hipMemcpyDeviceToDevice, + stream)); + 
checkCUDA(hipMemcpyAsync( + (void *)((char *)m->input_activation + input_tensor_size), + input2.get_half_ptr() + tokens_previous_requests * in_dim, + input_tensor_size, + hipMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + if (m->input_type[0] == DT_FLOAT) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiKernelFloat), + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiKernel), GET_BLOCKS(num_elements), min(CUDA_NUM_THREADS, num_elements), 0, @@ -92,7 +188,7 @@ void SigmoidSiluMulti::inference_kernel_wrapper( input2.get_float_ptr(), output.get_float_ptr()); } else if (m->input_type[0] == DT_HALF) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiKernelHalf), + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiKernel), GET_BLOCKS(num_elements), min(CUDA_NUM_THREADS, num_elements), 0, @@ -116,4 +212,159 @@ void SigmoidSiluMulti::inference_kernel_wrapper( } } +/*static*/ +void SigmoidSiluMulti::backward_kernel_wrapper( + SigmoidSiluMultiMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input1, + GenericTensorAccessorR const &input2, + GenericTensorAccessorW const &input1_grad, + GenericTensorAccessorW const &input2_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + int num_elements = output_grad.domain.get_volume(); + assert(input1.domain.get_volume() == num_elements); + assert(input2.domain.get_volume() == num_elements); + assert(input1_grad.domain.get_volume() == num_elements); + assert(input2_grad.domain.get_volume() == num_elements); + + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + if (m->input_type[0] == DT_FLOAT) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiBackwardKernel), + GET_BLOCKS(num_elements), + min(CUDA_NUM_THREADS, num_elements), + 0, + stream, + output_grad.domain.get_volume(), + output_grad.get_float_ptr(), + input1.get_float_ptr(), + input2.get_float_ptr(), + input1_grad.get_float_ptr(), + input2_grad.get_float_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); + } else if (m->input_type[0] == DT_HALF) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiBackwardKernel), + GET_BLOCKS(num_elements), + min(CUDA_NUM_THREADS, num_elements), + 0, + stream, + output_grad.domain.get_volume(), + output_grad.get_half_ptr(), + input1.get_half_ptr(), + input2.get_half_ptr(), + input1_grad.get_half_ptr(), + input2_grad.get_half_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); + } else { + assert(false && "unsupport datatype in SigmoidSiluMulti"); + } + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[SigmoidSiluMulti] backward time (CF) = %.9fms\n", elapsed); + } +} + +/*static*/ +void SigmoidSiluMulti::peft_bwd_kernel_wrapper( + SigmoidSiluMultiMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input1_grad, + GenericTensorAccessorW const &input2_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + assert(input1_grad.domain.get_volume() == output_grad.domain.get_volume()); + assert(input2_grad.domain.get_volume() == input1_grad.domain.get_volume()); + + hipEvent_t 
t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + int num_peft_requests = 0; + int num_peft_tokens = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + } + } + if (num_peft_requests == 0) { + // No PEFT requests + return; + } else { + // Otherwise assume at most 1 peft request + assert(num_peft_requests == 1); + assert(num_peft_tokens >= 1); + } + int in_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + int num_elements = in_dim * num_peft_tokens; + + if (m->input_type[0] == DT_FLOAT) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiBackwardKernel), + GET_BLOCKS(num_elements), + min(CUDA_NUM_THREADS, num_elements), + 0, + stream, + num_elements, + output_grad.get_float_ptr(), + static_cast(m->input_activation), + static_cast(m->input_activation) + + num_peft_tokens * in_dim, + input1_grad.get_float_ptr(), + input2_grad.get_float_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); + } else if (m->input_type[0] == DT_HALF) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiBackwardKernel), + GET_BLOCKS(num_elements), + min(CUDA_NUM_THREADS, num_elements), + 0, + stream, + num_elements, + output_grad.get_half_ptr(), + static_cast(m->input_activation), + static_cast(m->input_activation) + + num_peft_tokens * in_dim, + input1_grad.get_half_ptr(), + input2_grad.get_half_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); + } else { + assert(false && "unsupport datatype in SigmoidSiluMulti"); + } + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[SigmoidSiluMulti] peft_bwd time (CF) = %.9fms\n", elapsed); + } +} + }; // namespace FlexFlow diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu index 590b641b5a..929d557a17 100644 --- a/src/ops/sigmoid_silu_multi.cu +++ b/src/ops/sigmoid_silu_multi.cu @@ -22,7 +22,7 @@ namespace FlexFlow { SigmoidSiluMultiMeta::SigmoidSiluMultiMeta(FFHandler handle, SigmoidSiluMulti const *ssm, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ssm) { profiling = ssm->profiling; inference_debugging = ssm->inference_debugging; } @@ -45,9 +45,44 @@ __global__ void SigmoidSiluMultiKernel(int num_elements, } } +template +__global__ void SigmoidSiluMultiBackwardKernel(int num_elements, + T const *output_grad_ptr, + T const *input1_ptr, + T const *input2_ptr, + T *input1_grad_ptr, + T *input2_grad_ptr, + bool reset_input_grad1, + bool reset_input_grad2) { + CUDA_KERNEL_LOOP(i, num_elements) { + float sigmoid_val = static_cast(input1_ptr[i]); + sigmoid_val = 1.0f / (1.0f + exp(-sigmoid_val)); + + if (reset_input_grad2) { + input2_grad_ptr[i] = + output_grad_ptr[i] * (input1_ptr[i] * T(sigmoid_val)); + } else { + input2_grad_ptr[i] += + output_grad_ptr[i] * (input1_ptr[i] * T(sigmoid_val)); + } + T ss_grad_val = output_grad_ptr[i] * input2_ptr[i]; + if (reset_input_grad1) { + input1_grad_ptr[i] = ss_grad_val * T(sigmoid_val); + } else { + input1_grad_ptr[i] += ss_grad_val * 
T(sigmoid_val); + } + T sig_grad = ss_grad_val * input1_ptr[i]; + + float x1_grad_val = static_cast(sig_grad); + x1_grad_val = x1_grad_val * sigmoid_val * (1.0f - sigmoid_val); + input1_grad_ptr[i] += T(x1_grad_val); + } +} + /*static*/ void SigmoidSiluMulti::inference_kernel_wrapper( - SigmoidSiluMultiMeta const *m, + SigmoidSiluMultiMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input1, GenericTensorAccessorR const &input2, GenericTensorAccessorW const &output) { @@ -64,6 +99,83 @@ void SigmoidSiluMulti::inference_kernel_wrapper( cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } + + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + // FIXME: use the new approach to computing token offset + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t input_tensor_size = + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim; + size_t activation_size_needed = + 2 * data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync(m->input_activation, + input1.get_float_ptr() + + tokens_previous_requests * in_dim, + input_tensor_size, + cudaMemcpyDeviceToDevice, + stream)); + checkCUDA(cudaMemcpyAsync( + (void *)((char *)m->input_activation + input_tensor_size), + input2.get_float_ptr() + tokens_previous_requests * in_dim, + input_tensor_size, + cudaMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(cudaMemcpyAsync(m->input_activation, + input1.get_half_ptr() + + tokens_previous_requests * in_dim, + input_tensor_size, + cudaMemcpyDeviceToDevice, + stream)); + checkCUDA(cudaMemcpyAsync( + (void *)((char *)m->input_activation + input_tensor_size), + input2.get_half_ptr() + tokens_previous_requests * in_dim, + input_tensor_size, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + if (m->input_type[0] == DT_FLOAT) { SigmoidSiluMultiKernel<<profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + if (m->input_type[0] == DT_FLOAT) { + SigmoidSiluMultiBackwardKernel<<>>(output_grad.domain.get_volume(), + output_grad.get_float_ptr(), + input1.get_float_ptr(), + input2.get_float_ptr(), + input1_grad.get_float_ptr(), + 
input2_grad.get_float_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); + } else if (m->input_type[0] == DT_HALF) { + SigmoidSiluMultiBackwardKernel<<>>(output_grad.domain.get_volume(), + output_grad.get_half_ptr(), + input1.get_half_ptr(), + input2.get_half_ptr(), + input1_grad.get_half_ptr(), + input2_grad.get_half_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); + } else { + assert(false && "unsupport datatype in SigmoidSiluMulti"); + } + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[SigmoidSiluMulti] backward time (CF) = %.9fms\n", elapsed); + } +} + +/*static*/ +void SigmoidSiluMulti::peft_bwd_kernel_wrapper( + SigmoidSiluMultiMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input1_grad, + GenericTensorAccessorW const &input2_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + assert(input1_grad.domain.get_volume() == output_grad.domain.get_volume()); + assert(input2_grad.domain.get_volume() == input1_grad.domain.get_volume()); + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + int num_peft_requests = 0; + int num_peft_tokens = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + } + } + if (num_peft_requests == 0) { + // No PEFT requests + return; + } else { + // Otherwise assume at most 1 peft request + assert(num_peft_requests == 1); + assert(num_peft_tokens >= 1); + } + int in_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + int num_elements = in_dim * num_peft_tokens; + + if (m->input_type[0] == DT_FLOAT) { + SigmoidSiluMultiBackwardKernel<<>>( + num_elements, + output_grad.get_float_ptr(), + static_cast(m->input_activation), + static_cast(m->input_activation) + + num_peft_tokens * in_dim, + input1_grad.get_float_ptr(), + input2_grad.get_float_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); + } else if (m->input_type[0] == DT_HALF) { + SigmoidSiluMultiBackwardKernel<<>>( + num_elements, + output_grad.get_half_ptr(), + static_cast(m->input_activation), + static_cast(m->input_activation) + + num_peft_tokens * in_dim, + input1_grad.get_half_ptr(), + input2_grad.get_half_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); + } else { + assert(false && "unsupport datatype in SigmoidSiluMulti"); + } + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[SigmoidSiluMulti] peft_bwd time (CF) = %.9fms\n", elapsed); + } +} + }; // namespace FlexFlow diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 03618423be..a02d88b98b 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -86,7 +86,7 @@ SoftmaxParams Softmax::get_params() const { SoftmaxParams params; params.layer_guid = this->layer_guid; params.dim = this->dim; - if (this->name != nullptr) { + if (strlen(this->name) < 
MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -270,52 +270,12 @@ OpMeta *Softmax::init_task(Task const *task, domain = input_domain; } SoftmaxMeta *m = new SoftmaxMeta(handle, softmax, domain); - m->input_type = softmax->inputs[0]->data_type; - m->output_type = softmax->outputs[0]->data_type; // checkCUDNN(cudnnCreateTensorDescriptor(&m->outputTensor)); std::strcpy(m->op_name, softmax->name); m->layer_guid = softmax->layer_guid; return m; } -FutureMap Softmax::inference(FFModel const &ff, - BatchConfigFuture const &bc, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - parallel_is = batch_outputs[0]->parallel_is; - MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; - set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); - size_t machine_view_hash = view->hash(); - /* std::cout << "Softmax op machine_view: " << *(MachineView const *)mv - << std::endl; */ - IndexLauncher launcher(SOFTMAX_INF_TASK_ID, - parallel_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); - launcher.add_future(bc); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(1, FID_DATA); - return runtime->execute_index_space(ctx, launcher); -} - void Softmax::forward(FFModel const &ff) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; @@ -354,17 +314,11 @@ void Softmax::forward_task(Task const *task, ctx, task->regions[0].region.get_index_space()); SoftmaxMeta const *m = *((SoftmaxMeta **)task->local_args); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->output_type, regions[0], task->regions[0], FID_DATA, ctx, runtime); + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type, regions[1], task->regions[1], FID_DATA, ctx, runtime); + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - if (m->output_type == DT_HALF) { - forward_kernel_wrapper(m, input.get_half_ptr(), output.get_half_ptr()); - } else if (m->output_type == DT_FLOAT) { - forward_kernel_wrapper(m, input.get_float_ptr(), output.get_float_ptr()); - } else { - assert(false && "Unsupported data type"); - } + forward_kernel_wrapper(m, input, output); } void Softmax::backward(FFModel const &ff) { @@ -402,52 +356,69 @@ void Softmax::backward_task(Task const *task, Domain in_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); SoftmaxMeta const *m = *((SoftmaxMeta **)task->local_args); - switch (in_domain.get_dim()) { -#define DIMFUNC(DIM) \ - case DIM: \ - if (m->output_type == DT_HALF) { \ - return backward_task_with_dim(task, regions, ctx, runtime); \ - } else if (m->output_type == DT_FLOAT) { \ - return backward_task_with_dim(task, regions, ctx, runtime); \ - } else { \ - assert(false && "Unsupported data type"); \ - } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - assert(false); - } + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], 
FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + backward_kernel_wrapper(m, input_grad, output_grad); } -/* - regions[0](I/O): input_grad - regions[1](I): output_grad -*/ -// Note that the backward task of softmax is actually a no op (i.e., input_grad -// = output_grad) since the upstream cross_entropy_loss function computes -// performs softmax_cross_entropy_loss to avoid intermediate zeros -template -void Softmax::backward_task_with_dim(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - // const Softmax* softmax = (Softmax*) task->args; - SoftmaxMeta const *m = *((SoftmaxMeta **)task->local_args); - TensorAccessorW acc_input_grad(regions[0], - task->regions[0], - FID_DATA, - ctx, - runtime, - true /*readOutput*/); - TensorAccessorR acc_output_grad( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - // make sure the image indices match! - assert(acc_input_grad.rect == acc_output_grad.rect); - - backward_kernel_wrapper( - m, acc_input_grad.ptr, acc_output_grad.ptr, acc_input_grad.rect.volume()); +FutureMap Softmax::inference(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + /* std::cout << "Softmax op machine_view: " << *(MachineView const *)mv + << std::endl; */ + IndexLauncher launcher(SOFTMAX_INF_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + // if this is the last operator, we add the region below in order to copy the + // output to the grad tensor + assert(ff.config.computationMode == COMP_MODE_INFERENCE); + int last_op = ff.operators.size() - 1; + assert(ff.operators[last_op]->op_type == OP_ARGMAX || + ff.operators[last_op]->op_type == OP_ARG_TOPK || + ff.operators[last_op]->op_type == OP_SAMPLING); + last_op -= 1; + while (ff.operators[last_op]->op_type == OP_WEIGHT && last_op > 0) { + last_op -= 1; + } + if (ff.operators[last_op] == this) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(2, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); } void Softmax::inference_task(Task const *task, @@ -455,8 +426,8 @@ void Softmax::inference_task(Task const *task, Context ctx, Runtime *runtime) { assert(task->regions.size() == regions.size()); - assert(regions.size() == 2); - assert(task->regions.size() == 2); + assert(regions.size() == 3 || regions.size() == 2); + bool is_last_op = 
(regions.size() == 3); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_tokens == 0) { return; @@ -465,16 +436,19 @@ void Softmax::inference_task(Task const *task, ctx, task->regions[0].region.get_index_space()); SoftmaxMeta *m = *((SoftmaxMeta **)task->local_args); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->output_type, regions[0], task->regions[0], FID_DATA, ctx, runtime); + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type, regions[1], task->regions[1], FID_DATA, ctx, runtime); - if (m->output_type == DT_HALF) { - forward_kernel_wrapper(m, input.get_half_ptr(), output.get_half_ptr()); - } else if (m->output_type == DT_FLOAT) { - forward_kernel_wrapper(m, input.get_float_ptr(), output.get_float_ptr()); - } else { - assert(false && "Unsupported data type"); + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW output_grad; + if (is_last_op) { + output_grad = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); } + inference_kernel_wrapper(m, bc, is_last_op, input, output, output_grad); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; @@ -483,6 +457,73 @@ void Softmax::inference_task(Task const *task, } } +FutureMap Softmax::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + /* std::cout << "Softmax op machine_view: " << *(MachineView const *)mv + << std::endl; */ + IndexLauncher launcher(SOFTMAX_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + reset_input_grads[0] ? 
WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +void Softmax::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == regions.size()); + assert(regions.size() == 2); + assert(task->regions.size() == 2); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + Domain in_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + SoftmaxMeta *m = *((SoftmaxMeta **)task->local_args); + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + peft_bwd_kernel_wrapper(m, bc, input_grad, output_grad); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + Softmax::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {}, {output_grad}, false); + } +} + bool Softmax::get_int_parameter(PMParameter para, int *value) const { switch (para) { case PM_SOFTMAX_DIM: @@ -508,29 +549,35 @@ bool Softmax::measure_operator_cost(Simulator *sim, sim->free_all(); float *input_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); + GenericTensorAccessorR input_acc(DT_FLOAT, sub_input.get_domain(), input_ptr); assert(input_ptr != NULL); cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); float *output_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); + GenericTensorAccessorW output_acc( + DT_FLOAT, sub_output.get_domain(), output_ptr); assert(output_ptr != NULL); cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); std::function forward, backward; - forward = [&] { forward_kernel_wrapper(m, input_ptr, output_ptr); }; + forward = [&] { forward_kernel_wrapper(m, input_acc, output_acc); }; if (sim->computationMode == COMP_MODE_TRAINING) { float *input_grad_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); + GenericTensorAccessorW input_grad_acc( + DT_FLOAT, sub_input.get_domain(), input_grad_ptr); assert(input_grad_ptr != NULL); cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); float *output_grad_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); + GenericTensorAccessorW output_grad_acc( + DT_FLOAT, sub_output.get_domain(), output_grad_ptr); assert(output_grad_ptr != NULL); cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); backward = [&] { - backward_kernel_wrapper( - m, input_grad_ptr, output_grad_ptr, sub_output.get_volume()); + backward_kernel_wrapper(m, input_grad_acc, output_grad_acc); }; } diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc index 68d3a4c205..52da51fb26 100644 --- a/src/ops/spec_inc_multihead_self_attention.cc +++ b/src/ops/spec_inc_multihead_self_attention.cc @@ -850,7 +850,7 @@ SpecIncMultiHeadSelfAttentionParams params.scaling_factor = this->scaling_factor; 
params.qk_prod_scaling = this->qk_prod_scaling; params.position_bias = this->position_bias; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index b1687d12a2..aebd5e8892 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -141,7 +141,7 @@ template void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, BeamSearchBatchConfig const *bc, hipStream_t stream) { - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); int curr_depth = bc->beamRequestsInfo[0].current_depth; // printf("curr depth: %d\n", curr_depth); // assert(curr_depth < 3); @@ -200,15 +200,16 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) hipblasDatatype_t compute_type = hipblas_data_type; -#else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - hipblasDatatype_t compute_type = hipblas_data_type; -#endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipblasDatatype_t compute_type = hipblas_data_type; + // #else + // // TODO: currently use the hipblas_data_type + // // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // hipblasDatatype_t compute_type = hipblas_data_type; + // #endif // int num_requests = bc->num_active_requests(); - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); int tokens_previous_requests = 0; int tokens_prev_requests_squares = 0; // int qkv_block_size = diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index a00ea9c95f..4688a8233c 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -361,7 +361,7 @@ template void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, BeamSearchBatchConfig const *bc, cudaStream_t stream) { - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); int curr_depth = bc->beamRequestsInfo[0].current_depth; if (num_tokens > 0) { int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens; @@ -471,17 +471,18 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half 
precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_tokens(); int tokens_previous_requests = 0; @@ -541,20 +542,9 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, DT const *A = static_cast
(m->devQKVProjArray) + bc->requestsInfo[i].first_token_offset_in_batch * m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; - // To get B, skip over K entries from previous requests (all heads + - // padding) - - // print_tensor((float*)A, 32, "A"); DT const *B = static_cast
<DT *>(m->keyCache) + i * kt_req_block_size; + DT *C = static_cast<DT *>
(m->qk_prods); - // if (i == 0 && sub_req_id == 0 && - // bc->beam_slots.at(0).current_depth == 1) { - // int offset = (float *)B - m->keyCache; - // printf("key cache offset %d\n", kt_req_block_size); - // } - // To get C, skip over QK^T products from previous requests - DT *C = static_cast
(m->qk_prods) + - m->num_q_heads * tokens_prev_requests_squares; checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, @@ -854,29 +844,15 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { beam_token_infos = - reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo)); - + static_cast( + handler.batch_config_metadata->beamTokenInfo); beam_request_infos = - reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo)); - causalMask = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo) + - sizeof(BeamSearchBatchConfig::beamRequestsInfo)); - - request_completed = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo) + - sizeof(BeamSearchBatchConfig::beamRequestsInfo) + - sizeof(BatchConfig::causalMask)); + static_cast( + handler.batch_config_metadata->beamRequestsInfo); + causalMask = static_cast( + handler.batch_config_metadata->causalMask); + request_completed = + static_cast(handler.batch_config_metadata->request_completed); } cudaStreamSynchronize(stream); diff --git a/src/ops/split.cc b/src/ops/split.cc index 7c6b631b20..92cfbd49e9 100644 --- a/src/ops/split.cc +++ b/src/ops/split.cc @@ -50,7 +50,7 @@ SplitParams Split::get_params() const { SplitParams params; params.splits = this->splits; params.legion_axis = this->legion_axis; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; diff --git a/src/ops/topk.cc b/src/ops/topk.cc index 7d30a8aff3..0e88befa68 100644 --- a/src/ops/topk.cc +++ b/src/ops/topk.cc @@ -87,7 +87,7 @@ TopKParams TopK::get_params() const { TopKParams params; params.k = this->k; params.sorted = this->sorted; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -226,7 +226,7 @@ OpMeta *TopK::init_task(Task const *task, Runtime *runtime) { TopK *topk = (TopK *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - TopKMeta *m = new TopKMeta(handle); + TopKMeta *m = new TopKMeta(handle, topk); m->profiling = topk->profiling; m->inference_debugging = topk->inference_debugging; m->sorted = topk->sorted; @@ -474,7 +474,7 @@ bool TopK::measure_operator_cost(Simulator *sim, return false; } - TopKMeta *m = new TopKMeta(sim->handler); + TopKMeta *m = new TopKMeta(sim->handler, this); m->sorted = sorted; // allocate diff --git a/src/ops/topk.cpp b/src/ops/topk.cpp index b6e898b654..303c6e85e9 100644 --- a/src/ops/topk.cpp +++ b/src/ops/topk.cpp @@ -513,6 +513,7 @@ void TopK::backward_kernel_wrapper(TopKMeta const *m, // TODO: missing profiling here } -TopKMeta::TopKMeta(FFHandler handler) : OpMeta(handler) {} +TopKMeta::TopKMeta(FFHandler handler, TopK const *topk) + : OpMeta(handler, topk) {} }; // namespace FlexFlow diff --git a/src/ops/topk.cu b/src/ops/topk.cu index cc87ee8a42..cfb2bf6448 100644 --- a/src/ops/topk.cu +++ b/src/ops/topk.cu @@ -509,6 +509,7 @@ void TopK::backward_kernel_wrapper(TopKMeta const *m, } } -TopKMeta::TopKMeta(FFHandler 
handler) : OpMeta(handler) {} +TopKMeta::TopKMeta(FFHandler handler, TopK const *topk) + : OpMeta(handler, topk) {} }; // namespace FlexFlow diff --git a/src/ops/transpose.cc b/src/ops/transpose.cc index 7a179c4f7d..bffde477de 100644 --- a/src/ops/transpose.cc +++ b/src/ops/transpose.cc @@ -51,7 +51,7 @@ TransposeParams Transpose::get_params() const { for (int i = 0; i < outputs[0]->num_dims; i++) { params.perm.push_back(this->perm[i]); } - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -193,7 +193,7 @@ OpMeta *Transpose::init_task(Task const *task, Domain out_domain = runtime->get_index_space_domain( ctx, task->regions[1].region.get_index_space()); - TransposeMeta *m = new TransposeMeta(handle); + TransposeMeta *m = new TransposeMeta(handle, transpose); transpose->init_meta(m, in_domain, out_domain); m->profiling = transpose->profiling; m->inference_debugging = transpose->inference_debugging; @@ -320,7 +320,7 @@ bool Transpose::measure_operator_cost(Simulator *sim, return false; } - TransposeMeta *m = sim->transpose_meta; + TransposeMeta *m = new TransposeMeta(sim->handler, this); this->init_meta(m, sub_input.get_domain(), sub_output.get_domain()); sim->free_all(); diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index df722a3d51..132a48be40 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -924,7 +924,7 @@ TreeIncMultiHeadSelfAttentionParams params.qk_prod_scaling = this->qk_prod_scaling; params.position_bias = this->position_bias; params.tensor_parallelism_degree = this->tensor_parallelism_degree; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 26291fb3b4..890d32bc87 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -16,6 +16,8 @@ #include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/ffconst_utils.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" +#include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" +#include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/utils/hip_helper.h" #include #include @@ -26,11 +28,333 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Memory; +#define WARP_SIZE 32 + using namespace Kernels::IncMultiHeadAttention; namespace Kernels { namespace TreeIncMultiHeadAttention { +template +__device__ __forceinline__ T + WARP_SHFL(unsigned mask, T var, int srcLane, int width = warpSize) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_sync(mask, var, srcLane, width); +#else + return __shfl(var, srcLane, width); +#endif +} + +template +__device__ __forceinline__ T + WARP_SHFL_XOR(unsigned mask, T var, int laneMask, int width = warpSize) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_xor_sync(mask, var, laneMask, width); +#else + return __shfl_xor(var, laneMask, width); +#endif +} + +template +__global__ void compute_attention_kernel_fused_kernel( + DT const *query, + DT const *key_cache, + DT const *value_cache, + DT *output_ptr, + float const scale, + int const max_seq_length, + int const max_token_per_batch, + int per_head_size, + int hidden_size, + BatchConfig::PerRequestInfo *request_infos, + int num_heads, + int num_requests, + 
BatchConfig::BitMask *causalMask, + bool *request_completed, + int qk_smem_sz) { + + // q, k + using Q_vec = typename VEC_K::Type; + using K_vec = typename VEC_K::Type; + using V_vec = typename VEC_V
::Type; + using Out_sum = typename Vec_fp32_::Type; + + constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; + + constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(DT); + constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY; + constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; + // constexpr int QK_ELTS_IN_16B = 16 / sizeof(DT); + + // thread id + int const tidx = threadIdx.x; + // head id + int const head_idx = blockIdx.x; + // request idx + int const request_idx = blockIdx.y; + + int const batch_config_request_id = + request_infos[request_idx].batch_config_request_id; + + int const first_step = 0; + + int const tlength = + request_infos[batch_config_request_id].first_token_depth_in_request + + request_infos[batch_config_request_id].num_tokens_in_batch; + int const qlength = + request_infos[batch_config_request_id].num_tokens_in_batch; + + BatchConfig::BitMask bitmask = causalMask[batch_config_request_id]; + + int first_token_idx = 0; + for (int r = 0; r < batch_config_request_id; r++) { + first_token_idx += + request_completed[r] ? 0 : request_infos[r].num_tokens_in_batch; + } + + bool prompt_phase = request_infos[batch_config_request_id].prompt_phase; + int q_start = + request_infos[batch_config_request_id].first_token_depth_in_request; + + // shared memory objects + extern __shared__ char smem_[]; + + float *qk_smem = reinterpret_cast(smem_); + float *out_smem = reinterpret_cast(smem_ + qk_smem_sz); + + float qk_max = -FLT_MAX; + + // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum + __shared__ float red_smem[WARPS_PER_BLOCK * 2]; + + const DT *q_ptr = query + first_token_idx * hidden_size * QKV_WEIGHT_NUM + + head_idx * per_head_size; + __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; + + // the start offset of the element eg. (0, 1, 2, 3) * K_VEC_SIZE + int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE; + int ki_o = tidx % THREADS_PER_KEY; + // the first key's offset for this thread + // ko = 0, 0, 0, 0, 1, 1, 1, 1, .... + int ko = tidx / THREADS_PER_KEY; + // load q tensor + Q_vec q_vec[K_VECS_PER_THREAD]; + + constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY; + // The number of keys per warp. + constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; + + DT const *k_cache_batch = + key_cache + batch_config_request_id * max_seq_length * hidden_size + ki; + + int ti_end = + div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; + + for (int qi = 0; qi < qlength; qi += 1) { +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + q_vecs[ki_o][ii] = *reinterpret_cast( + q_ptr + (hidden_size * QKV_WEIGHT_NUM * qi) + ki + + ii * THREADS_PER_KEY * K_VEC_SIZE); + + // if (head_idx == 0 && request_idx == 1 && tidx == 0) { + // printf("laod q %d, %d %.10f\n", + // request_idx, + // qi,q_vecs[ki_o][ii].x); + // } + } + + __syncthreads(); + for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { + K_vec k[K_VECS_PER_THREAD]; + int const ti_circ = ti % max_seq_length; + + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; + if (ti < tlength) { + k[ii] = *reinterpret_cast( + k_cache_batch + ti_circ * hidden_size + head_idx * per_head_size + + jj); + } + } + float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); + + if (ti < tlength && tidx % THREADS_PER_KEY == 0) { + bool const mask = + prompt_phase ? (qi + q_start < ti) + : (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << qi)))); + + qk_max = mask ? 
qk_max : fmaxf(qk_max, qk); + + // if (head_idx == 0 && !mask) { + // printf("tree attn qkqkqkqk request id %d qi%d, ti %d, %.10f, %.10f, + // %.10f, %d\n", + // request_idx, + // qi, + // ti, + // qk, + // q_vecs[ki_o][0].x, + // k[0].x, + // bitmask.non_tree_cache_size); + // } + qk_smem[ti - first_step] = mask ? 0.0f : qk; + } + } + + __syncthreads(); + +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { + qk_max = fmaxf(qk_max, WARP_SHFL_XOR(uint32_t(-1), qk_max, mask)); + } + + // Decompose the thread index into warp and lane. + int const warp = tidx / WARP_SIZE; + int const lane = tidx % WARP_SIZE; + + // The warp leader writes the max to shared memory. + if (lane == 0) { + red_smem[warp] = qk_max; + } + + // Make sure the products are in shared memory. + __syncthreads(); + + // The warps finalize the reduction. + qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; +#pragma unroll + for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { + qk_max = fmaxf(qk_max, WARP_SHFL_XOR(uint32_t(-1), qk_max, mask)); + } + + // Broadcast to all the threads in the warp. + qk_max = WARP_SHFL(uint32_t(-1), qk_max, 0); + + // if (head_idx == 0 && qi == 9 && tidx == 0) { + // printf("tree attn first token qk_max %f\n", qk_max); + // } + + float exp_sum = 0.f; + for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { + bool const mask = + prompt_phase ? (q_start + qi < ti) + : (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << qi)))); + float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max); + exp_sum += logit; + qk_smem[ti - first_step] = mask ? 0.0f : logit; + } + + // Compute the sum. + exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); + + // softmax + float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); + for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { + qk_smem[ti - first_step] *= inv_sum; + } + + __syncthreads(); + + // value projection + constexpr int V_VEC_SIZE = 16 / sizeof(DT); + // A vector of V elements for the current timestep. + // using V_vec_k = typename V_vec_k_::Type; + // using V_vec_acum = typename V_vec_acum_fp32_::Type; + + // The value computed by this thread. + int vo = tidx / THREADS_PER_VALUE; + // The hidden dimensions computed by this particular thread. + int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; + constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; + + Out_sum out; + zero(out); + + // The base pointer for the value in the cache buffer. + DT const *v_cache_batch = + value_cache + batch_config_request_id * max_seq_length * hidden_size + + vi; + + if (Dh == Dh_MAX || vi < Dh) { + for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) { + // Load the values from the cache. + int const ti_circ = ti % max_seq_length; + // int const real_cache_idx = topology.real_token_pos[sub_req_idx][ti]; + V_vec v = *reinterpret_cast( + v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); + + if (ti < tlength) { + bool const mask = + prompt_phase + ? (q_start + qi < ti) + : (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << qi)))); + float logit = mask ? 0.0f : qk_smem[ti - first_step]; + out = FlexFlow::fma(logit, cast_to_float(v), out); + } + } + } + + // // Make sure we can start writing to shared memory. + __syncthreads(); + + // Run the final reduction amongst the different groups computing different + // partial outputs. 
+ if (Dh == Dh_MAX || vi < Dh) { +#pragma unroll + for (int active_groups = V_PER_ITER; active_groups >= 2; + active_groups /= 2) { + + // The midpoint in the number of active groups. + int midpoint = active_groups / 2; + + // The upper part of active threads store to shared memory. + if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) { + *reinterpret_cast(out_smem + (vo - midpoint) * Dh + vi) = + out; + } + __syncthreads(); + + // The bottom warps update their values. + if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) { + out = add(*reinterpret_cast(out_smem + vo * Dh + vi), + out); + } + __syncthreads(); + } + } + + // Output the final values. + if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { + convert_from_float(*reinterpret_cast( + output_ptr + (first_token_idx + qi) * hidden_size + + head_idx * per_head_size + vi), + out); + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { + // printf("tree attn final value, %.9f, %.9f, %.9f, %.9f, %d, %d\n", + // out.x, + // out.y, + // out.z, + // out.w, + // vi, + // (first_token_idx + qi) * hidden_size + head_idx * + // per_head_size + + // vi); + // } + } + } +} + template __global__ void commit_tokens_kernel( DT const *devQKVProjArray, @@ -45,15 +369,15 @@ __global__ void commit_tokens_kernel( int max_seq_len, int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens_to_commit * hidden_size * 2) { + CUDA_KERNEL_LOOP(i, num_tokens_to_commit * hidden_size) { - int token_pos = i / (hidden_size * KV_WEIGHT_NUM); + int token_pos = i / (hidden_size); int token_idx_in_last_batch = committedTokenInfos[token_pos].token_index; int offset = i % hidden_size; assert(token_idx_in_last_batch < num_active_tokens_in_last_batch); - size_t val_idx = - token_idx_in_last_batch * 3 * hidden_size + hidden_size + offset; + size_t val_idx = token_idx_in_last_batch * QKV_WEIGHT_NUM * hidden_size + + hidden_size + offset; DT kVal = devQKVProjArray[val_idx]; DT vVal = devQKVProjArray[val_idx + hidden_size]; @@ -89,8 +413,9 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, m->kProjSize, m->vProjSize, num_tokens_to_commit, - m->num_active_tokens, // number of active tokens in previous batch - BatchConfig::max_sequence_length(), + m->num_active_infr_tokens, // number of active tokens in previous batch + BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num(), m->hidden_size); } } @@ -109,12 +434,15 @@ __global__ void update_tree_branch_kv_cache( int total_tokens_in_batch, int max_seq_len, int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens_in_branch * hidden_size * 2) { - int token_idx = i / (hidden_size * KV_WEIGHT_NUM); + CUDA_KERNEL_LOOP(i, num_tokens_in_branch * hidden_size) { + + int token_idx = i / (hidden_size); int offset = i % hidden_size; token_idx += processed_tokens_in_batch; // get index in the whole batch - size_t val_idx = token_idx * 3 * hidden_size + hidden_size + offset; + size_t val_idx = + token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; + DT kVal = devQKVProjArray[val_idx]; DT vVal = devQKVProjArray[val_idx + hidden_size]; @@ -127,6 +455,53 @@ __global__ void update_tree_branch_kv_cache( } } +template +__global__ void update_tree_branch_kv_cache_fused( + DT const *devQKVProjArray, + DT *kCache_ptr, + DT *vCache_ptr, + TreeVerifyBatchConfig::PerTokenInfo const *tokenInfos, + BatchConfig::PerRequestInfo *request_infos, + int qProjSize, + int kProjSize, + int vProjSize, + int num_new_tokens, + int max_seq_len, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_new_tokens * 
hidden_size) { + + int token_idx = i / hidden_size; + int offset = i % hidden_size; + size_t val_idx = + token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; + + DT kVal = devQKVProjArray[val_idx]; + DT vVal = devQKVProjArray[val_idx + hidden_size]; + + int const req_id = tokenInfos[token_idx].request_index; + // int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + + int const request_token_offset = + request_infos[req_id].first_token_offset_in_batch; + int const first_token_depth = + request_infos[req_id].first_token_depth_in_request; + + // if(i % hidden_size == 0){ + // printf("update token request id: %d, %d, %d real id %d, value%.10f\n", + // req_id, token_idx, request_token_offset,(token_idx + first_token_depth + // - request_token_offset), kVal); + // } + kCache_ptr[req_id * (hidden_size * max_seq_len) + + (token_idx + first_token_depth - request_token_offset) * + hidden_size + + offset] = kVal; + vCache_ptr[req_id * (hidden_size * max_seq_len) + + (token_idx + first_token_depth - request_token_offset) * + hidden_size + + offset] = vVal; + } +} + template __global__ void tree_fill_entries_above_diagonal(DT *matrix, size_t new_tokens, @@ -157,13 +532,14 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) hipblasDatatype_t compute_type = hipblas_data_type; -#else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - hipblasDatatype_t compute_type = hipblas_data_type; -#endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipblasDatatype_t compute_type = hipblas_data_type; + // #else + // // TODO: currently use the hipblas_data_type + // // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // hipblasDatatype_t compute_type = hipblas_data_type; + // #endif // int num_requests = bc->num_active_requests(); int processed_tokens_in_batch = 0; // int qkv_block_size = @@ -171,16 +547,20 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, int q_block_size = m->qProjSize; int kt_block_size = m->kProjSize; int kt_req_block_size = - kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num(); int vt_block_size = m->vProjSize; int vt_req_block_size = - vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num(); assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } + assert(processed_tokens_in_batch == + bc->requestsInfo[i].first_token_offset_in_batch); int last_token_idx_of_the_request = processed_tokens_in_batch + bc->requestsInfo[i].num_tokens_in_batch - 1; while (processed_tokens_in_batch <= last_token_idx_of_the_request) { @@ -213,7 +593,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_new_tokens, // num_tokens_in_branch processed_tokens_in_batch, // num_processed_tokens_in_batch - m->num_active_tokens, // total_tokens_in_batch + m->num_active_infr_tokens, // total_tokens_in_batch 
BatchConfig::max_sequence_length(), m->hidden_size); } @@ -335,24 +715,23 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, MIOPEN_SOFTMAX_MODE_CHANNEL)); // Matmul softmax(QK^T/sqrt(d_k)) by V alpha = 1.0f, beta = 0.0f; - m_ = num_new_tokens; - n = m->vProjSize; + m_ = m->vProjSize; + n = num_new_tokens; k = total_tokens_in_request; - lda = m_, ldb = n * m->num_q_heads, ldc = m_; - strideA = num_new_tokens * total_tokens_in_request; - strideB = vt_block_size; - strideC = num_new_tokens * m->vProjSize; - // To get A, skip over softmax(QK^T/sqrt(d_k)) entries from previous - // requests (all heads) - A = C_softmax; - // To get B, skip over V^T entries from previous requests (all heads + + lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; + strideA = vt_block_size; + strideB = num_new_tokens * total_tokens_in_request; + strideC = m->vProjSize; + // To get A, skip over V^T entries from previous requests (all heads + // padding) - B = static_cast
<DT *>(m->valueCache) + i * vt_req_block_size; + A = static_cast<DT *>
(m->valueCache) + i * vt_req_block_size; + // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous + // requests (all heads) + B = C_softmax; // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous // requests C = static_cast
(m->attn_heads) + processed_tokens_in_batch * m->num_q_heads * m->vProjSize; - checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, HIPBLAS_OP_N, HIPBLAS_OP_T, @@ -376,45 +755,44 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, HIPBLAS_GEMM_DEFAULT)); - - // Project to output, save result directly on output tensor - alpha = 1.0f, beta = 0.0f; - m_ = m->oProjSize; - k = m->vProjSize * m->num_q_heads; - n = num_new_tokens; - lda = k, ldb = n, ldc = m_; - A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_q_heads + - m->vProjSize * m->num_q_heads); - B = C; - C = static_cast
(output_ptr) + - processed_tokens_in_batch * m->oProjSize; - - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - B, - hipblas_data_type, - ldb, - &beta, - C, - hipblas_data_type, - ldc, - compute_type, - HIPBLAS_GEMM_DEFAULT)); processed_tokens_in_batch += num_new_tokens; } // Before moving to the next request // check that we have finished all tokens of the request assert(last_token_idx_of_the_request + 1 == processed_tokens_in_batch); } + // Project to output, save result directly on output tensor + DT alpha = 1.0f, beta = 0.0f; + int m_ = m->oProjSize; + int k = m->vProjSize * m->num_q_heads; + int n = processed_tokens_in_batch; + int lda = k, ldb = k, ldc = m_; + DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + + m->kProjSize * m->num_q_heads + + m->vProjSize * m->num_q_heads); + DT const *B = static_cast
<DT *>(m->attn_heads); + DT *C = static_cast<DT *>
(output_ptr); + + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + hipblas_data_type, + lda, + B, + hipblas_data_type, + ldb, + &beta, + C, + hipblas_data_type, + ldc, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + if (*m->final_bias && shard_id == 0) { int parallelism = m->oProjSize * processed_tokens_in_batch; int qkv_weight_size = m->qProjSize * m->global_num_q_heads + @@ -432,7 +810,85 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->oProjSize); } - assert(processed_tokens_in_batch == bc->num_active_tokens()); + assert(processed_tokens_in_batch == bc->num_active_infr_tokens()); +} + +#define LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL( \ + DT, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE, THDS_PER_BLOCK, stream) \ + smem_size_in_bytes_tree
(m->qProjSize, \ + BatchConfig::max_sequence_length() + \ + BatchConfig::max_spec_tree_token_num(), \ + THDS_PER_VALUE, \ + THDS_PER_BLOCK, \ + bc, \ + smem_sz); \ + compute_attention_kernel_fused_kernel \ + <<>>( \ + static_cast
<DT *>(m->devQKVProjArray), \ + static_cast<DT *>
(m->keyCache), \ + static_cast<DT *>
(m->valueCache), \ + output_ptr, \ + scale, \ + BatchConfig::max_sequence_length() + \ + BatchConfig::BatchConfig::max_spec_tree_token_num(), \ + BatchConfig::max_tokens_per_batch(), \ + m->qProjSize, \ + m->hidden_size, \ + m->request_infos, \ + m->num_q_heads, \ + bc->num_active_requests(), \ + m->causalMask, \ + m->request_completed, \ + smem_sz[0]) + +template +void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m, + TreeVerifyBatchConfig const *bc, + DT *output_ptr, + hipStream_t stream) { + + // update the kv cache + // update K-V cache + int num_new_tokens = bc->num_active_tokens(); + int parallelism = m->hidden_size * num_new_tokens; + update_tree_branch_kv_cache_fused<<>>( + static_cast
<DT *>(m->devQKVProjArray), + static_cast<DT *>
(m->keyCache), + static_cast<DT *>
(m->valueCache), + m->token_infos, + m->request_infos, + m->qProjSize, + m->kProjSize, + m->vProjSize, + num_new_tokens, + BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num(), + m->hidden_size); + + dim3 grid(m->num_q_heads, bc->num_active_requests()); + int const per_head_size = m->qProjSize; + float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; + // 0->qk production size, 1->total shared size + int smem_sz[2]; + if (per_head_size == 64) { + constexpr int THREADS_PER_VALUE_64 = threads_per_value_t::value; + LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL( + DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream); + } else if (per_head_size == 128) { + constexpr int THREADS_PER_VALUE_128 = threads_per_value_t::value; + LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL( + DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream); + } else { + assert(false && "a unsupported head size"); + } } template @@ -461,21 +917,17 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, } } // copy committed tokens info to GPU for the commit_tokens kernel - // Note that m->num_active_tokens stores the number of active + // Note that m->num_active_infr_tokens stores the number of active // tokens in the previous batch, which is needed for committing // keys/values to the key-value cache - checkCUDA( - hipMemcpyAsync(m->committed_token_infos, - &(bc->committed_tokens), - bc->num_tokens_to_commit * - sizeof(TreeVerifyBatchConfig::CommittedTokensInfo), - hipMemcpyHostToDevice, - stream)); + // std::cout << "tokens to be committed: " << bc->num_tokens_to_commit << + // "\n"; + commit_tokens
(m, bc, stream); - // After commit we update m->num_active_tokens to be the number of active + // After commit we update m->num_active_infr_tokens to be the number of active // tokens for the current batch - m->num_active_tokens = bc->num_active_tokens(); + m->num_active_infr_tokens = bc->num_active_infr_tokens(); // here because we need postion info in infernece 1 if (m->offload && m->biasSize > 0) { @@ -483,12 +935,6 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, m->bias_ptr, bias_ptr, m->biasSize, hipMemcpyHostToDevice, stream)); bias_ptr = static_cast
(m->bias_ptr); } - checkCUDA(hipMemcpyAsync(m->token_infos, - &(bc->tokensInfo), - bc->num_active_tokens() * - sizeof(TreeVerifyBatchConfig::PerTokenInfo), - hipMemcpyHostToDevice, - stream)); // phase 1: Implement kernel to compute KQV for input tokens compute_qkv_kernel(m, bc, @@ -502,11 +948,20 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, // phase 2: No need to update key/val cache // IncMultiHeadSelfAttention::update_kv_cache_kernel( // m, bc, stream); + // use the new kernel + compute_attention_kernel_fused
<DT>( + m, bc, static_cast<DT *>
(m->attn_heads), stream); + + int processed_tokens_in_batch = bc->num_active_tokens(); - // phase 3: Compute attention score - // 3 kernels for pahse 3: matmul1 - softmax - matmal2 - compute_attention_kernel( - m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); + compute_o_prod_bias(m, + bc, + shard_id, + output_ptr, + weight_ptr, + bias_ptr, + processed_tokens_in_batch, + stream); } } // namespace TreeIncMultiHeadAttention @@ -622,34 +1077,21 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( _num_kv_heads, attn->quantization_type, attn->offload), - num_active_tokens(0) { + num_active_infr_tokens(0) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(miopenSetStream(handler.dnn, stream)); // allocate memory for the seqArray and reserve space { - int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); - size_t committed_tokeninfo_size = max_tokens_per_batch; - size_t total_size = committed_tokeninfo_size * - sizeof(TreeVerifyBatchConfig::CommittedTokensInfo); - if (offload) { - // assert that we have enough reserved work space left - assert(gpu_mem_allocator.reserved_total_size - - gpu_mem_allocator.reserved_allocated_size >= - total_size); - committed_token_infos = - gpu_mem_allocator - .allocate_reserved( - committed_tokeninfo_size); - } else { - gpu_mem_allocator.create_legion_instance(committed_token_reserve_inst, - total_size); - committed_token_infos = - gpu_mem_allocator - .allocate_instance( - committed_tokeninfo_size); - } + + causalMask = static_cast( + handler.batch_config_metadata->causalMask); + committed_token_infos = + static_cast( + handler.batch_config_metadata->committed_tokens); + request_completed = + static_cast(handler.batch_config_metadata->request_completed); } checkCUDA(hipStreamSynchronize(stream)); diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 50c056c816..86c53d7ea1 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -12,9 +12,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include "cuComplex.h" -#endif #include "flexflow/ffconst_utils.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" @@ -390,7 +388,7 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, m->kProjSize, m->vProjSize, num_tokens_to_commit, - m->num_active_tokens, // number of active tokens in previous batch + m->num_active_infr_tokens, // number of active tokens in previous batch BatchConfig::max_sequence_length() + BatchConfig::max_spec_tree_token_num(), m->hidden_size); @@ -509,17 +507,18 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif // int num_requests = bc->num_active_requests(); int processed_tokens_in_batch = 0; // int qkv_block_size = @@ -571,7 +570,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_new_tokens, // num_tokens_in_branch processed_tokens_in_batch, // num_processed_tokens_in_batch - m->num_active_tokens, // total_tokens_in_batch + m->num_active_infr_tokens, // total_tokens_in_batch BatchConfig::max_sequence_length(), m->hidden_size); } @@ -773,6 +772,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (*m->final_bias && shard_id == 0) { int parallelism = m->oProjSize * processed_tokens_in_batch; int qkv_weight_size = m->qProjSize * m->global_num_q_heads + @@ -788,7 +788,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->oProjSize); } - assert(processed_tokens_in_batch == bc->num_active_tokens()); + assert(processed_tokens_in_batch == bc->num_active_infr_tokens()); } #define LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL( \ @@ -896,7 +896,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, } // copy committed tokens info to GPU for the commit_tokens kernel - // Note that m->num_active_tokens stores the number of active + // Note that m->num_active_infr_tokens stores the number of active // tokens in the previous batch, which is needed for committing // keys/values to the key-value cache // std::cout << "tokens to be committed: " << bc->num_tokens_to_commit << @@ -904,9 +904,9 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, commit_tokens
(m, bc, stream); - // After commit we update m->num_active_tokens to be the number of active + // After commit we update m->num_active_infr_tokens to be the number of active // tokens for the current batch - m->num_active_tokens = bc->num_active_tokens(); + m->num_active_infr_tokens = bc->num_active_infr_tokens(); // here because we need postion info in infernece 1 if (m->offload && m->biasSize > 0) { @@ -1052,7 +1052,7 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( _num_kv_heads, attn->quantization_type, attn->offload), - num_active_tokens(0) { + num_active_infr_tokens(0) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(cudnnSetStream(handler.dnn, stream)); @@ -1060,21 +1060,13 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - causalMask = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo)); + causalMask = static_cast( + handler.batch_config_metadata->causalMask); committed_token_infos = - reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo) + - sizeof(BatchConfig::causalMask)); - - request_completed = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + - sizeof(BatchConfig::causalMask) + - sizeof(TreeVerifyBatchConfig::committed_tokens)); + static_cast( + handler.batch_config_metadata->committed_tokens); + request_completed = + static_cast(handler.batch_config_metadata->request_completed); } cudaStreamSynchronize(stream); diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc index 5d38e28903..52c4ec2e28 100644 --- a/src/parallel_ops/allreduce.cc +++ b/src/parallel_ops/allreduce.cc @@ -45,7 +45,8 @@ using namespace FlexFlow::Kernels::AllReduce; /* Params */ bool operator==(AllReduceParams const &lhs, AllReduceParams const &rhs) { - return lhs.allreduce_legion_dim == rhs.allreduce_legion_dim; + return lhs.allreduce_legion_dim == rhs.allreduce_legion_dim && + std::strcmp(lhs.name, rhs.name) == 0; } bool AllReduceParams::is_valid(ParallelTensorShape const &input) const { @@ -55,7 +56,7 @@ bool AllReduceParams::is_valid(ParallelTensorShape const &input) const { AllReduceParams AllReduce::get_params() const { AllReduceParams params; params.allreduce_legion_dim = this->allreduce_dim; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -110,6 +111,7 @@ OpMeta *AllReduce::init_task(Task const *task, meta->input_type[0] = ar->inputs[0]->data_type; meta->output_type[0] = ar->outputs[0]->data_type; assert(meta->input_type[0] == meta->output_type[0]); + std::strcpy(meta->op_name, ar->name); return meta; } @@ -146,6 +148,102 @@ void AllReduce::init(FFModel const &ff) { set_opmeta_from_futuremap(ff, fm); } +void AllReduce::forward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = outputs[0]->parallel_is; + assert(numOutputs == 1); + assert(numInputs == 1); + set_argumentmap_for_forward(ff, argmap); + IndexLauncher launcher(ALLREDUCE_FWD_TASK_ID, + outputs[0]->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + 
launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +/*static*/ +void AllReduce::forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + + AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input.data_type == output.data_type); + forward_kernel_wrapper(m, input, output); +} + +void AllReduce::backward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + IndexLauncher launcher(ALLREDUCE_BWD_TASK_ID, + inputs[0]->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + inputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +void AllReduce::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); + + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input_grad.data_type == output_grad.data_type); + backward_kernel_wrapper(m, input_grad, output_grad); +} + void AllReduce::init_inference(FFModel const &ff, std::vector const &batch_inputs, std::vector const &batch_outputs, @@ -224,64 +322,103 @@ FutureMap AllReduce::inference(FFModel const &ff, return runtime->execute_index_space(ctx, launcher); } -void AllReduce::forward(FFModel const &ff) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - parallel_is = outputs[0]->parallel_is; - assert(numOutputs == 1); - assert(numInputs == 1); - set_argumentmap_for_forward(ff, argmap); - IndexLauncher launcher(ALLREDUCE_FWD_TASK_ID, - outputs[0]->parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, 
- EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); +/*static*/ +void AllReduce::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + + AllReduceMeta *m = *((AllReduceMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_tokens() == 0) { + return; + } + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input.data_type == output.data_type); + inference_kernel_wrapper(m, bc, input, output); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + AllReduce::save_inference_tensors_to_file( + m, shard_id, bc, {input}, {}, {output}); + } } -void AllReduce::backward(FFModel const &ff) { +FutureMap AllReduce::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; assert(numOutputs == 1); assert(numInputs == 1); - IndexLauncher launcher(ALLREDUCE_BWD_TASK_ID, - inputs[0]->parallel_is, - TaskArgument(NULL, 0), + assert(batch_inputs[0]->data_type == batch_outputs[0]->data_type); + DataType data_type = batch_inputs[0]->data_type; + size_t machine_view_hash = + mv ? mv->hash() : batch_outputs[0]->machine_view.hash(); + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher(ALLREDUCE_PEFT_BWD_TASK_ID, + batch_outputs[0]->parallel_is, + TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - inputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - inputs[0]->region_grad)); + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_inputs[0]->region_grad)); launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - outputs[0]->region_grad)); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); + return runtime->execute_index_space(ctx, launcher); +} + +/*static*/ +void AllReduce::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + + AllReduceMeta *m = *((AllReduceMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = 
helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input_grad.data_type == output_grad.data_type); + peft_bwd_kernel_wrapper(m, bc, input_grad, output_grad); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + AllReduce::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {}, {output_grad}, false); + } } bool AllReduce::measure_operator_cost(Simulator *sim, @@ -318,62 +455,6 @@ bool AllReduce::append_parallel_op_info( return true; } -/*static*/ -void AllReduce::inference_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - - AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); - BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - - assert(input.data_type == output.data_type); - inference_kernel_wrapper(m, bc, input, output); -} - -/*static*/ -void AllReduce::forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - - AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); - - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - - assert(input.data_type == output.data_type); - forward_kernel_wrapper(m, input, output); -} - -void AllReduce::backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); - - GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( - m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - - assert(input_grad.data_type == output_grad.data_type); - backward_kernel_wrapper(m, input_grad, output_grad); -} - }; // namespace FlexFlow namespace std { diff --git a/src/parallel_ops/combine.cc b/src/parallel_ops/combine.cc index acc5c414c7..ce9c032350 100644 --- a/src/parallel_ops/combine.cc +++ b/src/parallel_ops/combine.cc @@ -44,7 +44,8 @@ using namespace FlexFlow::Kernels::Combine; /* Params */ bool operator==(CombineParams const &lhs, CombineParams const &rhs) { return lhs.combine_legion_dim == rhs.combine_legion_dim && - lhs.combine_degree == rhs.combine_degree; + lhs.combine_degree == rhs.combine_degree && + std::strcmp(lhs.name, rhs.name) == 0; } bool CombineParams::is_valid(ParallelTensorShape const &input) const { @@ -58,7 +59,7 @@ CombineParams Combine::get_params() const { CombineParams params; params.combine_legion_dim = this->combine_dim; params.combine_degree = this->combine_degree; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ 
-102,10 +103,11 @@ OpMeta *Combine::init_task(Task const *task, Runtime *runtime) { Combine *cmb = (Combine *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - CombineMeta *m = new CombineMeta(handle); + CombineMeta *m = new CombineMeta(handle, cmb); m->input_type[0] = cmb->inputs[0]->data_type; m->output_type[0] = cmb->outputs[0]->data_type; assert(m->input_type[0] == m->output_type[0]); + std::strcpy(m->op_name, cmb->name); return m; } @@ -202,12 +204,23 @@ void Combine::create_input_partition_inference( assert(ff.config.computationMode == COMP_MODE_INFERENCE); assert(batch_outputs[0]->part != LogicalPartition::NO_PART); assert(batch_inputs[0]->part != LogicalPartition::NO_PART); - // input_lp is a disjoint partition + // partition batch_inputs[0]->region into inference_input_lps[batch_inputs[0]] + // according to the partitioning of batch_outputs[0] (i.e. make the + // partitioned dimension whole again by combining the partitions) ff.create_disjoint_partition(batch_outputs[0]->num_dims, batch_outputs[0]->dims, batch_outputs[0]->parallel_is, batch_inputs[0]->region, inference_input_lps[batch_inputs[0]]); + // partition batch_outputs[0]->region_grad into + // inference_output_grad_lps[batch_outputs[0]] according to the partitioning + // of batch_inputs[0] (i.e. restore the partition in the dimension that was + // combined in the forward pass) + ff.create_disjoint_partition(batch_inputs[0]->num_dims, + batch_inputs[0]->dims, + batch_inputs[0]->parallel_is, + batch_outputs[0]->region_grad, + inference_output_grad_lps[batch_outputs[0]]); } FutureMap Combine::inference(FFModel const &ff, @@ -226,7 +239,7 @@ FutureMap Combine::inference(FFModel const &ff, size_t machine_view_hash = mv ? mv->hash() : batch_outputs[0]->machine_view.hash(); set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); - IndexLauncher launcher(COMBINE_FWD_TASK_ID, + IndexLauncher launcher(COMBINE_INF_TASK_ID, batch_outputs[0]->parallel_is, TaskArgument(nullptr, 0), argmap, @@ -234,6 +247,7 @@ FutureMap Combine::inference(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.add_future(bc); launcher.add_region_requirement( RegionRequirement(inference_input_lps[batch_inputs[0]], 0 /*projection id*/, @@ -278,6 +292,52 @@ void Combine::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } +FutureMap Combine::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + assert(batch_inputs[0]->data_type == batch_outputs[0]->data_type); + DataType data_type = inputs[0]->data_type; + + // Warning: we need to use batch_inputs[0] here, instead of the usual + // batch_outputs[0] + parallel_is = batch_inputs[0]->parallel_is; + MachineView const *view = mv ? 
mv : &batch_inputs[0]->machine_view; + + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(COMBINE_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(&data_type, sizeof(DataType)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement( + RegionRequirement(inference_output_grad_lps[batch_outputs[0]], + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + void Combine::backward(FFModel const &ff) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; @@ -357,6 +417,37 @@ tl::optional Combine::as_dot() const { return rf; } +/*static*/ +void Combine::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + CombineMeta const *m = *((CombineMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_tokens() == 0) { + return; + } + DataType data_type = m->input_type[0]; + if (m->inference_debugging) { + std::cout << "INF " << m->op_name << std::endl; + } + if (data_type == DT_HALF) { + forward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_FLOAT) { + forward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_DOUBLE) { + forward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_INT32) { + forward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_INT64) { + forward_task_with_type(task, regions, ctx, runtime); + } else { + assert(false && "Unsupported data type in Combine forward"); + } +} + /*static*/ void Combine::forward_task(Task const *task, std::vector const ®ions, @@ -400,6 +491,56 @@ void Combine::forward_task_with_type(Task const *task, forward_kernel
(input_ptr, output_ptr, output_domain.get_volume()); } +void Combine::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + // CombineMeta const *m = *((CombineMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + // TODO: figure out why m->output_type[0] or m->input_type[0] are not working + DataType data_type = *((DataType *)task->args); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + data_type, regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + data_type, regions[1], task->regions[1], FID_DATA, ctx, runtime); + assert(input_grad.data_type == data_type); + assert(output_grad.domain == input_grad.domain); + CombineMeta const *m = *((CombineMeta **)task->local_args); + int shard_id = task->index_point.point_data[0]; + if (shard_id == 0 && m->inference_debugging) { + // m is null when shard_id > 0 for some reason + std::cout << "BWD " << m->op_name << std::endl; + } + if (data_type == DT_HALF) { + backward_kernel(output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + output_grad.domain.get_volume()); + } else if (data_type == DT_FLOAT) { + backward_kernel(output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + output_grad.domain.get_volume()); + } else if (data_type == DT_DOUBLE) { + backward_kernel(output_grad.get_double_ptr(), + input_grad.get_double_ptr(), + output_grad.domain.get_volume()); + } else if (data_type == DT_INT32) { + backward_kernel(output_grad.get_int32_ptr(), + input_grad.get_int32_ptr(), + output_grad.domain.get_volume()); + } else if (data_type == DT_INT64) { + backward_kernel(output_grad.get_int64_ptr(), + input_grad.get_int64_ptr(), + output_grad.domain.get_volume()); + } else { + assert(false && "Unsupported data type in Combine backward"); + } +} + void Combine::backward_task(Task const *task, std::vector const ®ions, Context ctx, diff --git a/src/parallel_ops/fused_parallel_op.cc b/src/parallel_ops/fused_parallel_op.cc index 1a76cbfc40..dec7b20fb2 100644 --- a/src/parallel_ops/fused_parallel_op.cc +++ b/src/parallel_ops/fused_parallel_op.cc @@ -59,7 +59,7 @@ FusedParallelOpParams FusedParallelOp::get_params() const { std::vector ops(std::begin(this->parallel_ops), std::end(this->parallel_ops)); params.parallel_ops = ops; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; diff --git a/src/parallel_ops/kernels/allreduce_kernels.cpp b/src/parallel_ops/kernels/allreduce_kernels.cpp index 8d7e20e395..7067035465 100644 --- a/src/parallel_ops/kernels/allreduce_kernels.cpp +++ b/src/parallel_ops/kernels/allreduce_kernels.cpp @@ -20,26 +20,23 @@ namespace FlexFlow { AllReduceMeta::AllReduceMeta(FFHandler handle, AllReduce const *reduct) - : OpMeta(handle) {} + : OpMeta(handle, reduct) {} namespace Kernels { namespace AllReduce { -void inference_kernel_wrapper(AllReduceMeta const *m, - BatchConfig const *bc, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { +void forward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(input.data_type == output.data_type); assert(input.domain == output.domain); - size_t hidden_dim_size 
= input.domain.hi()[0] - input.domain.lo()[0] + 1; - size_t num_elements = bc->num_tokens * hidden_dim_size; #ifdef FF_USE_NCCL ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); checkNCCL(ncclAllReduce(input.ptr, output.ptr, - num_elements, + input.domain.get_volume(), nccl_data_type, ncclSum, m->handle.ncclComm, @@ -49,19 +46,27 @@ void inference_kernel_wrapper(AllReduceMeta const *m, #endif } -void forward_kernel_wrapper(AllReduceMeta const *m, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { +void backward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + assert(false && "To be implemented"); +} + +void inference_kernel_wrapper(AllReduceMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(input.data_type == output.data_type); assert(input.domain == output.domain); size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1; + size_t num_elements = bc->num_active_tokens() * hidden_dim_size; #ifdef FF_USE_NCCL ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); checkNCCL(ncclAllReduce(input.ptr, output.ptr, - input.domain.get_volume(), + num_elements, nccl_data_type, ncclSum, m->handle.ncclComm, @@ -71,10 +76,29 @@ void forward_kernel_wrapper(AllReduceMeta const *m, #endif } -void backward_kernel_wrapper(AllReduceMeta const *m, +void peft_bwd_kernel_wrapper(AllReduceMeta const *m, + BatchConfig const *bc, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output_grad) { - assert(false && "To be implemented"); + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input_grad.data_type == output_grad.data_type); + assert(input_grad.domain == output_grad.domain); + size_t hidden_dim_size = + input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + size_t num_elements = bc->num_active_tokens() * hidden_dim_size; +#ifdef FF_USE_NCCL + ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input_grad.data_type); + checkNCCL(ncclAllReduce(output_grad.ptr, + input_grad.ptr, + num_elements, + nccl_data_type, + ncclSum, + m->handle.ncclComm, + stream)); +#else + assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); +#endif } } // namespace AllReduce diff --git a/src/parallel_ops/kernels/allreduce_kernels.cu b/src/parallel_ops/kernels/allreduce_kernels.cu index 2c000137a1..3041f9adf9 100644 --- a/src/parallel_ops/kernels/allreduce_kernels.cu +++ b/src/parallel_ops/kernels/allreduce_kernels.cu @@ -13,32 +13,30 @@ * limitations under the License. 
*/ +#include "flexflow/ffconst_utils.h" #include "flexflow/parallel_ops/kernels/allreduce_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { AllReduceMeta::AllReduceMeta(FFHandler handle, AllReduce const *reduct) - : OpMeta(handle) {} + : OpMeta(handle, reduct) {} namespace Kernels { namespace AllReduce { -void inference_kernel_wrapper(AllReduceMeta const *m, - BatchConfig const *bc, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { +void forward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(input.data_type == output.data_type); assert(input.domain == output.domain); - size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1; - size_t num_elements = bc->num_tokens * hidden_dim_size; #ifdef FF_USE_NCCL ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); checkNCCL(ncclAllReduce(input.ptr, output.ptr, - num_elements, + input.domain.get_volume(), nccl_data_type, ncclSum, m->handle.ncclComm, @@ -48,18 +46,27 @@ void inference_kernel_wrapper(AllReduceMeta const *m, #endif } -void forward_kernel_wrapper(AllReduceMeta const *m, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { +void backward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + assert(false && "To be implemented"); +} + +void inference_kernel_wrapper(AllReduceMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(input.data_type == output.data_type); assert(input.domain == output.domain); + size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1; + size_t num_elements = bc->num_active_tokens() * hidden_dim_size; #ifdef FF_USE_NCCL ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); checkNCCL(ncclAllReduce(input.ptr, output.ptr, - input.domain.get_volume(), + num_elements, nccl_data_type, ncclSum, m->handle.ncclComm, @@ -69,10 +76,23 @@ void forward_kernel_wrapper(AllReduceMeta const *m, #endif } -void backward_kernel_wrapper(AllReduceMeta const *m, +void peft_bwd_kernel_wrapper(AllReduceMeta const *m, + BatchConfig const *bc, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output_grad) { - assert(false && "To be implemented"); + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input_grad.data_type == output_grad.data_type); + assert(input_grad.domain == output_grad.domain); + size_t hidden_dim_size = + input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + size_t num_elements = bc->num_active_tokens(); + size_t data_size = data_type_size(output_grad.data_type); + checkCUDA(cudaMemcpyAsync(input_grad.ptr, + output_grad.ptr, + hidden_dim_size * num_elements * data_size, + cudaMemcpyDeviceToDevice, + stream)); } } // namespace AllReduce diff --git a/src/parallel_ops/kernels/combine_kernels.cpp b/src/parallel_ops/kernels/combine_kernels.cpp index d6e9568223..2a29be1ad4 100644 --- a/src/parallel_ops/kernels/combine_kernels.cpp +++ b/src/parallel_ops/kernels/combine_kernels.cpp @@ -14,12 +14,14 @@ */ #include "flexflow/parallel_ops/kernels/combine_kernels.h" +#include "flexflow/parallel_ops/combine.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { 
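Taken together, the AllReduce hunks above give the operator three data paths: forward_kernel_wrapper all-reduces the full region volume, while inference_kernel_wrapper and peft_bwd_kernel_wrapper bound the transfer to the tokens that are live in the current BatchConfig (with the CUDA peft_bwd variant copying the gradient straight through rather than issuing an ncclAllReduce). The shared size arithmetic can be read as the stand-alone helper below; this is an illustrative sketch only, and the patch itself keeps the computation inline in each wrapper.

size_t active_token_elements(BatchConfig const *bc,
                             Legion::Domain const &domain) {
  // elements per token = extent of the innermost (hidden) dimension
  size_t hidden_dim_size = domain.hi()[0] - domain.lo()[0] + 1;
  // only the tokens currently scheduled in the batch are touched
  return hidden_dim_size * bc->num_active_tokens();
}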
-CombineMeta::CombineMeta(FFHandler handler) : OpMeta(handler) {} +CombineMeta::CombineMeta(FFHandler handler, Combine const *comb) + : OpMeta(handler, comb) {} namespace Kernels { namespace Combine { diff --git a/src/parallel_ops/kernels/combine_kernels.cu b/src/parallel_ops/kernels/combine_kernels.cu index 1ab79a7944..5809e2d4f3 100644 --- a/src/parallel_ops/kernels/combine_kernels.cu +++ b/src/parallel_ops/kernels/combine_kernels.cu @@ -13,12 +13,14 @@ * limitations under the License. */ +#include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/kernels/combine_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -CombineMeta::CombineMeta(FFHandler handler) : OpMeta(handler) {} +CombineMeta::CombineMeta(FFHandler handler, Combine const *comb) + : OpMeta(handler, comb) {} namespace Kernels { namespace Combine { diff --git a/src/parallel_ops/kernels/parallel_identity_kernels.cpp b/src/parallel_ops/kernels/parallel_identity_kernels.cpp new file mode 100644 index 0000000000..8378231fb2 --- /dev/null +++ b/src/parallel_ops/kernels/parallel_identity_kernels.cpp @@ -0,0 +1,97 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/parallel_ops/kernels/parallel_identity_kernels.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/utils/hip_helper.h" +#include + +namespace FlexFlow { + +ParallelIdentityMeta::ParallelIdentityMeta(FFHandler handle, + ParallelIdentity const *reduct) + : OpMeta(handle, reduct) {} + +namespace Kernels { +namespace ParallelIdentity { + +void forward_kernel_wrapper(ParallelIdentityMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input.data_type == output.data_type); + assert(input.domain == output.domain); + size_t data_size = data_type_size(input.data_type); + // copy input to output + checkCUDA(hipMemcpyAsync(output.ptr, + input.ptr, + input.domain.get_volume() * data_size, + hipMemcpyDeviceToDevice, + stream)); +} + +void backward_kernel_wrapper(ParallelIdentityMeta const *m, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + assert(false && "To be implemented"); +} + +void inference_kernel_wrapper(ParallelIdentityMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input.data_type == output.data_type); + assert(input.domain == output.domain); + size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1; + size_t num_elements = bc->num_active_tokens(); + size_t data_size = data_type_size(input.data_type); + checkCUDA(hipMemcpyAsync(output.ptr, + input.ptr, + hidden_dim_size * num_elements * data_size, + hipMemcpyDeviceToDevice, + stream)); +} + +void peft_bwd_kernel_wrapper(ParallelIdentityMeta const *m, + 
BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input_grad.data_type == output_grad.data_type); + assert(input_grad.domain == output_grad.domain); + size_t hidden_dim_size = + input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + size_t num_elements = bc->num_active_tokens() * hidden_dim_size; +#ifdef FF_USE_NCCL + ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input_grad.data_type); + checkNCCL(ncclAllReduce(output_grad.ptr, + input_grad.ptr, + num_elements, + nccl_data_type, + ncclSum, + m->handle.ncclComm, + stream)); +#else + assert(false && "Must enable FF_USE_NCCL to use ParallelIdentity operators"); +#endif +} + +} // namespace ParallelIdentity +} // namespace Kernels +} // namespace FlexFlow diff --git a/src/parallel_ops/kernels/parallel_identity_kernels.cu b/src/parallel_ops/kernels/parallel_identity_kernels.cu new file mode 100644 index 0000000000..6800f3ab16 --- /dev/null +++ b/src/parallel_ops/kernels/parallel_identity_kernels.cu @@ -0,0 +1,96 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ffconst_utils.h" +#include "flexflow/parallel_ops/kernels/parallel_identity_kernels.h" +#include "flexflow/utils/cuda_helper.h" + +namespace FlexFlow { + +ParallelIdentityMeta::ParallelIdentityMeta(FFHandler handle, + ParallelIdentity const *reduct) + : OpMeta(handle, reduct) {} + +namespace Kernels { +namespace ParallelIdentity { + +void forward_kernel_wrapper(ParallelIdentityMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input.data_type == output.data_type); + assert(input.domain == output.domain); + size_t data_size = data_type_size(input.data_type); + // copy input to output + checkCUDA(cudaMemcpyAsync(output.ptr, + input.ptr, + input.domain.get_volume() * data_size, + cudaMemcpyDeviceToDevice, + stream)); +} + +void backward_kernel_wrapper(ParallelIdentityMeta const *m, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + assert(false && "To be implemented"); +} + +void inference_kernel_wrapper(ParallelIdentityMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input.data_type == output.data_type); + assert(input.domain == output.domain); + size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1; + size_t num_elements = bc->num_active_tokens(); + size_t data_size = data_type_size(input.data_type); + checkCUDA(cudaMemcpyAsync(output.ptr, + input.ptr, + hidden_dim_size * num_elements * data_size, + cudaMemcpyDeviceToDevice, + stream)); +} + +void peft_bwd_kernel_wrapper(ParallelIdentityMeta const *m, + 
BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input_grad.data_type == output_grad.data_type); + assert(input_grad.domain == output_grad.domain); + size_t hidden_dim_size = + input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + size_t num_elements = bc->num_active_tokens() * hidden_dim_size; +#ifdef FF_USE_NCCL + ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input_grad.data_type); + checkNCCL(ncclAllReduce(output_grad.ptr, + input_grad.ptr, + num_elements, + nccl_data_type, + ncclSum, + m->handle.ncclComm, + stream)); +#else + assert(false && "Must enable FF_USE_NCCL to use ParallelIdentity operators"); +#endif +} + +} // namespace ParallelIdentity +} // namespace Kernels +} // namespace FlexFlow diff --git a/src/parallel_ops/kernels/partition_kernels.cpp b/src/parallel_ops/kernels/partition_kernels.cpp index cfd76c0f18..bd1c96d4c7 100644 --- a/src/parallel_ops/kernels/partition_kernels.cpp +++ b/src/parallel_ops/kernels/partition_kernels.cpp @@ -14,12 +14,14 @@ */ #include "flexflow/parallel_ops/kernels/partition_kernels.h" +#include "flexflow/parallel_ops/partition.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { -RepartitionMeta::RepartitionMeta(FFHandler handler) : OpMeta(handler) {} +RepartitionMeta::RepartitionMeta(FFHandler handler, Repartition const *repart) + : OpMeta(handler, repart) {} namespace Kernels { namespace Repartition { diff --git a/src/parallel_ops/kernels/partition_kernels.cu b/src/parallel_ops/kernels/partition_kernels.cu index 08008f1035..3a39b39fe4 100644 --- a/src/parallel_ops/kernels/partition_kernels.cu +++ b/src/parallel_ops/kernels/partition_kernels.cu @@ -14,11 +14,13 @@ */ #include "flexflow/parallel_ops/kernels/partition_kernels.h" +#include "flexflow/parallel_ops/partition.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -RepartitionMeta::RepartitionMeta(FFHandler handler) : OpMeta(handler) {} +RepartitionMeta::RepartitionMeta(FFHandler handler, Repartition const *repart) + : OpMeta(handler, repart) {} namespace Kernels { namespace Repartition { diff --git a/src/parallel_ops/kernels/reduction_kernels.cpp b/src/parallel_ops/kernels/reduction_kernels.cpp index 2a3fe5cca1..1f3e8e0962 100644 --- a/src/parallel_ops/kernels/reduction_kernels.cpp +++ b/src/parallel_ops/kernels/reduction_kernels.cpp @@ -20,7 +20,7 @@ namespace FlexFlow { ReductionMeta::ReductionMeta(FFHandler handle, Reduction const *reduct) - : OpMeta(handle) {} + : OpMeta(handle, reduct) {} namespace Kernels { namespace Reduction { diff --git a/src/parallel_ops/kernels/reduction_kernels.cu b/src/parallel_ops/kernels/reduction_kernels.cu index 34ae8007da..df7630976b 100644 --- a/src/parallel_ops/kernels/reduction_kernels.cu +++ b/src/parallel_ops/kernels/reduction_kernels.cu @@ -19,7 +19,7 @@ namespace FlexFlow { ReductionMeta::ReductionMeta(FFHandler handle, Reduction const *reduct) - : OpMeta(handle) {} + : OpMeta(handle, reduct) {} namespace Kernels { namespace Reduction { diff --git a/src/parallel_ops/kernels/replicate_kernels.cpp b/src/parallel_ops/kernels/replicate_kernels.cpp index 1647f014be..f49e0d4eb0 100644 --- a/src/parallel_ops/kernels/replicate_kernels.cpp +++ b/src/parallel_ops/kernels/replicate_kernels.cpp @@ -20,7 +20,7 @@ namespace FlexFlow { ReplicateMeta::ReplicateMeta(FFHandler handle, Replicate const *repl) - : OpMeta(handle) {} + : OpMeta(handle, repl) {} namespace Kernels { namespace 
Replicate { diff --git a/src/parallel_ops/kernels/replicate_kernels.cu b/src/parallel_ops/kernels/replicate_kernels.cu index 35bc109bd3..0b5c434aa6 100644 --- a/src/parallel_ops/kernels/replicate_kernels.cu +++ b/src/parallel_ops/kernels/replicate_kernels.cu @@ -19,7 +19,7 @@ namespace FlexFlow { ReplicateMeta::ReplicateMeta(FFHandler handle, Replicate const *repl) - : OpMeta(handle) {} + : OpMeta(handle, repl) {} namespace Kernels { namespace Replicate { diff --git a/src/parallel_ops/parallel_identity.cc b/src/parallel_ops/parallel_identity.cc new file mode 100644 index 0000000000..883910ae09 --- /dev/null +++ b/src/parallel_ops/parallel_identity.cc @@ -0,0 +1,474 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/parallel_ops/parallel_identity.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/model.h" +#include "flexflow/parallel_ops/kernels/parallel_identity_kernels.h" +#include "flexflow/utils/hash_utils.h" + +namespace FlexFlow { +// declare Legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::Future; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::LogicalPartition; +using Legion::LogicalRegion; +using Legion::Machine; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; + +using namespace FlexFlow::Kernels::ParallelIdentity; + +/* Params */ +bool operator==(ParallelIdentityParams const &lhs, + ParallelIdentityParams const &rhs) { + return lhs.parallel_identity_legion_dim == rhs.parallel_identity_legion_dim && + std::strcmp(lhs.name, rhs.name) == 0; +} + +bool ParallelIdentityParams::is_valid(ParallelTensorShape const &input) const { + return input.is_valid(); +} + +ParallelIdentityParams ParallelIdentity::get_params() const { + ParallelIdentityParams params; + params.parallel_identity_legion_dim = this->parallel_identity_dim; + if (strlen(this->name) < MAX_OPNAME) { + strcpy(params.name, this->name); + } + return params; +} + +ParallelIdentity::ParallelIdentity(FFModel &model, + const ParallelTensor _input, + int _parallel_identity_legion_dim, + char const *name) + : ParallelOp(model, OP_PARALLEL_IDENTITY, name, _input), + parallel_identity_dim(_parallel_identity_legion_dim) { + int numdim = _input->num_dims; + ParallelDim dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdim; i++) { + dims[i] = _input->dims[i]; + } + assert(dims[parallel_identity_dim].degree > 1); + // ParallelTensorBase::update_parallel_ids(numdim, dims); + outputs[0] = model.create_parallel_tensor_legion_ordering( + numdim, dims, _input->data_type, this); +} + +ParallelIdentity::ParallelIdentity(FFModel &model, + ParallelIdentityParams const ¶ms, + ParallelTensor const input, + char const *name) + : ParallelIdentity( 
+ model, input, params.parallel_identity_legion_dim, params.name) {} + +void ParallelIdentity::create_input_partition(FFModel &ff) { + // Do nothing + return; +} + +void ParallelIdentity::create_input_partition_inference( + FFModel &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs) { + assert(ff.config.computationMode == COMP_MODE_INFERENCE); + assert(batch_outputs[0]->part != LogicalPartition::NO_PART); + assert(batch_inputs[0]->part != LogicalPartition::NO_PART); + // Do nothing + return; +} + +OpMeta *ParallelIdentity::init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + ParallelIdentity *ar = (ParallelIdentity *)task->args; + FFHandler handle = *((FFHandler const *)task->local_args); + ParallelIdentityMeta *meta = new ParallelIdentityMeta(handle, ar); + meta->input_type[0] = ar->inputs[0]->data_type; + meta->output_type[0] = ar->outputs[0]->data_type; + assert(meta->input_type[0] == meta->output_type[0]); + std::strcpy(meta->op_name, ar->name); + return meta; +} + +void ParallelIdentity::init(FFModel const &ff) { + ArgumentMap argmap; + parallel_is = outputs[0]->parallel_is; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(PARALLEL_IDENTITY_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(ParallelIdentity)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(1, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + +void ParallelIdentity::forward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = outputs[0]->parallel_is; + assert(numOutputs == 1); + assert(numInputs == 1); + set_argumentmap_for_forward(ff, argmap); + IndexLauncher launcher(PARALLEL_IDENTITY_FWD_TASK_ID, + outputs[0]->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +/*static*/ +void ParallelIdentity::forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + + ParallelIdentityMeta const *m = *((ParallelIdentityMeta **)task->local_args); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input.data_type == 
output.data_type); + forward_kernel_wrapper(m, input, output); +} + +void ParallelIdentity::backward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + IndexLauncher launcher(PARALLEL_IDENTITY_BWD_TASK_ID, + inputs[0]->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + inputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +void ParallelIdentity::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + ParallelIdentityMeta const *m = *((ParallelIdentityMeta **)task->local_args); + + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input_grad.data_type == output_grad.data_type); + backward_kernel_wrapper(m, input_grad, output_grad); +} + +void ParallelIdentity::init_inference( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + parallel_is = batch_outputs[0]->parallel_is; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + size_t machine_view_hash = + mv ? mv->hash() : batch_outputs[0]->machine_view.hash(); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher(PARALLEL_IDENTITY_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(ParallelIdentity)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); +} + +FutureMap ParallelIdentity::inference( + FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + assert(numOutputs == 1); + assert(numInputs == 1); + assert(batch_inputs[0]->data_type == batch_outputs[0]->data_type); + DataType data_type = batch_inputs[0]->data_type; + size_t machine_view_hash = + mv ? 
mv->hash() : batch_outputs[0]->machine_view.hash(); + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher(PARALLEL_IDENTITY_INF_TASK_ID, + batch_outputs[0]->parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +/*static*/ +void ParallelIdentity::inference_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + + ParallelIdentityMeta *m = *((ParallelIdentityMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_tokens() == 0) { + return; + } + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input.data_type == output.data_type); + inference_kernel_wrapper(m, bc, input, output); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + ParallelIdentity::save_inference_tensors_to_file( + m, shard_id, bc, {input}, {}, {output}); + } +} + +FutureMap + ParallelIdentity::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + assert(numOutputs == 1); + assert(numInputs == 1); + assert(batch_inputs[0]->data_type == batch_outputs[0]->data_type); + DataType data_type = batch_inputs[0]->data_type; + size_t machine_view_hash = + mv ? 
mv->hash() : batch_outputs[0]->machine_view.hash(); + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher(PARALLEL_IDENTITY_PEFT_BWD_TASK_ID, + batch_outputs[0]->parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +/*static*/ +void ParallelIdentity::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + + ParallelIdentityMeta *m = *((ParallelIdentityMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input_grad.data_type == output_grad.data_type); + peft_bwd_kernel_wrapper(m, bc, input_grad, output_grad); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + ParallelIdentity::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {}, {output_grad}, false); + } +} + +bool ParallelIdentity::measure_operator_cost(Simulator *sim, + MachineView const &pc, + CostMetrics &cost_metrics) const { + cost_metrics = CostMetrics(); + cost_metrics.forward_time = 0.0f; + cost_metrics.backward_time = 0.0f; + + cost_metrics.sync_time = 0; + cost_metrics.inputs_memory = 0; + cost_metrics.outputs_memory = 0; + cost_metrics.weights_memory = 0; + return true; +} + +bool ParallelIdentity::get_int_parameter(PMParameter para, int *value) const { + switch (para) { + case PM_PARALLEL_IDENTITY_DIM: + *value = parallel_identity_dim; + return true; + default: + return Op::get_int_parameter(para, value); + } +} + +bool ParallelIdentity::append_parallel_op_info( + std::vector ¶llel_ops) const { + ParallelOpInfo ret; + ret.op_type = op_type; + ret.parallel_dim = parallel_identity_dim; + ret.parallel_degree = -1; // ParallelIdentity does not affect parallel degree + parallel_ops.push_back(ret); + return true; +} + +}; // namespace FlexFlow + +namespace std { +size_t hash::operator()( + FlexFlow::ParallelIdentityParams const ¶ms) const { + size_t key = 0; + hash_combine(key, params.parallel_identity_legion_dim); + return key; +} + +} // namespace std diff --git a/src/parallel_ops/partition.cc b/src/parallel_ops/partition.cc index e6ab09d088..fddf739599 100644 --- a/src/parallel_ops/partition.cc +++ b/src/parallel_ops/partition.cc @@ -44,7 +44,8 @@ using namespace FlexFlow::Kernels::Repartition; /* Params */ bool operator==(RepartitionParams const &lhs, RepartitionParams const &rhs) { return lhs.repartition_legion_dim == rhs.repartition_legion_dim && - lhs.repartition_degree == rhs.repartition_degree; + lhs.repartition_degree == 
rhs.repartition_degree && + std::strcmp(lhs.name, rhs.name) == 0; } bool RepartitionParams::is_valid(ParallelTensorShape const &input) const { @@ -60,7 +61,7 @@ RepartitionParams Repartition::get_params() const { RepartitionParams params; params.repartition_legion_dim = this->repartition_dim; params.repartition_degree = this->repartition_degree; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -200,6 +201,11 @@ void Repartition::create_input_partition_inference( batch_outputs[0]->parallel_is, batch_inputs[0]->region, inference_input_lps[batch_inputs[0]]); + ff.create_disjoint_partition(batch_inputs[0]->num_dims, + batch_inputs[0]->dims, + batch_inputs[0]->parallel_is, + batch_outputs[0]->region_grad, + inference_output_grad_lps[batch_outputs[0]]); } FutureMap diff --git a/src/parallel_ops/reduction.cc b/src/parallel_ops/reduction.cc index 5ca2b1301c..7306e04334 100644 --- a/src/parallel_ops/reduction.cc +++ b/src/parallel_ops/reduction.cc @@ -45,7 +45,8 @@ using namespace FlexFlow::Kernels::Reduction; /* Params */ bool operator==(ReductionParams const &lhs, ReductionParams const &rhs) { return lhs.reduction_legion_dim == rhs.reduction_legion_dim && - lhs.reduction_degree == rhs.reduction_degree; + lhs.reduction_degree == rhs.reduction_degree && + std::strcmp(lhs.name, rhs.name) == 0; } bool ReductionParams::is_valid(ParallelTensorShape const &input) const { @@ -56,7 +57,7 @@ ReductionParams Reduction::get_params() const { ReductionParams params; params.reduction_legion_dim = this->reduction_dim; params.reduction_degree = this->reduction_degree; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -125,6 +126,13 @@ void Reduction::create_input_partition_inference( batch_outputs[0]->parallel_is, batch_inputs[0]->region, inference_input_lps[batch_inputs[0]]); + // output_grad_lp is an aliased partitioning along the replica dim + ff.create_aliased_partition(batch_inputs[0]->num_dims, + batch_inputs[0]->dims, + reduction_dim, + batch_inputs[0]->parallel_is, + batch_outputs[0]->region_grad, + inference_output_grad_lps[batch_outputs[0]]); } OpMeta *Reduction::init_task(Task const *task, @@ -137,6 +145,7 @@ OpMeta *Reduction::init_task(Task const *task, meta->input_type[0] = reduct->inputs[0]->data_type; meta->output_type[0] = reduct->outputs[0]->data_type; assert(meta->input_type[0] == meta->output_type[0]); + std::strcpy(meta->op_name, reduct->name); return meta; } @@ -372,6 +381,10 @@ void Reduction::forward_task(Task const *task, GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + if (m->inference_debugging) { + std::cout << "INF " << m->op_name << std::endl; + } + assert(input.data_type == output.data_type); if (input.data_type == DT_HALF) { forward_kernel(input.get_half_ptr(), diff --git a/src/parallel_ops/replicate.cc b/src/parallel_ops/replicate.cc index ba7bb6677f..38215fc903 100644 --- a/src/parallel_ops/replicate.cc +++ b/src/parallel_ops/replicate.cc @@ -44,7 +44,8 @@ using namespace FlexFlow::Kernels::Replicate; /* Params */ bool operator==(ReplicateParams const &lhs, ReplicateParams const &rhs) { return lhs.replicate_legion_dim == rhs.replicate_legion_dim && - lhs.replicate_degree == rhs.replicate_degree; + lhs.replicate_degree == rhs.replicate_degree && + std::strcmp(lhs.name, rhs.name) == 0; } bool ReplicateParams::is_valid(ParallelTensorShape 
const &input) const { @@ -55,7 +56,7 @@ ReplicateParams Replicate::get_params() const { ReplicateParams params; params.replicate_legion_dim = this->replicate_dim; params.replicate_degree = this->replicate_degree; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -125,6 +126,12 @@ void Replicate::create_input_partition_inference( batch_outputs[0]->parallel_is, batch_inputs[0]->region, inference_input_lps[batch_inputs[0]]); + // output_grad_lp is a disjoint partition + ff.create_disjoint_partition(batch_inputs[0]->num_dims, + batch_inputs[0]->dims, + batch_inputs[0]->parallel_is, + batch_outputs[0]->region_grad, + inference_output_grad_lps[batch_outputs[0]]); } OpMeta *Replicate::init_task(Task const *task, @@ -137,6 +144,7 @@ OpMeta *Replicate::init_task(Task const *task, meta->input_type[0] = repl->inputs[0]->data_type; meta->output_type[0] = repl->outputs[0]->data_type; assert(meta->input_type[0] == meta->output_type[0]); + std::strcpy(meta->op_name, repl->name); return meta; } @@ -276,6 +284,51 @@ void Replicate::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } +FutureMap Replicate::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + assert(batch_inputs[0]->data_type == batch_outputs[0]->data_type); + DataType data_type = batch_inputs[0]->data_type; + + // Warning: we need to use batch_inputs[0] here, instead of the usual + // batch_outputs[0] + parallel_is = batch_inputs[0]->parallel_is; + MachineView const *view = mv ? 
mv : &batch_inputs[0]->machine_view; + + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(REPLICATE_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement( + RegionRequirement(inference_output_grad_lps[batch_outputs[0]], + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + void Replicate::backward(FFModel const &ff) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; @@ -350,6 +403,9 @@ void Replicate::forward_task(Task const *task, assert(task->regions.size() == 2); ReplicateMeta const *m = *((ReplicateMeta **)task->local_args); + if (m->inference_debugging) { + std::cout << "INF " << m->op_name << std::endl; + } Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); @@ -381,6 +437,37 @@ void Replicate::forward_task(Task const *task, } } +void Replicate::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + Domain output_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + Domain input_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + // Currently only support the outter most dimension + for (int i = 0; i < output_grad_domain.get_dim() - 1; i++) { + assert(output_grad_domain.lo()[i] == input_grad_domain.lo()[i]); + assert(output_grad_domain.hi()[i] == input_grad_domain.hi()[i]); + } + size_t num_elements = input_grad_domain.get_volume(); + size_t num_replicas = output_grad_domain.get_volume() / num_elements; + float const *output_grad_ptr = helperGetTensorPointerRO( + regions[0], task->regions[0], FID_DATA, ctx, runtime); + float *input_grad_ptr = helperGetTensorPointerRW( + regions[1], task->regions[1], FID_DATA, ctx, runtime); + + ReplicateMeta const *m = *((ReplicateMeta **)task->local_args); + if (m->inference_debugging) { + std::cout << "BWD " << m->op_name << std::endl; + } + + backward_kernel( + output_grad_ptr, input_grad_ptr, num_elements, num_replicas); +} + void Replicate::backward_task(Task const *task, std::vector const ®ions, Context ctx, diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index 7989b0799e..4c339750c7 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -25,7 +25,35 @@ Legion::Logger log_bc("BatchConfig"); using Legion::Future; using Legion::Memory; -BatchConfig::BatchConfig() : num_tokens(0) { +void set_optimizer_tasks(OptimizerTasks &tasks, + int max_training_steps, + int completed_training_steps, + int gradient_accumulation_steps) { + assert(max_training_steps > 0); + assert(completed_training_steps >= 0); + assert(gradient_accumulation_steps > 0); + assert(completed_training_steps < max_training_steps); + // Compute gradients should always be true + tasks.compute_gradients = true; + + // Reset gradients to zero in the first iteration and after weight updates + tasks.reset_gradients_to_zero = + (completed_training_steps == 0) || + 
(completed_training_steps % gradient_accumulation_steps == 0); + + // Update weights every gradient_accumulation_steps + tasks.update_weights = + ((completed_training_steps + 1) % gradient_accumulation_steps == 0); + + // Save updated weights only in the very last training step + tasks.save_updated_weights = + (completed_training_steps == max_training_steps - 1); + if (tasks.save_updated_weights) { + assert(tasks.update_weights); + } +} + +BatchConfig::BatchConfig() : num_tokens(0), num_peft_tokens(0) { for (int i = 0; i < MAX_NUM_REQUESTS; i++) { requestsInfo[i].first_token_depth_in_request = 0; requestsInfo[i].first_token_offset_in_batch = 0; @@ -74,6 +102,14 @@ int BatchConfig::num_active_tokens() const { return num_tokens; } +int BatchConfig::num_active_infr_tokens() const { + return num_tokens; +} + +int BatchConfig::num_active_peft_tokens() const { + return num_peft_tokens; +} + /*static*/ int BatchConfig::max_requests_per_batch() { return RequestManager::get_request_manager()->get_max_requests_per_batch(); @@ -107,8 +143,13 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) { os << "Max number of tokens: " << bc.max_tokens_per_batch() << std::endl; os << "Max sequence length: " << bc.max_sequence_length() << std::endl; // Current values - os << "Number of tokens: " << bc.num_active_tokens() << std::endl; + os << "Number of active tokens: " << bc.num_active_tokens() << std::endl; + os << "Number of inference tokens: " << bc.num_active_infr_tokens() + << std::endl; + os << "Number of peft tokens: " << bc.num_active_peft_tokens() << std::endl; os << "Number of requests: " << bc.num_active_requests() << std::endl; + os << "Number of generation tokens: " << bc.num_generation_tokens + << std::endl; // Per-request info os << "Per-request info:\n"; @@ -121,9 +162,27 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) { << bc.requestsInfo[i].first_token_offset_in_batch << std::endl; os << " Number of tokens in batch: " << bc.requestsInfo[i].num_tokens_in_batch << std::endl; - os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; os << " Max sequence length: " << bc.requestsInfo[i].max_sequence_length << std::endl; + os << " BatchConfig Req ID: " + << bc.requestsInfo[i].batch_config_request_id << std::endl; + os << " Prompt phase: " << bc.requestsInfo[i].prompt_phase + << std::endl; + os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; + // PEFT values + os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id + << std::endl; + os << " PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; + os << " optimizer_tasks: {" + << "compute_gradients: " << std::boolalpha + << bc.requestsInfo[i].optimizer_tasks.compute_gradients + << ", reset_gradients_to_zero: " + << bc.requestsInfo[i].optimizer_tasks.reset_gradients_to_zero + << ", update_weights: " + << bc.requestsInfo[i].optimizer_tasks.update_weights + << ", save_updated_weights: " + << bc.requestsInfo[i].optimizer_tasks.save_updated_weights << "}" + << std::endl; os << " Request completed: " << bc.request_completed[i] << std::endl; os << " Request running: " << bc.request_running[i] << std::endl; } diff --git a/src/runtime/beam_search_batch_config.cc b/src/runtime/beam_search_batch_config.cc index 0509c23afe..b10f8e82ab 100644 --- a/src/runtime/beam_search_batch_config.cc +++ b/src/runtime/beam_search_batch_config.cc @@ -137,6 +137,10 @@ std::ostream &operator<<(std::ostream &os, BeamSearchBatchConfig const &bc) { os << " Number of tokens in batch: " << 
bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; + // PEFT values + os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id + << std::endl; + os << " PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; os << " Max sequence length: " << bc.requestsInfo[i].max_sequence_length << std::endl; os << " Request completed: " << bc.request_completed[i] << std::endl; diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index 57bc5a0458..386a0c940b 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -36,7 +36,8 @@ cudaError_t get_legion_stream(cudaStream_t *stream) { using FlexFlow::get_legion_stream; -__global__ void scale_kernel(float *ptr, coord_t size, float a, float b) { +template +__global__ void scale_kernel(DT *ptr, coord_t size, DT a, DT b) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = (b - a) * ptr[i] + a; } @@ -271,18 +272,10 @@ __host__ void print_beam_tensor(T const *ptr, template <> __host__ void save_tensor(float const *ptr, size_t num_elements, char const *file_name) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - float *host_ptr; - checkCUDA(cudaHostAlloc(&host_ptr, - sizeof(float) * num_elements, - cudaHostAllocPortable | cudaHostAllocMapped)); - checkCUDA(cudaMemcpyAsync(host_ptr, - ptr, - sizeof(float) * num_elements, - cudaMemcpyDeviceToHost, - stream)); + float *host_ptr = (float *)calloc(num_elements, sizeof(float)); checkCUDA(cudaDeviceSynchronize()); + checkCUDA(cudaMemcpy( + host_ptr, ptr, sizeof(float) * num_elements, cudaMemcpyDeviceToHost)); FILE *tensor_file; tensor_file = fopen(file_name, "w"); assert(tensor_file != NULL); @@ -293,26 +286,17 @@ __host__ void fprintf(tensor_file, "%.9f", host_ptr[i]); } } - fclose(tensor_file); - checkCUDA(cudaFreeHost(host_ptr)); + free(host_ptr); } template <> __host__ void save_tensor(half const *ptr, size_t num_elements, char const *file_name) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - half *host_ptr; - checkCUDA(cudaHostAlloc(&host_ptr, - sizeof(half) * num_elements, - cudaHostAllocPortable | cudaHostAllocMapped)); - checkCUDA(cudaMemcpyAsync(host_ptr, - ptr, - sizeof(half) * num_elements, - cudaMemcpyDeviceToHost, - stream)); + half *host_ptr = (half *)calloc(num_elements, sizeof(half)); checkCUDA(cudaDeviceSynchronize()); + checkCUDA(cudaMemcpy( + host_ptr, ptr, sizeof(half) * num_elements, cudaMemcpyDeviceToHost)); FILE *tensor_file; tensor_file = fopen(file_name, "w"); assert(tensor_file != NULL); @@ -323,27 +307,18 @@ __host__ void fprintf(tensor_file, "%.9f", (float)host_ptr[i]); } } - fclose(tensor_file); - checkCUDA(cudaFreeHost(host_ptr)); + free(host_ptr); } template <> __host__ void save_tensor(int32_t const *ptr, size_t num_elements, char const *file_name) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - int32_t *host_ptr; - checkCUDA(cudaHostAlloc(&host_ptr, - sizeof(int32_t) * num_elements, - cudaHostAllocPortable | cudaHostAllocMapped)); - checkCUDA(cudaMemcpyAsync(host_ptr, - ptr, - sizeof(int32_t) * num_elements, - cudaMemcpyDeviceToHost, - stream)); + int32_t *host_ptr = (int32_t *)calloc(num_elements, sizeof(int32_t)); checkCUDA(cudaDeviceSynchronize()); + checkCUDA(cudaMemcpy( + host_ptr, ptr, sizeof(int32_t) * num_elements, cudaMemcpyDeviceToHost)); FILE *tensor_file; tensor_file = fopen(file_name, "w"); assert(tensor_file != NULL); @@ -354,27 +329,18 @@ __host__ void save_tensor(int32_t const *ptr, fprintf(tensor_file, "%d", host_ptr[i]); 
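The save_tensor specializations in this hunk are each rewritten to the same synchronous shape: allocate a plain host buffer, synchronize the device, copy with a blocking cudaMemcpy, dump the values as text, and free the buffer. Condensed into one sketch for readability (illustrative only; the patch keeps one explicit specialization per element type, and this sketch shows the float/half flavor, the integer variants differing only in the fprintf format string):

template <typename T>
static void save_tensor_sketch(T const *dev_ptr, size_t num_elements,
                               char const *file_name) {
  T *host_ptr = (T *)calloc(num_elements, sizeof(T));
  checkCUDA(cudaDeviceSynchronize());              // drain in-flight kernels
  checkCUDA(cudaMemcpy(host_ptr, dev_ptr, sizeof(T) * num_elements,
                       cudaMemcpyDeviceToHost));
  FILE *tensor_file = fopen(file_name, "w");
  assert(tensor_file != NULL);
  for (size_t i = 0; i < num_elements; i++) {
    fprintf(tensor_file, i == 0 ? "%.9f" : ", %.9f", (float)host_ptr[i]);
  }
  fclose(tensor_file);
  free(host_ptr);
}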
} } - fclose(tensor_file); - checkCUDA(cudaFreeHost(host_ptr)); + free(host_ptr); } template <> __host__ void save_tensor(int64_t const *ptr, size_t num_elements, char const *file_name) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - int64_t *host_ptr; - checkCUDA(cudaHostAlloc(&host_ptr, - sizeof(int64_t) * num_elements, - cudaHostAllocPortable | cudaHostAllocMapped)); - checkCUDA(cudaMemcpyAsync(host_ptr, - ptr, - sizeof(int64_t) * num_elements, - cudaMemcpyDeviceToHost, - stream)); + int64_t *host_ptr = (int64_t *)calloc(num_elements, sizeof(int64_t)); checkCUDA(cudaDeviceSynchronize()); + checkCUDA(cudaMemcpy( + host_ptr, ptr, sizeof(int64_t) * num_elements, cudaMemcpyDeviceToHost)); FILE *tensor_file; tensor_file = fopen(file_name, "w"); assert(tensor_file != NULL); @@ -385,13 +351,12 @@ __host__ void save_tensor(int64_t const *ptr, fprintf(tensor_file, "%ld", host_ptr[i]); } } - fclose(tensor_file); - checkCUDA(cudaFreeHost(host_ptr)); + free(host_ptr); } template -__host__ T *download_tensor(T const *ptr, size_t num_elements) { +__host__ T *copy_tensor_dev_to_host(T const *ptr, size_t num_elements) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); T *host_ptr; @@ -404,14 +369,25 @@ __host__ T *download_tensor(T const *ptr, size_t num_elements) { } template -__host__ bool download_tensor(T const *ptr, T *dst, size_t num_elements) { +__host__ void + copy_tensor_dev_to_host(T const *ptr, T *dst, size_t num_elements) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(dst != nullptr); checkCUDA(cudaMemcpyAsync( dst, ptr, sizeof(T) * num_elements, cudaMemcpyDeviceToHost, stream)); - return true; } + +template +__host__ void + copy_tensor_host_to_dev(T *dst, T const *src, size_t num_elements) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(src != nullptr); + checkCUDA(cudaMemcpyAsync( + dst, src, sizeof(T) * num_elements, cudaMemcpyHostToDevice, stream)); +} + cudnnStatus_t cudnnSetTensorDescriptorFromDomain4SoftMax( cudnnTensorDescriptor_t tensor, Domain domain, DataType data_type) { int dims[MAX_TENSOR_DIM]; @@ -609,6 +585,48 @@ cudnnDataType_t cuda_to_cudnn_datatype(cudaDataType_t type) { return CUDNN_DATA_FLOAT; } +void check_device_vs_host_ptr(void const *maybe_devicePtr) { + cudaPointerAttributes attributes; + cudaError_t cudaStatus = + cudaPointerGetAttributes(&attributes, maybe_devicePtr); + + if (cudaStatus == cudaSuccess) { + // Check attributes and perform actions accordingly + if (attributes.type == cudaMemoryTypeDevice) { + printf("Pointer is allocated in device memory.\n"); + } else if (attributes.type == cudaMemoryTypeHost) { + printf("Pointer is allocated in host memory.\n"); + } else if (attributes.type == cudaMemoryTypeUnregistered) { + printf("Pointer is unregistered.\n"); + } else if (attributes.type == cudaMemoryTypeManaged) { + printf("Pointer is managed.\n"); + } else { + printf("Pointer is not allocated in recognized memory type.\n"); + } + } else { + fprintf(stderr, + "cudaPointerGetAttributes failed: %s\n", + cudaGetErrorString(cudaStatus)); + } +} + +void check_ptr_alignment(void const *ptr) { + if (!ptr) { + printf("Pointer is NULL\n"); + return; + } + bool aligned2 = ((uintptr_t)ptr % 2 == 0); + bool aligned4 = ((uintptr_t)ptr % 4 == 0); + bool aligned8 = ((uintptr_t)ptr % 8 == 0); + bool aligned16 = ((uintptr_t)ptr % 16 == 0); + printf("Pointer %p is aligned as follows: 2=%s, 4=%s, 8=%s, 16=%s\n", + ptr, + (aligned2 ? "yes" : "no"), + (aligned4 ? "yes" : "no"), + (aligned8 ? 
"yes" : "no"), + (aligned16 ? "yes" : "no")); +} + template __global__ void assign_kernel(half *ptr, coord_t size, half value); template __global__ void @@ -620,6 +638,13 @@ template __global__ void template __global__ void assign_kernel(int64_t *ptr, coord_t size, int64_t value); +template __global__ void + scale_kernel(half *ptr, coord_t size, half a, half b); +template __global__ void + scale_kernel(float *ptr, coord_t size, float a, float b); +template __global__ void + scale_kernel(double *ptr, coord_t size, double a, double b); + template __global__ void add_kernel(half *dst, half const *src, size_t size); template __global__ void @@ -716,26 +741,43 @@ template __host__ void save_tensor(int64_t const *ptr, template __host__ void save_tensor(half const *ptr, size_t rect, char const *file_name); -template __host__ float *download_tensor(float const *ptr, - size_t num_elements); -template __host__ half *download_tensor(half const *ptr, - size_t num_elements); -template __host__ double *download_tensor(double const *ptr, - size_t num_elements); -template __host__ int32_t *download_tensor(int32_t const *ptr, - size_t num_elements); -template __host__ int64_t *download_tensor(int64_t const *ptr, - size_t num_elements); -template __host__ bool - download_tensor(float const *ptr, float *dst, size_t num_elements); -template __host__ bool - download_tensor(half const *ptr, half *dst, size_t num_elements); -template __host__ bool download_tensor(double const *ptr, - double *dst, - size_t num_elements); -template __host__ bool download_tensor(int32_t const *ptr, - int32_t *dst, - size_t num_elements); -template __host__ bool download_tensor(int64_t const *ptr, - int64_t *dst, - size_t num_elements); +template __host__ float *copy_tensor_dev_to_host(float const *ptr, + size_t num_elements); +template __host__ half *copy_tensor_dev_to_host(half const *ptr, + size_t num_elements); +template __host__ double *copy_tensor_dev_to_host(double const *ptr, + size_t num_elements); +template __host__ int32_t * + copy_tensor_dev_to_host(int32_t const *ptr, size_t num_elements); +template __host__ int64_t * + copy_tensor_dev_to_host(int64_t const *ptr, size_t num_elements); +template __host__ void copy_tensor_dev_to_host(float const *ptr, + float *dst, + size_t num_elements); +template __host__ void copy_tensor_dev_to_host(half const *ptr, + half *dst, + size_t num_elements); +template __host__ void copy_tensor_dev_to_host(double const *ptr, + double *dst, + size_t num_elements); +template __host__ void copy_tensor_dev_to_host(int32_t const *ptr, + int32_t *dst, + size_t num_elements); +template __host__ void copy_tensor_dev_to_host(int64_t const *ptr, + int64_t *dst, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(float *dst, + float const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(half *dst, + half const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(double *dst, + double const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(int32_t *dst, + int32_t const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(int64_t *dst, + int64_t const *src, + size_t num_elements); diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index c7b6e1257a..5a7d98b4dc 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -188,6 +188,9 @@ std::string get_operator_type_name(OperatorType type) { return "Sampling"; case OP_ARGMAX: return 
"ArgMax"; + // PEFT Ops + case OP_LORA: + return "Lora Layer"; // Parallel Ops case OP_REPARTITION: return "Repartition"; @@ -199,6 +202,8 @@ std::string get_operator_type_name(OperatorType type) { return "Reduction"; case OP_ALLREDUCE: return "AllReduce"; + case OP_PARALLEL_IDENTITY: + return "ParallelIdentity"; case OP_PIPELINE: return "Pipeline"; case OP_FUSED_PARALLEL: diff --git a/src/runtime/fftype.cc b/src/runtime/fftype.cc index 819e6527e5..8213726e8a 100644 --- a/src/runtime/fftype.cc +++ b/src/runtime/fftype.cc @@ -30,4 +30,29 @@ bool operator==(LayerID const &lhs, LayerID const &rhs) { return lhs.id == rhs.id; } +const PEFTModelID PEFTModelID::NO_ID = PEFTModelID(); + +PEFTModelID::PEFTModelID() : id(0) {} + +PEFTModelID::PEFTModelID(size_t _id) : id(_id) { + assert(is_valid_id()); +} + +bool PEFTModelID::is_valid_id() const { + return (id >= PEFT_MODEL_ID_FIRST_VALID && id <= PEFT_MODEL_ID_LAST_VALID); +} + +bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs) { + return lhs.id == rhs.id; +} + +std::ostream &operator<<(std::ostream &os, PEFTModelID const &peft_model_id) { + if (peft_model_id == PEFTModelID::NO_ID) { + os << "NO_ID"; + } else { + os << peft_model_id.id; + } + return os; +} + }; // namespace FlexFlow diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index 43ce9d7005..c373e0da9b 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -136,12 +136,12 @@ void load_attention_bias_v2(DT *ptr, bool final_bias, std::string layer_name, std::string weights_folder) { - std::string q_file = layer_name + "_wq_bias"; - std::string k_file = layer_name + "_wk_bias"; - std::string v_file = layer_name + "_wv_bias"; + std::string q_file = layer_name + ".q_proj.bias"; + std::string k_file = layer_name + ".k_proj.bias"; + std::string v_file = layer_name + ".v_proj.bias"; std::vector bias_files = {q_file, k_file, v_file}; if (final_bias) { - std::string o_file = layer_name + "_wo_bias"; + std::string o_file = layer_name + ".o_proj.bias"; bias_files.push_back(o_file); } @@ -217,12 +217,10 @@ void load_attention_weights_v2(DT *ptr, std::string weights_folder, size_t volume, int tensor_parallelism_degree) { - // layers_0_attention_wq_weight - // layers_0_self_attn_q_proj_weight - std::string q_file = layer_name + "_wq_weight"; - std::string k_file = layer_name + "_wk_weight"; - std::string v_file = layer_name + "_wv_weight"; - std::string o_file = layer_name + "_wo_weight"; + std::string q_file = layer_name + ".q_proj.weight"; + std::string k_file = layer_name + ".k_proj.weight"; + std::string v_file = layer_name + ".v_proj.weight"; + std::string o_file = layer_name + ".o_proj.weight"; std::vector weight_filenames = {q_file, k_file, v_file}; int file_index = 0; @@ -407,12 +405,10 @@ void load_attention_weights_quantized(char *ptr, std::string weights_folder, DataType data_type, bool use_full_precision) { - // layers_0_attention_wq_weight - // layers_0_self_attn_q_proj_weight - std::string q_file = layer_name + "_wq_weight"; - std::string k_file = layer_name + "_wk_weight"; - std::string v_file = layer_name + "_wv_weight"; - std::string o_file = layer_name + "_wo_weight"; + std::string q_file = layer_name + ".q_proj.weight"; + std::string k_file = layer_name + ".k_proj.weight"; + std::string v_file = layer_name + ".v_proj.weight"; + std::string o_file = layer_name + ".o_proj.weight"; std::vector weight_filenames = {q_file, k_file, v_file, o_file}; int file_index = 0; @@ -690,7 +686,7 @@ void 
FileDataLoader::load_quantization_weight(FFModel *ff, if (weight_idx > 0) { assert(weight_idx == 0 || weight_idx == 1); if (weight_filename != "embed_tokens_weight_lm_head") { - weight_filename += weight_idx == 0 ? "_weight" : "_bias"; + weight_filename += weight_idx == 0 ? ".weight" : ".bias"; } } load_from_quantized_file(data, @@ -734,44 +730,34 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, if (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION) { - if (weight_filename.find("self_attention") != std::string::npos) { - load_attention_weights_multi_query( - data, weight_filename, weights_folder, hidden_dim, num_heads); - } else if (weight_filename.find("attention") != std::string::npos && - weight_filename.rfind("attention") == - weight_filename.length() - strlen("attention")) { - if (weight_idx == 0) { - load_attention_weights_v2(data, - num_heads, - num_kv_heads, - hidden_dim, - qkv_inner_dim, - weight_filename, - weights_folder, - volume, - tensor_parallelism_degree); - } else { - long long value; - l->get_int_property("final_bias", value); - bool final_bias = (bool)value; - load_attention_bias_v2(data, - num_heads, - num_kv_heads, - hidden_dim, - qkv_inner_dim, - final_bias, - weight_filename, - weights_folder); - } - + if (weight_idx == 0) { + load_attention_weights_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + weight_filename, + weights_folder, + volume, + tensor_parallelism_degree); } else { - assert(false); + long long value; + l->get_int_property("final_bias", value); + bool final_bias = (bool)value; + load_attention_bias_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + final_bias, + weight_filename, + weights_folder); } } else if (l->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM) { assert(weight_idx >= 0 || weight_idx <= 2); weight_filename += (weight_idx == 0) - ? "_attn_bias" - : ((weight_idx == 1) ? "_weight" : "_bias"); + ? ".attn_bias" + : ((weight_idx == 1) ? ".weight" : ".bias"); std::cout << "Loading weight file " << weight_filename << std::endl; std::string weight_filepath = join_path({weights_folder, weight_filename}); @@ -781,7 +767,7 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, assert(weight_idx == 0 || weight_idx == 1); // handle exception if (weight_filename != "embed_tokens_weight_lm_head") { - weight_filename += weight_idx == 0 ? "_weight" : "_bias"; + weight_filename += weight_idx == 0 ? 
".weight" : ".bias"; } std::cout << "Loading weight file " << weight_filename << std::endl; std::string weight_filepath = @@ -809,6 +795,10 @@ void FileDataLoader::load_weights(FFModel *ff) { if (weight == NULL) { continue; } + // TODO: currently skip Lora layers + if (l->op_type == OP_LORA) { + continue; + } switch (weight->data_type) { case DT_HALF: load_single_weight_tensor(ff, l, i); diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index b023aced6e..1a38782e81 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -36,6 +36,7 @@ #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" +#include "flexflow/ops/lora_linear.h" #include "flexflow/ops/noop.h" #include "flexflow/ops/pool_2d.h" #include "flexflow/ops/reduce.h" @@ -54,6 +55,7 @@ #include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/fused_parallel_op.h" +#include "flexflow/parallel_ops/parallel_identity.h" #include "flexflow/parallel_ops/partition.h" #include "flexflow/parallel_ops/reduction.h" #include "flexflow/parallel_ops/replicate.h" @@ -1992,6 +1994,7 @@ std::pair, std::unordered_map> mv.device_type = MachineView::GPU; mv.ndims = 1; int total_parallel_degree = 1; + assert(op->numOutputs > 0); for (int i = 0; i < op->outputs[0]->num_dims; i++) { total_parallel_degree *= op->outputs[0]->dims[i].degree; } @@ -2434,6 +2437,13 @@ GraphOptimalViewSerialized sez.serialize(allreduce->name, strlen(allreduce->name)); break; } + case OP_PARALLEL_IDENTITY: { + ParallelIdentity *parallel_identity = (ParallelIdentity *)op; + sez.serialize(parallel_identity->parallel_identity_dim); + sez.serialize(strlen(parallel_identity->name)); + sez.serialize(parallel_identity->name, strlen(parallel_identity->name)); + break; + } case OP_FUSED_PARALLEL: { FusedParallelOp *fused = (FusedParallelOp *)op; sez.serialize(fused->num_parallel_ops); @@ -2475,6 +2485,7 @@ namespace FlexFlow { using PCG::Edge; using PCG::Graph; using PCG::GraphCostResult; +using PCG::log_graph; using PCG::Node; void FFModel::register_all_machine_views( @@ -2759,6 +2770,10 @@ void FFModel::deserialize_graph_optimal_view( node = Linear::deserialize(*this, dez, inputs, num_inputs); break; } + case OP_LORA: { + node = LoraLinear::deserialize(*this, dez, inputs, num_inputs); + break; + } case OP_MULTIHEAD_ATTENTION: { assert(num_inputs == 3); int embed_dim, num_heads, k_dim, v_dim; @@ -3042,8 +3057,11 @@ void FFModel::deserialize_graph_optimal_view( char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); dez.deserialize(name, name_len); - node = get_or_create_node(inputs[0], - {combine_dim, combine_degree}); + CombineParams params; + params.combine_legion_dim = combine_dim; + params.combine_degree = combine_degree; + strcpy(params.name, name); + node = get_or_create_node(inputs[0], params); break; } case OP_REPARTITION: { @@ -3055,8 +3073,11 @@ void FFModel::deserialize_graph_optimal_view( char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); dez.deserialize(name, name_len); - node = get_or_create_node( - inputs[0], {repartition_dim, repartition_degree}); + RepartitionParams params; + params.repartition_legion_dim = repartition_dim; + params.repartition_degree = repartition_degree; + strcpy(params.name, name); + node = get_or_create_node(inputs[0], params); break; } case OP_REPLICATE: { @@ -3068,8 +3089,11 @@ void FFModel::deserialize_graph_optimal_view( char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); dez.deserialize(name, 
name_len); - node = get_or_create_node(inputs[0], - {replicate_dim, replicate_degree}); + ReplicateParams params; + params.replicate_legion_dim = replicate_dim; + params.replicate_degree = replicate_degree; + strcpy(params.name, name); + node = get_or_create_node(inputs[0], params); break; } case OP_REDUCTION: { @@ -3081,8 +3105,11 @@ void FFModel::deserialize_graph_optimal_view( char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); dez.deserialize(name, name_len); - node = get_or_create_node(inputs[0], - {reduction_dim, reduction_degree}); + ReductionParams params; + params.reduction_legion_dim = reduction_dim; + params.reduction_degree = reduction_degree; + strcpy(params.name, name); + node = get_or_create_node(inputs[0], params); break; } case OP_ALLREDUCE: { @@ -3093,24 +3120,43 @@ void FFModel::deserialize_graph_optimal_view( char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); dez.deserialize(name, name_len); - node = get_or_create_node(inputs[0], {allreduce_dim}); + AllReduceParams params; + params.allreduce_legion_dim = allreduce_dim; + strcpy(params.name, name); + node = get_or_create_node(inputs[0], params); + break; + } + case OP_PARALLEL_IDENTITY: { + assert(num_inputs == 1); + int parallel_identity_dim; + dez.deserialize(parallel_identity_dim); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); + ParallelIdentityParams params; + params.parallel_identity_legion_dim = parallel_identity_dim; + strcpy(params.name, name); + node = get_or_create_node(inputs[0], params); break; } case OP_FUSED_PARALLEL: { assert(num_inputs == 1); - std::vector parallel_ops; + FusedParallelOpParams params; int num_parallel_ops; dez.deserialize(num_parallel_ops); for (int i = 0; i < num_parallel_ops; i++) { ParallelOpInfo info; dez.deserialize(info); - parallel_ops.push_back(info); + params.parallel_ops.push_back(info); } size_t name_len; char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); dez.deserialize(name, name_len); - node = get_or_create_node(inputs[0], {parallel_ops}); + strcpy(params.name, name); + + node = get_or_create_node(inputs[0], params); break; } default: { @@ -3149,20 +3195,20 @@ void FFModel::deserialize_graph_optimal_view( optimal_views[guid_to_nodes[guid]] = view; } assert(dez.get_remaining_bytes() == 0); - printf("Deserialized Views...\n"); + log_graph.debug("Deserialized Views...\n"); for (auto const &it : optimal_views) { - printf("node[%zu]: type(%s) view(%d %d %d) ", - it.first.guid, - it.first.to_string().c_str(), - it.second.ndims, - it.second.dim[0], - it.second.start_device_id); + log_graph.debug("node[%zu]: type(%s) view(%d %d %d) ", + it.first.guid, + it.first.to_string().c_str(), + it.second.ndims, + it.second.dim[0], + it.second.start_device_id); auto const &list = graph->inEdges.at(it.first); for (auto const &it2 : list) { Edge e = it2; - printf(" inEdge(node(%zu) idx(%d))", e.srcOp.guid, e.srcIdx); + log_graph.debug(" inEdge(node(%zu) idx(%d))", e.srcOp.guid, e.srcIdx); } - printf("\n"); + log_graph.debug("\n"); } } diff --git a/src/runtime/hip_helper.cpp b/src/runtime/hip_helper.cpp index 613df1cbcf..057be8f443 100644 --- a/src/runtime/hip_helper.cpp +++ b/src/runtime/hip_helper.cpp @@ -29,7 +29,8 @@ hipError_t get_legion_stream(hipStream_t *stream) { using FlexFlow::get_legion_stream; -__global__ void scale_kernel(float *ptr, coord_t size, float a, float b) { +template +__global__ void scale_kernel(DT *ptr, coord_t size, DT a, DT b) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = (b - a) * ptr[i] + a; } 
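The hunk above makes scale_kernel generic over the element type DT, and later hunks in this file rename download_tensor to copy_tensor_dev_to_host and add copy_tensor_host_to_dev. For reference, a minimal sketch of how a caller might combine the templated kernel with the renamed helper on the HIP backend, assuming FlexFlow's existing checkCUDA, get_legion_stream, GET_BLOCKS, and CUDA_NUM_THREADS helpers are in scope; the function name debug_rescale_and_dump is hypothetical and not part of this patch:

// Hedged sketch: rescale a device buffer in-place with the templated
// scale_kernel, then copy it back with copy_tensor_dev_to_host for inspection.
// scale_kernel computes ptr[i] = (b - a) * ptr[i] + a, so a = -1.0f, b = 1.0f
// maps values in [0, 1) onto [-1, 1).
void debug_rescale_and_dump(float *d_ptr, size_t num_elements) {
  hipStream_t stream;
  checkCUDA(get_legion_stream(&stream));
  hipLaunchKernelGGL(HIP_KERNEL_NAME(scale_kernel<float>),
                     GET_BLOCKS(num_elements),
                     CUDA_NUM_THREADS,
                     0,
                     stream,
                     d_ptr,
                     num_elements,
                     -1.0f,
                     1.0f);
  // copy_tensor_dev_to_host allocates pinned host memory and issues an
  // asynchronous device-to-host copy on the same Legion stream, so synchronize
  // before reading the values.
  float *h_ptr = copy_tensor_dev_to_host<float>(d_ptr, num_elements);
  checkCUDA(hipStreamSynchronize(stream));
  printf("first element after rescale: %.4f\n", h_ptr[0]);
  checkCUDA(hipHostFree(h_ptr));
}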
@@ -55,6 +56,14 @@ __global__ void copy_kernel(DT *dst, const DT *src, coord_t size) { } } +template +__global__ void + copy_kernel_discrete(DT *dst, const DT *src, coord_t size, size_t *index) { + CUDA_KERNEL_LOOP(i, size) { + dst[i] = src[index[i]]; + } +} + template __global__ void reluBackward(DT *grad_ptr, const DT *output, size_t n) { CUDA_KERNEL_LOOP(i, n) { @@ -224,22 +233,24 @@ __host__ void updateGAS(float *para_ptr, } template -__host__ void - print_tensor(T const *ptr, size_t num_elements, char const *prefix) { - // device synchronize to make sure the data are ready - // checkCUDA(hipDeviceSynchronize()); +__host__ void print_tensor(T const *ptr, + size_t num_elements, + char const *prefix, + int shard_id) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); T *host_ptr; - checkCUDA(hipHostMalloc((void **)&host_ptr, + checkCUDA(hipHostMalloc(&host_ptr, sizeof(T) * num_elements, hipHostMallocPortable | hipHostMallocMapped)); - checkCUDA(hipMemcpy( - host_ptr, ptr, sizeof(T) * num_elements, hipMemcpyDeviceToHost)); - // checkCUDA(hipDeviceSynchronize()); + checkCUDA(hipMemcpyAsync( + host_ptr, ptr, sizeof(T) * num_elements, hipMemcpyDeviceToHost, stream)); + checkCUDA(hipDeviceSynchronize()); int idx = 0; - printf("%s", prefix); + printf("%s, %d---->", prefix, shard_id); for (idx = 0; idx < num_elements; idx++) { - printf(" %.4lf", (float)host_ptr[idx]); - if (idx >= 16) { + printf(" %.20lf", (float)host_ptr[idx]); + if (idx >= 100) { break; } } @@ -247,6 +258,40 @@ __host__ void checkCUDA(hipHostFree(host_ptr)); } +template +__host__ void print_beam_tensor(T const *ptr, + size_t num_elements, + int skip, + int channel, + char const *prefix) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + T *host_ptr; + checkCUDA(hipHostMalloc(&host_ptr, + sizeof(T) * channel * skip, + hipHostMallocPortable | hipHostMallocMapped)); + checkCUDA(hipMemcpyAsync(host_ptr, + ptr, + sizeof(T) * channel * skip, + hipMemcpyDeviceToHost, + stream)); + // checkCUDA(hipDeviceSynchronize()); + int idx = 0; + printf("%s", prefix); + + for (int i = 0; i < channel; i += 1) { + for (idx = 0; idx < num_elements; idx++) { + printf(" %.20lf", (float)host_ptr[idx + i * skip]); + if (idx >= 100) { + break; + } + } + printf("\n-----***********------\n"); + } + + checkCUDA(hipHostFree(host_ptr)); +} + template <> __host__ void save_tensor(float const *ptr, size_t num_elements, char const *file_name) { @@ -370,9 +415,7 @@ __host__ void save_tensor(int64_t const *ptr, } template -__host__ T *download_tensor(T const *ptr, size_t num_elements) { - // device synchronize to make sure the data are ready - // checkCUDA(hipDeviceSynchronize()); +__host__ T *copy_tensor_dev_to_host(T const *ptr, size_t num_elements) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); T *host_ptr; @@ -381,21 +424,27 @@ __host__ T *download_tensor(T const *ptr, size_t num_elements) { hipHostMallocPortable | hipHostMallocMapped)); checkCUDA(hipMemcpyAsync( host_ptr, ptr, sizeof(T) * num_elements, hipMemcpyDeviceToHost, stream)); - // checkCUDA(hipDeviceSynchronize()); return host_ptr; } template -__host__ bool download_tensor(T const *ptr, T *dst, size_t num_elements) { - // device synchronize to make sure the data are ready - // checkCUDA(hipDeviceSynchronize()); +__host__ void + copy_tensor_dev_to_host(T const *ptr, T *dst, size_t num_elements) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(dst != nullptr); checkCUDA(hipMemcpyAsync( dst, ptr, sizeof(T) * num_elements, 
hipMemcpyDeviceToHost, stream)); - // checkCUDA(hipDeviceSynchronize()); - return true; +} + +template +__host__ void + copy_tensor_host_to_dev(T *dst, T const *src, size_t num_elements) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(src != nullptr); + checkCUDA(hipMemcpyAsync( + dst, src, sizeof(T) * num_elements, hipMemcpyHostToDevice, stream)); } miopenStatus_t cudnnSetTensorDescriptorFromDomain( @@ -450,22 +499,23 @@ miopenStatus_t cudnnSetTensorDescriptorFromDomain( return miopenStatusBadParm; } -miopenStatus_t - cudnnSetTensorDescriptorFromDomain4SoftMax(miopenTensorDescriptor_t tensor, - Domain domain) { +miopenStatus_t cudnnSetTensorDescriptorFromDomain4SoftMax( + miopenTensorDescriptor_t tensor, Domain domain, DataType data_type) { int dims[MAX_TENSOR_DIM]; + miopenDataType_t cudnn_data_type = ff_to_cudnn_datatype(data_type); switch (domain.get_dim()) { case 1: { Rect<1> rect = domain; dims[0] = rect.hi[0] - rect.lo[0] + 1; - return miopenSet4dTensorDescriptor(tensor, miopenFloat, dims[0], 1, 1, 1); + return miopenSet4dTensorDescriptor( + tensor, cudnn_data_type, dims[0], 1, 1, 1); } case 2: { Rect<2> rect = domain; dims[0] = rect.hi[0] - rect.lo[0] + 1; dims[1] = rect.hi[1] - rect.lo[1] + 1; return miopenSet4dTensorDescriptor( - tensor, miopenFloat, dims[1], dims[0], 1, 1); + tensor, cudnn_data_type, dims[1], dims[0], 1, 1); } case 3: { Rect<3> rect = domain; @@ -473,7 +523,7 @@ miopenStatus_t dims[1] = rect.hi[1] - rect.lo[1] + 1; dims[2] = rect.hi[2] - rect.lo[2] + 1; return miopenSet4dTensorDescriptor( - tensor, miopenFloat, dims[2] * dims[1], dims[0], 1, 1); + tensor, cudnn_data_type, dims[2] * dims[1], dims[0], 1, 1); } case 4: { Rect<4> rect = domain; @@ -482,7 +532,7 @@ miopenStatus_t dims[2] = rect.hi[2] - rect.lo[2] + 1; dims[3] = rect.hi[3] - rect.lo[3] + 1; return miopenSet4dTensorDescriptor( - tensor, miopenFloat, dims[3] * dims[2] * dims[1], dims[0], 1, 1); + tensor, cudnn_data_type, dims[3] * dims[2] * dims[1], dims[0], 1, 1); } case 5: { Rect<5> rect = domain; @@ -493,7 +543,7 @@ miopenStatus_t dims[2] = rect.hi[2] - rect.lo[2] + 1; dims[3] = rect.hi[3] - rect.lo[3] + 1; return miopenSet4dTensorDescriptor( - tensor, miopenFloat, dims[3], dims[2], dims[1], dims[0]); + tensor, cudnn_data_type, dims[3], dims[2], dims[1], dims[0]); } default: assert(false && "Unsupported dim number"); @@ -553,6 +603,49 @@ void handle_unimplemented_hip_kernel(OperatorType op_type) { throw std::runtime_error("Unimplemented hip kernel for Operator: " + FlexFlow::get_operator_type_name(op_type)); } +void check_device_vs_host_ptr(void const *maybe_devicePtr) { + hipPointerAttribute_t attributes; + hipError_t hipStatus = hipPointerGetAttributes(&attributes, maybe_devicePtr); + + if (hipStatus == hipSuccess) { + // Check attributes and perform actions accordingly + if (attributes.memoryType == hipMemoryTypeDevice) { + printf("Pointer is allocated in device memory.\n"); + } else if (attributes.memoryType == hipMemoryTypeHost) { + printf("Pointer is allocated in host memory.\n"); + } else if (attributes.memoryType == hipMemoryTypeArray) { + printf("Pointer points to array memory, physically located on device.\n"); + } else if (attributes.memoryType == hipMemoryTypeManaged) { + printf("Pointer points to managed memory, automaticallly managed by the " + "unified memory system.\n"); + } else if (attributes.memoryType == hipMemoryTypeUnified) { + printf("Pointer points to unified memory (not supported currently) \n"); + } else { + printf("Pointer is not allocated in 
recognized memory type.\n"); + } + } else { + fprintf(stderr, + "hipPointerGetAttributes failed: %s\n", + hipGetErrorString(hipStatus)); + } +} + +void check_ptr_alignment(void const *ptr) { + if (!ptr) { + printf("Pointer is NULL\n"); + return; + } + bool aligned2 = ((uintptr_t)ptr % 2 == 0); + bool aligned4 = ((uintptr_t)ptr % 4 == 0); + bool aligned8 = ((uintptr_t)ptr % 8 == 0); + bool aligned16 = ((uintptr_t)ptr % 16 == 0); + printf("Pointer %p is aligned as follows: 2=%s, 4=%s, 8=%s, 16=%s\n", + ptr, + (aligned2 ? "yes" : "no"), + (aligned4 ? "yes" : "no"), + (aligned8 ? "yes" : "no"), + (aligned16 ? "yes" : "no")); +} template __global__ void assign_kernel(half *ptr, coord_t size, half value); @@ -565,6 +658,13 @@ template __global__ void template __global__ void assign_kernel(int64_t *ptr, coord_t size, int64_t value); +template __global__ void + scale_kernel(half *ptr, coord_t size, half a, half b); +template __global__ void + scale_kernel(float *ptr, coord_t size, float a, float b); +template __global__ void + scale_kernel(double *ptr, coord_t size, double a, double b); + template __global__ void add_kernel(half *dst, half const *src, size_t size); template __global__ void @@ -587,6 +687,15 @@ template __global__ void template __global__ void copy_kernel(int64_t *dst, int64_t const *src, coord_t size); +template __global__ void copy_kernel_discrete(float *dst, + float const *src, + coord_t size, + size_t *index); +template __global__ void copy_kernel_discrete(int64_t *dst, + int64_t const *src, + coord_t size, + size_t *index); + template __global__ void apply_add_with_scale(float *data_ptr, float const *grad_ptr, size_t size, @@ -604,16 +713,42 @@ template __global__ void apply_add_with_scale(int64_t *data_ptr, size_t size, int64_t scale); -template __host__ void - print_tensor(float const *ptr, size_t rect, char const *prefix); -template __host__ void - print_tensor(double const *ptr, size_t rect, char const *prefix); -template __host__ void - print_tensor(int32_t const *ptr, size_t rect, char const *prefix); -template __host__ void - print_tensor(int64_t const *ptr, size_t rect, char const *prefix); -template __host__ void - print_tensor(half const *ptr, size_t rect, char const *prefix); +template __host__ void print_tensor(float const *ptr, + size_t rect, + char const *prefix, + int shard_id); +template __host__ void print_tensor(double const *ptr, + size_t rect, + char const *prefix, + int shard_id); +template __host__ void print_tensor(int32_t const *ptr, + size_t rect, + char const *prefix, + int shard_id); +template __host__ void print_tensor(int64_t const *ptr, + size_t rect, + char const *prefix, + int shard_id); +template __host__ void print_tensor(half const *ptr, + size_t rect, + char const *prefix, + int shard_id); + +template __host__ void print_beam_tensor(float const *ptr, + size_t num_elements, + int skip, + int channel, + char const *prefix); +template __host__ void print_beam_tensor(int32_t const *ptr, + size_t num_elements, + int skip, + int channel, + char const *prefix); +template __host__ void print_beam_tensor(int64_t const *ptr, + size_t num_elements, + int skip, + int channel, + char const *prefix); template __host__ void save_tensor(float const *ptr, size_t rect, char const *file_name); @@ -626,24 +761,43 @@ template __host__ void save_tensor(int64_t const *ptr, template __host__ void save_tensor(half const *ptr, size_t rect, char const *file_name); -template __host__ float *download_tensor(float const *ptr, - size_t num_elements); -template __host__ 
half *download_tensor(half const *ptr, - size_t num_elements); -template __host__ double *download_tensor(double const *ptr, - size_t num_elements); -template __host__ int32_t *download_tensor(int32_t const *ptr, - size_t num_elements); -template __host__ int64_t *download_tensor(int64_t const *ptr, - size_t num_elements); -template __host__ bool - download_tensor(float const *ptr, float *dst, size_t num_elements); -template __host__ bool download_tensor(double const *ptr, - double *dst, - size_t num_elements); -template __host__ bool download_tensor(int32_t const *ptr, - int32_t *dst, - size_t num_elements); -template __host__ bool download_tensor(int64_t const *ptr, - int64_t *dst, - size_t num_elements); +template __host__ float *copy_tensor_dev_to_host(float const *ptr, + size_t num_elements); +template __host__ half *copy_tensor_dev_to_host(half const *ptr, + size_t num_elements); +template __host__ double *copy_tensor_dev_to_host(double const *ptr, + size_t num_elements); +template __host__ int32_t * + copy_tensor_dev_to_host(int32_t const *ptr, size_t num_elements); +template __host__ int64_t * + copy_tensor_dev_to_host(int64_t const *ptr, size_t num_elements); +template __host__ void copy_tensor_dev_to_host(float const *ptr, + float *dst, + size_t num_elements); +template __host__ void copy_tensor_dev_to_host(half const *ptr, + half *dst, + size_t num_elements); +template __host__ void copy_tensor_dev_to_host(double const *ptr, + double *dst, + size_t num_elements); +template __host__ void copy_tensor_dev_to_host(int32_t const *ptr, + int32_t *dst, + size_t num_elements); +template __host__ void copy_tensor_dev_to_host(int64_t const *ptr, + int64_t *dst, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(float *dst, + float const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(half *dst, + half const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(double *dst, + double const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(int32_t *dst, + int32_t const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(int64_t *dst, + int64_t const *src, + size_t num_elements); diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 3d299aeedd..1b65dfd869 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -54,10 +54,31 @@ bool parallel_tensor_list_overlaps(std::vector const &list1, } void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { + + // Check if the model object exists + if (model == nullptr) { + std::cout << "###PEFT DEBUGGING### Model object does not exist." + << std::endl; + return; // Early return to prevent further operations on a nullptr + } else { + std::cout << "###PEFT DEBUGGING### Model object exists." << std::endl; + } + // TODO: currently assume there is a single data-parallel pipeline // (i.e., data-parallel-degree == 1) assert(model->config.data_parallelism_degree == 1); model->config.batchSize = BatchConfig::max_tokens_per_batch(); + + // Check if the model object exists after importing config + if (model == nullptr) { + std::cout << "###PEFT DEBUGGING### Model object does not exist after " + "setting config and batch size." + << std::endl; + return; // Early return to prevent further operations on a nullptr + } else { + std::cout << "###PEFT DEBUGGING### Model object still exists." 
<< std::endl; + } + model->compile_inference(); Context ctx = model->config.lg_ctx; Runtime *runtime = model->config.lg_hlr; @@ -117,7 +138,28 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { for (int i = 0; i < op->numOutputs; i++) { ParallelTensor pt_base = op->outputs[i]; assert(tensor_buffer.find(pt_base) == tensor_buffer.end()); - + // no need to map inplace tensor + // A tensor is inplace if it shares the same region as another tensor + { + bool inplace = false; + for (int j = 0; j < op->numInputs; j++) { + if (op->inputs[j]->region == op->outputs[i]->region) { + assert(tensor_buffer.find(op->inputs[j]) != tensor_buffer.end()); + tensor_buffer[pt_base] = tensor_buffer[op->inputs[j]]; + inplace = true; + } + } + for (int j = 0; j < i; j++) { + if (op->outputs[j]->region == op->outputs[i]->region) { + assert(tensor_buffer.find(op->outputs[j]) != tensor_buffer.end()); + tensor_buffer[pt_base] = tensor_buffer[op->outputs[j]]; + inplace = true; + } + } + if (inplace) { + continue; + } + } if (op->op_type == OP_REPLICATE) { assert(op->numInputs == 1 && op->numOutputs == 1); } @@ -175,7 +217,7 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { } } if (!found_parallel_tensor) { - log_offload.print( + log_offload.debug( "Cannot find a previous tensor for operator(%d) output_idx(%d)", op_idx, i); @@ -191,6 +233,13 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { pt_base->region.get_field_space()); pt->part = runtime->get_logical_partition( ctx, pt->region, pt_base->part.get_index_partition()); + + pt->region_grad = + runtime->create_logical_region(ctx, + pt_base->region.get_index_space(), + pt_base->region.get_field_space()); + pt->part_grad = runtime->get_logical_partition( + ctx, pt->region_grad, pt_base->part.get_index_partition()); pt->machine_view = machine_views[j]; // std::cout << "output mv: " << pt->machine_view << std::endl; Domain part_domain = @@ -205,6 +254,30 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { // std::cout << std::endl; } + // Check whether we need to reset input grads + // We use a parallel tensor's region as the key + std::set reset_inputs; + for (int l = model->operators.size() - 1; l >= 0; l--) { + Op *op = model->operators[l]; + for (int i = 0; i < op->numInputs; i++) { + assert(op->inputs[i]->region != LogicalRegion::NO_REGION); + if (reset_inputs.find(op->inputs[i]->region) != reset_inputs.end()) { + // We should not reset input grads since other operators have already + // saved gradients into the region + op->reset_input_grads[i] = false; + } else if (i == 0 && (op->op_type == OP_RESIDUAL_LAYERNORM || + op->op_type == OP_RESIDUAL_RMS_NORM || + op->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM)) { + if (reset_inputs.find(op->outputs[0]->region) != reset_inputs.end()) { + op->reset_input_grads[0] = false; + } + reset_inputs.insert(op->inputs[i]->region); + } else { + reset_inputs.insert(op->inputs[i]->region); + } + } + } + // Perform fusion optimizations if (model->config.perform_fusion) { fprintf(stderr, "Applying fusion optimizations during compilation...\n"); @@ -235,34 +308,35 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { if (op->op_type == OP_INPUT || op->op_type == OP_WEIGHT) { continue; } - printf("operator[%zu]: type(%s) guid(%lu)\n", - i, - get_operator_type_name(model->operators[i]->op_type).c_str(), - model->operators[i]->op_guid); + log_inf_mgr.debug( + "operator[%zu]: type(%s) guid(%lu)\n", + i, + 
get_operator_type_name(model->operators[i]->op_type).c_str(), + model->operators[i]->op_guid); for (int j = 0; j < op->numInputs; j++) { assert(tensor_buffer.find(op->inputs[j]) != tensor_buffer.end()); LogicalRegion handle = tensor_buffer[op->inputs[j]][0]->region; - printf("\tinputs[%d] mapped_region(%d,%d,%d)\n", - j, - handle.get_index_space().get_id(), - handle.get_field_space().get_id(), - handle.get_tree_id()); + log_inf_mgr.debug("\tinputs[%d] mapped_region(%d,%d,%d)\n", + j, + handle.get_index_space().get_id(), + handle.get_field_space().get_id(), + handle.get_tree_id()); } for (int j = 0; j < op->numOutputs; j++) { LogicalRegion handle = tensor_buffer[op->outputs[j]][0]->region; - printf("\toutputs[%d] mapped_region(%d,%d,%d)\n", - j, - handle.get_index_space().get_id(), - handle.get_field_space().get_id(), - handle.get_tree_id()); + log_inf_mgr.debug("\toutputs[%d] mapped_region(%d,%d,%d)\n", + j, + handle.get_index_space().get_id(), + handle.get_field_space().get_id(), + handle.get_tree_id()); } for (int j = 0; j < op->numWeights; j++) { LogicalRegion handle = op->weights[j]->region; - printf("\tweights[%d] mapped_region(%d,%d,%d)\n", - j, - handle.get_index_space().get_id(), - handle.get_field_space().get_id(), - handle.get_tree_id()); + log_inf_mgr.debug("\tweights[%d] mapped_region(%d,%d,%d)\n", + j, + handle.get_index_space().get_id(), + handle.get_field_space().get_id(), + handle.get_tree_id()); } } } @@ -290,9 +364,9 @@ void InferenceManager::init_operators_inference(FFModel *model) { assert(op->outputs[i]->parallel_is != IndexSpace::NO_SPACE); assert(tensor_buffer[op->outputs[i]].size() > batch_index); outputs[i] = tensor_buffer[op->outputs[i]][batch_index]; - if (i > 0) { - assert(outputs[0]->machine_view == outputs[i]->machine_view); - } + // if (i > 0) { + // assert(outputs[0]->machine_view == outputs[i]->machine_view); + // } assert(outputs[i]->parallel_is != IndexSpace::NO_SPACE); } if (op->is_parallel_op()) { @@ -332,11 +406,12 @@ FutureMap InferenceManager::inference(FFModel *model, FutureMap InferenceManager::inference(FFModel *model, int index, BatchConfigFuture const &bc) { - // log_inf_mgr.print("mode(%d) num_active_tokens(%d) num_active_requests(%d)", + // log_inf_mgr.print("mode(%d) num_active_infr_tokens(%d) + // num_active_requests(%d)", // bc.get_mode(), - // bc.num_active_tokens(), + // bc.num_active_infr_tokens(), // bc.num_active_requests()); - // assert(bc.num_active_tokens() > 0 && bc.num_active_requests() > 0); + // assert(bc.num_active_infr_tokens() > 0 && bc.num_active_requests() > 0); // We currently assume that the index-th batch will be placed // on the device_index-th device (except for the experts layers) int batch_index = index % model->config.data_parallelism_degree; @@ -390,6 +465,53 @@ FutureMap InferenceManager::inference(FFModel *model, return fm; }; +void InferenceManager::peft_bwd(FFModel *model, + int index, + BatchConfigFuture const &bc) { + int batch_index = index % model->config.data_parallelism_degree; + FutureMap fm; + bool found_input_operator = false; + int last_op = model->operators.size() - 1; + // Assert that the last operator must be argmax or sampling + assert(model->operators[last_op]->op_type == OP_ARGMAX || + model->operators[last_op]->op_type == OP_ARG_TOPK || + model->operators[last_op]->op_type == OP_SAMPLING); + last_op -= 1; + while (model->operators[last_op]->op_type == OP_WEIGHT && last_op > 0) { + last_op -= 1; + } + for (int o = last_op; o >= 0; o--) { + Op *op = model->operators[o]; + if (op->op_type == 
OP_WEIGHT) { + continue; + } + if (op->op_type == OP_INPUT) { + continue; + } + std::vector inputs(op->numInputs); + std::vector outputs(op->numOutputs); + for (int i = 0; i < op->numInputs; i++) { + assert(op->inputs[i] != nullptr); + assert(op->inputs[i]->parallel_is != IndexSpace::NO_SPACE); + assert(tensor_buffer[op->inputs[i]].size() > batch_index); + inputs[i] = tensor_buffer[op->inputs[i]][batch_index]; + assert(inputs[i]->parallel_is != IndexSpace::NO_SPACE); + } + for (int i = 0; i < op->numOutputs; i++) { + assert(op->outputs[i] != nullptr); + assert(op->outputs[i]->parallel_is != IndexSpace::NO_SPACE); + if (op->op_type == OP_INPUT && + tensor_buffer[op->outputs[i]].size() == 0) { + continue; + } + assert(tensor_buffer[op->outputs[i]].size() > batch_index); + outputs[i] = tensor_buffer[op->outputs[i]][batch_index]; + assert(outputs[i]->parallel_is != IndexSpace::NO_SPACE); + } + op->peft_bwd(*model, bc, inputs, outputs); + } +}; + void InferenceManager::load_input_tokens_from_batch_config( FFModel *model, BatchConfigFuture const &bc, @@ -509,17 +631,26 @@ void FFModel::set_position_offset(int offset) { } void FFModel::compile_inference() { + std::cout << "###PEFT DEBUGGING### Entering compile_inference." << std::endl; + // Request at least four CPU processors for inference runs assert( config.cpusPerNode >= 4 && "FlexFlow Serve requires at least four CPU cores per node, please add " "`-ll:cpu 4` in the command line if you are using the C++ interface or " "set `num_cpus` in `ff.init` if you are using the Python interface"); + + std::cout << "###PEFT DEBUGGING### Configuration check passed: At least four " + "CPU cores per node." + << std::endl; Context ctx = config.lg_ctx; Runtime *runtime = config.lg_hlr; config.computationMode = COMP_MODE_INFERENCE; create_operators_from_layers(); + // Launch the graph optimize task + std::cout << "###PEFT DEBUGGING### Launching graph optimization task." + << std::endl; { FFModel *model = this; TaskLauncher launcher(GRAPH_OPTIMIZE_TASK_ID, @@ -535,7 +666,7 @@ void FFModel::compile_inference() { deserialize_graph_optimal_view(dez, best_graph, optimal_views); operators.clear(); convert_graph_to_operators(best_graph, optimal_views); - best_graph->print_dot(); + // best_graph->print_dot(); delete best_graph; for (auto const &layer : layers) { // map inputs to parallel tensor @@ -570,6 +701,14 @@ void FFModel::compile_inference() { } } } + + std::cout + << "###PEFT DEBUGGING### Operators reconstructed from optimized graph." + << std::endl; + // Perform inplace optimizations + std::cout << "###PEFT DEBUGGING### Starting inplace optimizations." + << std::endl; + loss_op = nullptr; metrics_op = nullptr; // Perform inplace optimizations @@ -609,6 +748,8 @@ void FFModel::compile_inference() { } } + // Output tensor mapping + std::cout << "###PEFT DEBUGGING### Mapping output tensors." << std::endl; for (size_t l = 0; l < operators.size(); l++) { Op *op = operators[l]; @@ -634,11 +775,14 @@ void FFModel::compile_inference() { } #ifdef FF_USE_NCCL + std::cout << "###PEFT DEBUGGING### Setting up NCCL communications." 
+ << std::endl; for (size_t l = 0; l < operators.size(); l++) { // Only create nccl for allreduce and fusedop for inference // (fusedop may include allreduces) if (operators[l]->op_type == OP_ALLREDUCE || - operators[l]->op_type == OP_FUSED) { + operators[l]->op_type == OP_PARALLEL_IDENTITY || + operators[l]->op_type == OP_LORA || operators[l]->op_type == OP_FUSED) { MachineView view = operators[l]->outputs[0]->machine_view; if (view_hash_to_nccl_comms.find(view.hash()) == view_hash_to_nccl_comms.end()) { @@ -670,6 +814,8 @@ void FFModel::compile_inference() { } } #endif + std::cout << "###PEFT DEBUGGING### compile_inference completed successfully." + << std::endl; } std::string join_path(std::vector const &paths) { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 4c67de1aa9..f46630db3c 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -47,6 +47,7 @@ #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" +#include "flexflow/ops/lora_linear.h" #include "flexflow/ops/noop.h" #include "flexflow/ops/pool_2d.h" #include "flexflow/ops/reduce.h" @@ -66,6 +67,7 @@ #include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/fused_parallel_op.h" +#include "flexflow/parallel_ops/parallel_identity.h" #include "flexflow/parallel_ops/partition.h" #include "flexflow/parallel_ops/reduction.h" #include "flexflow/parallel_ops/replicate.h" @@ -77,6 +79,7 @@ #include #include #include +#include namespace FlexFlow { @@ -135,19 +138,21 @@ Op::Op(FFModel &model, std::string pcname; if (_name == NULL) { pcname = get_operator_type_name(op_type); + pcname = pcname + "_" + std::to_string(op_guid); } else { pcname = std::string(_name); } - pcname = pcname + "_" + std::to_string(op_guid); assert(pcname.length() < MAX_OPNAME); + // std::cout << "Creating operator: " << pcname << std::endl; std::strcpy(name, pcname.c_str()); + // std::cout << "copied name into name var: " << this->name << std::endl; for (int i = 0; i < numInputs; i++) { assert(tensors[i] != NULL); inputs[i] = tensors[i]; } for (int i = 0; i < numInputs; i++) { - trainableInputs[i] = true; - // resetInputGrads[i] = true; + trainable_inputs[i] = true; + reset_input_grads[i] = true; } for (int i = 0; i < MAX_NUM_OUTPUTS; i++) { outputs[i] = nullptr; @@ -191,8 +196,8 @@ Op::Op(FFModel &model, } } for (int i = 0; i < numInputs; i++) { - trainableInputs[i] = true; - // resetInputGrads[i] = true; + trainable_inputs[i] = true; + reset_input_grads[i] = true; } for (int i = 0; i < MAX_NUM_OUTPUTS; i++) { outputs[i] = NULL; @@ -1245,7 +1250,8 @@ void Op::set_argumentmap_for_init_inference(FFModel const &ff, int idx = 0; \ for (PointInRectIterator it(rect); it(); it++) { \ FFHandler handle = ff.handlers[view.get_device_id(*it)]; \ - if (op_type == OP_ALLREDUCE) { \ + if (op_type == OP_ALLREDUCE || op_type == OP_LORA || \ + op_type == OP_PARALLEL_IDENTITY) { \ ncclComm_t *nccl_comms = ff.find_nccl_comms(view); \ handle.ncclComm = nccl_comms[idx++]; \ } \ @@ -1475,10 +1481,12 @@ bool Op::get_weight_parameter(TNParameter tnp, return true; } +#ifdef DEADCODE OpMeta::OpMeta(FFHandler _handle) : handle(_handle), profiling(false), inference_debugging(false) { for (int i = 0; i < MAX_NUM_INPUTS; i++) { - trainableInputs[i] = true; + trainable_inputs[i] = true; + reset_input_grads[i] = true; } for (int i = 0; i < MAX_NUM_INPUTS; i++) { input_type[i] = DT_NONE; @@ -1490,9 +1498,17 @@ OpMeta::OpMeta(FFHandler _handle) 
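// decoding_step appears to count forward (decoding) invocations of an operator
// for inference debugging; the bwd_step field added below presumably plays the
// same role for PEFT backward passes, so both counters start at zero when an
// OpMeta is constructed (assumption, not stated in the patch).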
output_type[i] = DT_NONE; } decoding_step = 0; + bwd_step = 0; } +#endif -OpMeta::OpMeta(FFHandler _handle, Op const *op) : OpMeta(_handle) { +OpMeta::OpMeta(FFHandler _handle, Op const *op) + : handle(_handle), profiling(op->profiling), + inference_debugging(op->inference_debugging) { + for (int i = 0; i < op->numInputs; i++) { + trainable_inputs[i] = op->trainable_inputs[i]; + reset_input_grads[i] = op->reset_input_grads[i]; + } for (int i = 0; i < op->numInputs; i++) { input_type[i] = op->inputs[i]->data_type; } @@ -1503,6 +1519,7 @@ OpMeta::OpMeta(FFHandler _handle, Op const *op) : OpMeta(_handle) { output_type[i] = op->outputs[i]->data_type; } decoding_step = 0; + bwd_step = 0; } FFRuntime::FFRuntime(FFConfig &config) { @@ -1520,6 +1537,10 @@ FFRuntime::FFRuntime(FFConfig &config) { info.workSpaceSize = config.workSpaceSize; info.offload_reserve_space_size = config.cpu_offload ? config.offload_reserve_space_size : 0; + info.peft_activation_reserve_space_size = + config.enable_peft ? config.peft_activation_reserve_space_size : 0; + info.peft_weight_reserve_space_size = + config.enable_peft ? config.peft_weight_reserve_space_size : 0; info.quantization_type = config.quantization_type; info.allowTensorOpMathConversion = config.allow_tensor_op_math_conversion; argmap.set_point(*it, TaskArgument(&info, sizeof(FFInitInfo))); @@ -1546,9 +1567,32 @@ FFRuntime *ffruntime_singleton = nullptr; int FFModel::model_counter = 0; +void make_debug_dirs() { + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + std::string debug_dir_ = + ff_cache_path ? std::string(ff_cache_path) + "/debug/flexflow" + : std::string("~/.cache/flexflow/debug/flexflow"); + wordexp_t p; + wordexp(debug_dir_.c_str(), &p, 0); + debug_dir_ = p.we_wordv[0]; + wordfree(&p); + fs::path debug_dir = debug_dir_; + if (fs::exists(debug_dir)) { + fs::remove_all(debug_dir); + } + fs::create_directories(debug_dir); + assert(fs::is_directory(debug_dir)); + std::vector debug_subdirs = {"fwd", "bwd", "optim", "weights"}; + for (auto const &subdir : debug_subdirs) { + fs::path subdir_path = debug_dir / subdir; + fs::create_directory(subdir_path); + } +} + FFModel::FFModel(FFConfig &_config, bool cpu_offload) : op_global_guid(OP_GUID_FIRST_VALID), layer_global_guid(LAYER_GUID_FIRST_VALID), + peft_model_global_guid(PEFT_MODEL_ID_FIRST_VALID), tensor_global_guid(TENSOR_GUID_FIRST_VALID), parallel_tensor_global_guid(PARALLEL_TENSOR_GUID_FIRST_VALID), node_global_guid(NODE_GUID_FIRST_VALID), current_transformer_layer_id(0), @@ -1586,6 +1630,9 @@ FFModel::FFModel(FFConfig &_config, bool cpu_offload) for (int idx = 0; idx < config.workersPerNode * config.numNodes; idx++) { handlers[idx] = ffruntime_singleton->handlers[idx]; } + if (config.inference_debugging) { + make_debug_dirs(); + } model_id = model_counter++; } @@ -2932,7 +2979,8 @@ bool FFModel::apply_fusion( // don't fuse parallel op except allReduce since they have different // parallel_is in forward/backward if (operators[l]->is_parallel_op() && - operators[l]->op_type != OP_ALLREDUCE) { + operators[l]->op_type != OP_ALLREDUCE && + operators[l]->op_type != OP_PARALLEL_IDENTITY) { continue; } size_t start = 0; @@ -2978,7 +3026,8 @@ bool FFModel::apply_fusion( // don't fuse parallel op except allReduce since they have different // parallel_is in forward/backward if (operators[i]->is_parallel_op() && - operators[i]->op_type != OP_ALLREDUCE) { + operators[i]->op_type != OP_ALLREDUCE && + operators[i]->op_type != OP_PARALLEL_IDENTITY) { continue; } fused_op = new FusedOp(*this, 
operators[i]); @@ -3010,8 +3059,19 @@ bool FFModel::apply_fusion( found = k; } } - assert(found >= 0); - op->inputs[idx] = fused_op->outputs[found]; + if (found >= 0) { + op->inputs[idx] = fused_op->outputs[found]; + } else { + for (int k = 0; k < fused_op->numInputs; k++) { + if (fused_op->inputs[k]->region == + op->inputs[idx]->region) { + assert(found == -1); + found = k; + } + } + assert(found >= 0); + op->inputs[idx] = fused_op->inputs[found]; + } } } // Insert op @@ -3287,6 +3347,12 @@ Op *FFModel::create_operator_from_layer( operators.push_back(op); return op; } + // PEFT layers + case OP_LORA: { + Op *op = LoraLinear::create_operator_from_layer(*this, layer, inputs); + operators.push_back(op); + return op; + } default: assert(false); } @@ -3313,9 +3379,123 @@ bool FFModel::is_mlp_block(int layer_idx) const { return false; } +bool FFModel::need_to_add_combine(int layer_idx) const { + if (config.computationMode != COMP_MODE_INFERENCE || + config.tensor_parallelism_degree == 1 || layers.size() <= 2) { + return false; + } + auto const &l = layers[layer_idx]; + // softmax followed by argmax/arg_topk: add combine before softmax + if (layer_idx == layers.size() - 2) { + auto const &l_next = layers[layer_idx + 1]; + if (l->op_type == OP_SOFTMAX && + (l_next->op_type == OP_ARG_TOPK || l_next->op_type == OP_ARGMAX)) { + return true; + } else { + return false; + } + } + // argmax/arg_topk not precedent by softmax: add combine before + // argmax/arg_topk + if (layer_idx == layers.size() - 1 && + (l->op_type == OP_ARG_TOPK || l->op_type == OP_ARGMAX)) { + auto const &l_prev = layers[layer_idx - 1]; + if (l_prev->op_type == OP_SOFTMAX) { + return false; + } + return true; + } + return false; +} + +bool FFModel::need_to_add_allreduce(int layer_idx) const { + auto const &l = layers[layer_idx]; + if (config.computationMode == COMP_MODE_INFERENCE && + config.tensor_parallelism_degree > 1 && + (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || + l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || + // mlp layer + is_mlp_block(layer_idx) || + // llama mlp layer + (l->op_type == OP_LINEAR && layer_idx >= 2 && + layers[layer_idx - 1]->op_type == OP_GELU && + layers[layer_idx - 2]->op_type == OP_LINEAR) || + // LLAMA without element-wise operator fusion + (l->op_type == OP_LINEAR && layer_idx >= 5 && + layers[layer_idx - 1]->op_type == OP_EW_MUL && + layers[layer_idx - 2]->op_type == OP_EW_MUL && + layers[layer_idx - 3]->op_type == OP_SIGMOID && + layers[layer_idx - 4]->op_type == OP_LINEAR && + layers[layer_idx - 5]->op_type == OP_LINEAR) || + // LLAMA with element-wise operator fusion + (l->op_type == OP_LINEAR && layer_idx >= 3 && + layers[layer_idx - 1]->op_type == OP_SIGMOID_SILU_MULTI && + layers[layer_idx - 2]->op_type == OP_LINEAR && + layers[layer_idx - 3]->op_type == OP_LINEAR))) { + return true; + } + return false; +} + +#ifdef DEADCODE +bool FFModel::need_to_add_parallel_identity(int layer_idx) const { + auto const &l = layers[layer_idx]; + // add parallel identity (allreduce in the backward pass) before the lm head + // we find the lm head by looking for the linear layer right after a residual + // rms norm / layer norm, and before a softmax, followed by + // argmax/argtopk/sampling + if (config.computationMode == COMP_MODE_INFERENCE && + config.tensor_parallelism_degree > 1 && + ((l->op_type == OP_RESIDUAL_RMS_NORM || + l->op_type == OP_RESIDUAL_LAYERNORM) && + // there are at least 2 layers before the norm, and at least 3 following + // the norm + layer_idx >= 2 && layer_idx < 
layers.size() - 3 && + // norm is followed by linear layer (lm head) + layers[layer_idx + 1]->op_type == OP_LINEAR && + // lm head is followed by softmax + layers[layer_idx + 2]->op_type == OP_SOFTMAX && + // softmax is followed by argmax/argtopk/sampling + (layers[layer_idx + 3]->op_type == OP_ARG_TOPK || + layers[layer_idx + 3]->op_type == OP_SAMPLING || + layers[layer_idx + 3]->op_type == OP_ARGMAX || + layers[layer_idx + 3]->op_type == OP_SCALAR_TRUE_DIV))) { + return true; + } + return false; +} +#endif +bool FFModel::need_to_add_parallel_identity(int layer_idx) const { + auto const &l = layers[layer_idx]; + // add parallel identity (allreduce in the backward pass) before the lm head + // we find the lm head by looking for the linear layer right after a residual + // rms norm / layer norm, and before a softmax, followed by + // argmax/argtopk/sampling + if (config.computationMode == COMP_MODE_INFERENCE && + config.tensor_parallelism_degree > 1 && + ((l->op_type == OP_RMS_NORM || l->op_type == OP_RESIDUAL_RMS_NORM || + l->op_type == OP_LAYERNORM || l->op_type == OP_RESIDUAL_LAYERNORM) && + // there are at least 2 layers before the norm, and at least 1 following + // the norm + layer_idx >= 2 && layer_idx < layers.size() - 1 && + // norm is followed by linear layer or attention + (layers[layer_idx + 1]->op_type == OP_LINEAR || + layers[layer_idx + 1]->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || + layers[layer_idx + 1]->op_type == + OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || + layers[layer_idx + 1]->op_type == + OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION))) { + return true; + } + return false; +} + void FFModel::create_operators_from_layers() { std::map tensors_to_parallel_tensors; - // for (auto const &l : layers) { + std::map + op_before_allreduce_tensors_to_parallel_tensors; + std::map transformer_layer_allreduce_count; + std::map transformer_layer_parallel_identity_count; for (int layer_idx = 0; layer_idx < layers.size(); layer_idx++) { auto const &l = layers[layer_idx]; std::vector inputs; @@ -3323,14 +3503,19 @@ void FFModel::create_operators_from_layers() { // create new input tensors assert(tensors_to_parallel_tensors.find(l->inputs[i]) != tensors_to_parallel_tensors.end()); - inputs.push_back(tensors_to_parallel_tensors[l->inputs[i]]); + if (l->op_type == OP_LORA && + op_before_allreduce_tensors_to_parallel_tensors.find(l->inputs[i]) != + op_before_allreduce_tensors_to_parallel_tensors.end()) { + inputs.push_back( + op_before_allreduce_tensors_to_parallel_tensors[l->inputs[i]]); + } else { + inputs.push_back(tensors_to_parallel_tensors[l->inputs[i]]); + } } Op *op = nullptr; - // add a combine before arg_topk - if (config.computationMode == COMP_MODE_INFERENCE && - config.tensor_parallelism_degree > 1 && - (l->op_type == OP_ARG_TOPK || l->op_type == OP_SOFTMAX || - l->op_type == OP_ARGMAX)) { + // add a combine before last arg_max / arg_topk or before second-to-last + // softmax + if (need_to_add_combine(layer_idx)) { std::vector partitioned_inputs; assert(inputs.size() == 1); Combine *comb = new Combine(*this, @@ -3353,37 +3538,97 @@ void FFModel::create_operators_from_layers() { // config.tensor_parallelism_degree); // operators.push_back(repl); // op = repl; - } else if (config.computationMode == COMP_MODE_INFERENCE && - config.tensor_parallelism_degree > 1 && - (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || - l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || - // mlp layer - is_mlp_block(layer_idx) || - // llama mlp layer - (l->op_type == OP_LINEAR && layer_idx >= 2 && - 
layers[layer_idx - 1]->op_type == OP_GELU && - layers[layer_idx - 2]->op_type == OP_LINEAR) || - // LLAMA without element-wise operator fusion - (l->op_type == OP_LINEAR && layer_idx >= 5 && - layers[layer_idx - 1]->op_type == OP_EW_MUL && - layers[layer_idx - 2]->op_type == OP_EW_MUL && - layers[layer_idx - 3]->op_type == OP_SIGMOID && - layers[layer_idx - 4]->op_type == OP_LINEAR && - layers[layer_idx - 5]->op_type == OP_LINEAR) || - // LLAMA with element-wise operator fusion - (l->op_type == OP_LINEAR && layer_idx >= 3 && - layers[layer_idx - 1]->op_type == OP_SIGMOID_SILU_MULTI && - layers[layer_idx - 2]->op_type == OP_LINEAR && - layers[layer_idx - 3]->op_type == OP_LINEAR))) { + assert(op->numOutputs == l->numOutputs); + for (int i = 0; i < op->numOutputs; i++) { + assert(tensors_to_parallel_tensors.find(l->outputs[i]) == + tensors_to_parallel_tensors.end()); + tensors_to_parallel_tensors[l->outputs[i]] = op->outputs[i]; + } + } else if (need_to_add_allreduce(layer_idx)) { assert(op->numOutputs == 1); - AllReduce *allreduce = - new AllReduce(*this, op->outputs[0], op->outputs[0]->num_dims - 1); + size_t transformer_layer_id = op->layer_guid.transformer_layer_id; + if (transformer_layer_allreduce_count.find(transformer_layer_id) == + transformer_layer_allreduce_count.end()) { + transformer_layer_allreduce_count[transformer_layer_id] = 0; + } + std::string allreduce_name = std::string( + "layers." + std::to_string(transformer_layer_id) + ".allreduce." + + std::to_string( + transformer_layer_allreduce_count[transformer_layer_id])); + transformer_layer_allreduce_count[transformer_layer_id]++; + AllReduce *allreduce = new AllReduce(*this, + op->outputs[0], + op->outputs[0]->num_dims - 1, + allreduce_name.c_str()); operators.push_back(allreduce); + op_before_allreduce_tensors_to_parallel_tensors[l->outputs[0]] = + op->outputs[0]; op = allreduce; + assert(op->numOutputs == l->numOutputs); + for (int i = 0; i < op->numOutputs; i++) { + assert(tensors_to_parallel_tensors.find(l->outputs[i]) == + tensors_to_parallel_tensors.end()); + tensors_to_parallel_tensors[l->outputs[i]] = op->outputs[i]; + } + } else if (need_to_add_parallel_identity(layer_idx)) { + assert(op->numOutputs == 1 || op->numOutputs == 2); + size_t transformer_layer_id = op->layer_guid.transformer_layer_id; + if (transformer_layer_parallel_identity_count.find( + transformer_layer_id) == + transformer_layer_parallel_identity_count.end()) { + transformer_layer_parallel_identity_count[transformer_layer_id] = 0; + } + std::string parallel_identity_name = std::string( + "layers." + std::to_string(transformer_layer_id) + + ".parallel_identity." 
+ + std::to_string( + transformer_layer_parallel_identity_count[transformer_layer_id])); + transformer_layer_parallel_identity_count[transformer_layer_id]++; + ParallelIdentity *parallel_identity = nullptr; + if (op->numOutputs == 1) { + parallel_identity = + new ParallelIdentity(*this, + op->outputs[0], + op->outputs[0]->num_dims - 1, + parallel_identity_name.c_str()); + } else if (op->numOutputs == 2) { + parallel_identity = + new ParallelIdentity(*this, + op->outputs[1], + op->outputs[1]->num_dims - 1, + parallel_identity_name.c_str()); + // output 0 is taken from the residual rms norm + assert(tensors_to_parallel_tensors.find(l->outputs[0]) == + tensors_to_parallel_tensors.end()); + tensors_to_parallel_tensors[l->outputs[0]] = op->outputs[0]; + } else { + assert(false && + "Op needing ParallelIdentity has unexpected number of outputs"); + } + operators.push_back(parallel_identity); + assert(op->numOutputs == l->numOutputs); + // last output is taken from the parallel identity + assert(tensors_to_parallel_tensors.find(l->outputs[op->numOutputs - 1]) == + tensors_to_parallel_tensors.end()); + tensors_to_parallel_tensors[l->outputs[l->numOutputs - 1]] = + parallel_identity->outputs[0]; + op = parallel_identity; + } else { + assert(op->numOutputs == l->numOutputs); + for (int i = 0; i < op->numOutputs; i++) { + assert(tensors_to_parallel_tensors.find(l->outputs[i]) == + tensors_to_parallel_tensors.end()); + tensors_to_parallel_tensors[l->outputs[i]] = op->outputs[i]; + } } - assert(op->numOutputs == l->numOutputs); - for (int i = 0; i < op->numOutputs; i++) { - tensors_to_parallel_tensors[l->outputs[i]] = op->outputs[i]; + // if the operator has op_type==OP_LORA, and the second-to-last operator in + // the operators vector has op_type==OP_ALLREDUCE, move the operator before + // the ALLREDUCE + if (op->op_type == OP_LORA && operators.size() > 1 && + operators[operators.size() - 2]->op_type == OP_ALLREDUCE) { + Op *tmp = operators[operators.size() - 2]; + operators[operators.size() - 2] = operators[operators.size() - 1]; + operators[operators.size() - 1] = tmp; } } } @@ -3424,7 +3669,7 @@ void FFModel::compile(LossType loss_type, deserialize_graph_optimal_view(dez, best_graph, optimal_views); operators.clear(); convert_graph_to_operators(best_graph, optimal_views); - best_graph->print_dot(); + // best_graph->print_dot(); delete best_graph; for (auto const &layer : layers) { // map inputs to parallel tensor @@ -3549,7 +3794,7 @@ void FFModel::compile(LossType loss_type, for (int i = 0; i < op->numInputs; i++) { assert(op->inputs[i]->owner_op != nullptr); if (op->inputs[i]->owner_op->op_type == OP_INPUT) { - op->trainableInputs[i] = false; + op->trainable_inputs[i] = false; } } } @@ -3745,9 +3990,18 @@ bool FFModel::check_operators_integrity( } for (int i = 0; i < fused->op_num_outputs[op]; i++) { int my_off = fused->op_output_idx[i + ooff]; - assert(fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT); - assert(FusedOp::use_same_regions( - fused->outputs[my_off], old_op->outputs[i], pt_mapping)); + assert(fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT || + (fused->op_output_source[i + ooff] == FusedOp::SOURCE_INPUT && + (old_op->op_type == OP_RESIDUAL_LAYERNORM || + old_op->op_type == OP_RESIDUAL_RMS_NORM || + old_op->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM))); + if (fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT) { + assert(FusedOp::use_same_regions( + fused->outputs[my_off], old_op->outputs[i], pt_mapping)); + } else { + assert(FusedOp::use_same_regions( 
+ fused->inputs[my_off], old_op->outputs[i], pt_mapping)); + } } ioff += fused->op_num_inputs[op]; woff += fused->op_num_weights[op]; @@ -4086,6 +4340,12 @@ struct DefaultConfig { const static bool searchOverlapBackwardUpdate = false; const static size_t offloadReserveSpaceSize = (size_t)8 * 1024 * 1024 * 1024; // 8 GB + // PEFT related fields + const static bool enablePeft = false; + const static size_t peftActivationReserveSpaceSize = + (size_t)1 * 1024 * 1024 * 1024; // 1GB + const static size_t peftWeightReserveSpaceSize = + (size_t)1 * 1024 * 1024 * 1024; // 1GB const static bool cpuOffload = false; const static bool onlyDataParallel = true; const static bool enableSampleParallel = true; @@ -4122,6 +4382,11 @@ FFConfig::FFConfig() { computationMode = COMP_MODE_TRAINING; cpu_offload = DefaultConfig::cpuOffload; offload_reserve_space_size = DefaultConfig::offloadReserveSpaceSize; + // PEFT related fields + enable_peft = DefaultConfig::enablePeft; + peft_activation_reserve_space_size = + DefaultConfig::peftActivationReserveSpaceSize; + peft_weight_reserve_space_size = DefaultConfig::peftWeightReserveSpaceSize; quantization_type = DT_NONE; only_data_parallel = DefaultConfig::onlyDataParallel; data_parallelism_degree = 1; @@ -4248,6 +4513,18 @@ void FFConfig::parse_args(char **argv, int argc) { quantization_type = DT_INT8; continue; } + if ((!strcmp(argv[i], "-enable-peft"))) { + enable_peft = true; + continue; + } + if (!strcmp(argv[i], "-peft-activation-reserve-space-size")) { + peft_activation_reserve_space_size = atoll(argv[++i]) * 1024 * 1024; + continue; + } + if (!strcmp(argv[i], "-peft-weight-reserve-space-size")) { + peft_weight_reserve_space_size = atoll(argv[++i]) * 1024 * 1024; + continue; + } if ((!strcmp(argv[i], "--only-data-parallel"))) { only_data_parallel = true; continue; @@ -5383,6 +5660,38 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(RESIDUAL_LAYERNORM_BWD_TASK_ID, + "residual_layernorm_bwd_task"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "residual_layernorm_backward_task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + { + TaskVariantRegistrar registrar(RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, + "residual_layernorm_peft_bwd_task"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "residual_layernorm_peft_bwd_task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // AddBiasResidualLayerNorm task { TaskVariantRegistrar registrar(ADD_BIAS_RESIDUAL_LAYERNORM_INIT_TASK_ID, @@ -5419,6 +5728,40 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(ADD_BIAS_RESIDUAL_LAYERNORM_BWD_TASK_ID, + "AddBiasResidualLayerNorm Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant< + AddBiasResidualLayerNorm::backward_task>( + registrar, "AddBiasResidualLayerNorm Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + 
} + { + TaskVariantRegistrar registrar(ADD_BIAS_RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, + "AddBiasResidualLayerNorm PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant< + AddBiasResidualLayerNorm::peft_bwd_task>( + registrar, "AddBiasResidualLayerNorm PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // SigmoidSiluMulti task { TaskVariantRegistrar registrar(SIGMOID_SILU_MULTI_INIT_TASK_ID, @@ -5452,6 +5795,38 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(SIGMOID_SILU_MULTI_BWD_TASK_ID, + "SigmoidSiluMulti Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "SigmoidSiluMulti Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + { + TaskVariantRegistrar registrar(SIGMOID_SILU_MULTI_PEFT_BWD_TASK_ID, + "SigmoidSiluMulti PEFT Bwd"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "SigmoidSiluMulti PEFT Bwd Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // rms norm task { TaskVariantRegistrar registrar(RMSNORM_INIT_TASK_ID, "rmsnorm_init_task"); @@ -5495,7 +5870,36 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } - // rms norm task + { + TaskVariantRegistrar registrar(RMSNORM_BWD_TASK_ID, "RMS Norm Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "RMS Norm Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(RMSNORM_PEFT_BWD_TASK_ID, + "RMS Norm PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "RMS Norm PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + // residual rms norm task { TaskVariantRegistrar registrar(RESIDUAL_RMSNORM_INIT_TASK_ID, "Residual RMS Norm Init"); @@ -5519,7 +5923,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar.set_leaf(); if (pre_register) { Runtime::preregister_task_variant( - registrar, "RMS Norm Inference Task"); + registrar, "Residual RMS Norm Inference Task"); } else { if (enable_control_replication) { registrar.global_registration = false; @@ -5528,6 +5932,51 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(RESIDUAL_RMSNORM_BWD_TASK_ID, + "Residual RMS Norm Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Residual RMS Norm Backward 
Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(RESIDUAL_RMSNORM_PEFT_BWD_TASK_ID, + "Residual RMS Norm PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Residual RMS Norm PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(LAYERNORM_PEFT_BWD_TASK_ID, + "layernorm_peft_bwd_task"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "peft_bwd_task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } { TaskVariantRegistrar registrar(LAYERNORM_BWD_TASK_ID, "layernorm_bwd_task"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); @@ -5571,6 +6020,21 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(LINEAR_PEFT_BWD_TASK_ID, + "Linear PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Linear PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } { TaskVariantRegistrar registrar(LINEAR_FWD_TASK_ID, "Linear Forward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); @@ -5699,6 +6163,22 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(SOFTMAX_PEFT_BWD_TASK_ID, + "Softmax PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Softmax PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + // compute Loss { TaskVariantRegistrar registrar(LOSS_BWD_TASK_ID, "Loss Backward"); @@ -6303,6 +6783,24 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar( + INC_MULTIHEAD_SELF_ATTENTION_PEFT_BWD_TASK_ID, + "IncMultiHeadSelfAttention PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant< + IncMultiHeadSelfAttention::peft_bwd_task>( + registrar, "IncMultiHeadSelfAttention PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // speculative MultiHeadAttention task { TaskVariantRegistrar registrar( @@ -6380,6 +6878,54 @@ void register_flexflow_internal_tasks(Runtime *runtime, TreeIncMultiHeadSelfAttention::inference_task>(registrar); } } + // PEFT tasks + // LoraLinear tasks + { + TaskVariantRegistrar registrar(LORA_LINEAR_INIT_TASK_ID, "LoraLinear Init"); + 
registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "LoraLinear Init Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + { + TaskVariantRegistrar registrar(LORA_LINEAR_INF_TASK_ID, + "LoraLinear Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "LoraLinear Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(LORA_LINEAR_PEFT_BWD_TASK_ID, + "LoraLinear PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "LoraLinear PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + // NoOp { TaskVariantRegistrar registrar(NOOP_INIT_TASK_ID, "Weight NCCL Init"); @@ -6411,31 +6957,47 @@ void register_flexflow_internal_tasks(Runtime *runtime, } } { - TaskVariantRegistrar registrar(FUSEDOP_FWD_TASK_ID, "FusedOp Forward"); + TaskVariantRegistrar registrar(FUSEDOP_INF_TASK_ID, "FusedOp Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( - registrar, "FusedOp Forward Task"); + Runtime::preregister_task_variant( + registrar, "FusedOp Inference Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant(registrar); + runtime->register_task_variant(registrar); } } { - TaskVariantRegistrar registrar(FUSEDOP_INF_TASK_ID, "FusedOp Inference"); + TaskVariantRegistrar registrar(FUSEDOP_PEFT_BWD_TASK_ID, + "FusedOp PEFT Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( - registrar, "FusedOp Inference Task"); + Runtime::preregister_task_variant( + registrar, "FusedOp PEFT Backward Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant(registrar); + runtime->register_task_variant(registrar); + } + } + + { + TaskVariantRegistrar registrar(FUSEDOP_FWD_TASK_ID, "FusedOp Forward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "FusedOp Forward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); } } { @@ -6529,6 +7091,20 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(COMBINE_INF_TASK_ID, "Combine Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Combine Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } { 
TaskVariantRegistrar registrar(COMBINE_BWD_TASK_ID, "Combine Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); @@ -6543,6 +7119,21 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(COMBINE_PEFT_BWD_TASK_ID, + "Combine PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Combine PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } // Replicate { TaskVariantRegistrar registrar(REPLICATE_INIT_TASK_ID, "Replicate Init"); @@ -6586,6 +7177,21 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(REPLICATE_PEFT_BWD_TASK_ID, + "Replicate PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Replicate PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } // Reduction { TaskVariantRegistrar registrar(REDUCTION_INIT_TASK_ID, "Reduction Init"); @@ -6644,6 +7250,34 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(ALLREDUCE_FWD_TASK_ID, "AllReduce Forward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "AllReduce Forward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(ALLREDUCE_BWD_TASK_ID, "AllReduce Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "AllReduce Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } { TaskVariantRegistrar registrar(ALLREDUCE_INF_TASK_ID, "AllReduce Inference"); @@ -6660,33 +7294,101 @@ void register_flexflow_internal_tasks(Runtime *runtime, } } { - TaskVariantRegistrar registrar(ALLREDUCE_FWD_TASK_ID, "AllReduce Forward"); + TaskVariantRegistrar registrar(ALLREDUCE_PEFT_BWD_TASK_ID, + "AllReduce PEFT Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( - registrar, "AllReduce Forward Task"); + Runtime::preregister_task_variant( + registrar, "AllReduce PEFT Backward Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant(registrar); + runtime->register_task_variant(registrar); } } + // ParallelIdentity { - TaskVariantRegistrar registrar(ALLREDUCE_BWD_TASK_ID, "AllReduce Backward"); + TaskVariantRegistrar registrar(PARALLEL_IDENTITY_INIT_TASK_ID, + "ParallelIdentity Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( - registrar, 
"AllReduce Backward Task"); + Runtime::preregister_task_variant( + registrar, "ParallelIdentity init Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant(registrar); + runtime->register_task_variant( + registrar); } } + { + TaskVariantRegistrar registrar(PARALLEL_IDENTITY_FWD_TASK_ID, + "ParallelIdentity Forward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "ParallelIdentity Forward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(PARALLEL_IDENTITY_BWD_TASK_ID, + "ParallelIdentity Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "ParallelIdentity Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + { + TaskVariantRegistrar registrar(PARALLEL_IDENTITY_INF_TASK_ID, + "ParallelIdentity Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "ParallelIdentity Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + { + TaskVariantRegistrar registrar(PARALLEL_IDENTITY_PEFT_BWD_TASK_ID, + "ParallelIdentity PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "ParallelIdentity PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + // FusedParallelOp { TaskVariantRegistrar registrar(FUSED_PARALLELOP_FWD_TASK_ID, diff --git a/src/runtime/model.cpp b/src/runtime/model.cpp index 62f6b89b7f..9f3e2fbb10 100644 --- a/src/runtime/model.cpp +++ b/src/runtime/model.cpp @@ -165,8 +165,8 @@ FFHandler 0, Realm::ProfilingRequestSet()) .wait(); - handle.batch_config_metadata = - workspaceInst.pointer_untyped(0, sizeof(char)); + handle.batch_config_metadata = static_cast( + workspaceInst.pointer_untyped(0, sizeof(char))); } else { handle.batch_config_metadata = nullptr; } diff --git a/src/runtime/model.cu b/src/runtime/model.cu index fd39ed0db0..5dab73e1a4 100644 --- a/src/runtime/model.cu +++ b/src/runtime/model.cu @@ -14,6 +14,8 @@ */ #include "flexflow/model.h" #include "flexflow/utils/cuda_helper.h" +#include "flexflow/utils/memory_allocator.h" +#include "flexflow/utils/peft_weight_allocator.h" namespace FlexFlow { // declare Legion names @@ -161,12 +163,51 @@ FFHandler 0, Realm::ProfilingRequestSet()) .wait(); - handle.batch_config_metadata = - workspaceInst.pointer_untyped(0, sizeof(char)); + handle.batch_config_metadata = static_cast( + workspaceInst.pointer_untyped(0, sizeof(char))); } else { handle.batch_config_metadata = nullptr; } + if (info->peft_activation_reserve_space_size > 0) { + // allocate memory for peft activation reserve space + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + 
.best_affinity_to(task->target_proc) + .first(); + Realm::RegionInstance workspaceInst; + handle.peft_activation_allocator = new MemoryAllocator(gpu_mem); + handle.peft_activation_allocator->create_legion_instance( + workspaceInst, info->peft_activation_reserve_space_size); + } else { + handle.peft_activation_allocator = nullptr; + } + + if (info->peft_weight_reserve_space_size > 0) { + // allocate memory for peft weight reserve space + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + Realm::Rect<1, coord_t> bounds( + Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(info->peft_weight_reserve_space_size - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance workspaceInst; + Realm::RegionInstance::create_instance(workspaceInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + void *ptr = workspaceInst.pointer_untyped(0, sizeof(char)); + handle.peft_weight_allocator = + new PEFTWeightAllocator(ptr, info->peft_weight_reserve_space_size); + } else { + handle.peft_weight_allocator = nullptr; + } // checkCUDA(cudaMalloc(&handle.workSpace, handle.workSpaceSize)); #ifdef FF_USE_NCCL handle.ncclComm = NULL; diff --git a/src/runtime/operator.cc b/src/runtime/operator.cc index 36ac02a3a3..dcac52397a 100644 --- a/src/runtime/operator.cc +++ b/src/runtime/operator.cc @@ -2,14 +2,7 @@ #include "flexflow/ffconst_utils.h" #include "flexflow/simulator.h" #include - -#include -#include -#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) -#include "flexflow/utils/cuda_helper.h" -#else -#include "flexflow/utils/hip_helper.h" -#endif +#include namespace FlexFlow { @@ -25,4 +18,31 @@ size_t Op::get_params_hash() const { get_operator_type_name(this->op_type)); } +fs::path get_dst_folder(std::string const &subdir, + int step_idx, + int shard_idx, + bool before_kernel) { + std::vector debug_subdirs = {"fwd", "bwd", "optim", "weights"}; + assert(std::find(debug_subdirs.begin(), debug_subdirs.end(), subdir) != + debug_subdirs.end()); + std::string step_substr = "step_" + std::to_string(step_idx); + if (before_kernel) { + step_substr += "_pre"; + } + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + std::string debug_dir_ = + ff_cache_path ? 
std::string(ff_cache_path) + "/debug/flexflow" + : std::string("~/.cache/flexflow/debug/flexflow"); + wordexp_t p; + wordexp(debug_dir_.c_str(), &p, 0); + debug_dir_ = p.we_wordv[0]; + wordfree(&p); + fs::path debug_dir = debug_dir_; + assert(fs::is_directory(debug_dir)); + fs::path dst_folder = + debug_dir / subdir / step_substr / ("shard_" + std::to_string(shard_idx)); + fs::create_directories(dst_folder); + return dst_folder; +} + }; // namespace FlexFlow \ No newline at end of file diff --git a/src/runtime/operator_params.cc b/src/runtime/operator_params.cc index 6b2d223f54..e9feb86eb5 100644 --- a/src/runtime/operator_params.cc +++ b/src/runtime/operator_params.cc @@ -42,6 +42,7 @@ #include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/fused_parallel_op.h" +#include "flexflow/parallel_ops/parallel_identity.h" #include "flexflow/parallel_ops/partition.h" #include "flexflow/parallel_ops/reduction.h" #include "flexflow/parallel_ops/replicate.h" @@ -119,6 +120,8 @@ tl::optional get_op_parameters(Op const *op) { return ((Combine *)op)->get_params(); case OP_ALLREDUCE: return ((AllReduce *)op)->get_params(); + case OP_PARALLEL_IDENTITY: + return ((ParallelIdentity *)op)->get_params(); case OP_FUSED_PARALLEL: return ((FusedParallelOp *)op)->get_params(); case OP_TRANSPOSE: diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index bada87ab19..31a32dd3c8 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -14,6 +14,8 @@ */ #include "flexflow/request_manager.h" +#include "flexflow/ops/fused.h" +#include "flexflow/ops/lora_linear.h" #include "flexflow/parallel_ops/parallel_op.h" // #include "flexflow/tokenizers.h" #include @@ -21,6 +23,7 @@ #include #include #include +#include #include #include @@ -28,12 +31,16 @@ namespace FlexFlow { using namespace Legion; using tokenizers::Tokenizer; +using json = nlohmann::json; Legion::Logger log_req_mgr("RequestManager"); std::string LoadBytesFromFile(std::string const &path) { std::ifstream fs(path, std::ios::in | std::ios::binary); - assert(!fs.fail() && "no such file"); + if (fs.fail()) { + std::cerr << "Failed to open file: " << path << std::endl; + assert(false); + } std::string data; fs.seekg(0, std::ios::end); size_t size = static_cast(fs.tellg()); @@ -43,6 +50,52 @@ std::string LoadBytesFromFile(std::string const &path) { return data; } +std::ostream &operator<<(std::ostream &os, Request const &req) { + os << "Request {\n"; + os << " guid: " << req.guid << "\n"; + os << " peft_model_id: " << req.peft_model_id << "\n"; + os << " max_sequence_length: " << req.max_sequence_length << "\n"; + os << " initial_len: " << req.initial_len << "\n"; + os << " ssm_cache_size: " << req.ssm_cache_size << "\n"; + os << " llm_cache_size: " << req.llm_cache_size << "\n"; + os << " status: " << static_cast(req.status) << "\n"; + os << " tokens: ["; + for (auto const &token : req.tokens) { + os << token << " "; + } + os << "]\n"; + os << " prompt: " << req.prompt << "\n"; + // os << " beam_trees: ["; + // for (const auto& tree : req.beam_trees) { + // // Assuming BeamTree has its own << operator defined + // os << tree << " "; + // } + // os << "]\n"; + os << " req_type: " << static_cast(req.req_type) << "\n"; + os << " completed_training_steps: " << req.completed_training_steps << "\n"; + os << " gradient_accumulation_steps: " << req.gradient_accumulation_steps + << "\n"; + os << " max_training_steps: " << req.max_training_steps << "\n"; + os << " 
dataset_filepath: " << req.dataset_filepath << "\n"; + os << " dataset: ["; + for (auto const &pair : req.dataset) { + os << "["; + for (auto const &token : pair.first) { + os << token << " "; + } + os << "], ["; + for (auto const &token : pair.second) { + os << token << " "; + } + os << "] "; + } + os << "]\n"; + os << "}\n"; + return os; +} + +bool RequestManager::inference_finished = false; + RequestManager::RequestManager() : request_manager_status(INITIALIZED), verbose(false), next_available_guid(1000000), num_processed_requests(0), @@ -114,6 +167,14 @@ void RequestManager::push_spec_infer_tree_width(int tree_width) { spec_infer_tree_width.emplace_back(tree_width); } +void RequestManager::set_enable_peft_finetuning(bool enable_peft_finetuning_) { + enable_peft_finetuning = enable_peft_finetuning_; +} + +void RequestManager::set_inference_finished(bool finished) { + inference_finished = finished; +} + void RequestManager::register_tokenizer(ModelType type, int bos_token_id, int eos_token_id, @@ -121,33 +182,45 @@ void RequestManager::register_tokenizer(ModelType type, this->model_type = type; this->bos_token_id = bos_token_id; this->eos_token_id = eos_token_id; - std::string tokenizer_folder = - (!path.empty() && path.back() != '/') ? path + '/' : path; + std::filesystem::path tokenizer_folder(path); + if (model_type == ModelType::LLAMA) { - bool path_to_file = !path.empty() && - (path.size() >= strlen("tokenizer.model")) && - path.find("tokenizer.model") == - (path.size() - strlen("tokenizer.model")); - std::string tokenizer_filepath = - path_to_file ? path : tokenizer_folder + "tokenizer.model"; - this->tokenizer_ = - Tokenizer::FromBlobSentencePiece(LoadBytesFromFile(tokenizer_filepath)); + std::filesystem::path tokenizer_model_path; + if (std::filesystem::is_directory(tokenizer_folder)) { + tokenizer_model_path = + std::filesystem::path(tokenizer_folder) / "tokenizer.model"; + } else { + tokenizer_model_path = tokenizer_folder; + } + if (std::filesystem::exists(tokenizer_model_path)) { + // load from tokenizer.model + this->tokenizer_ = Tokenizer::FromBlobSentencePiece( + LoadBytesFromFile(tokenizer_model_path.string())); + } else { + // load from tokenizer.json + std::filesystem::path tokenizer_json_path = + tokenizer_folder / "tokenizer.json"; + if (!std::filesystem::exists(tokenizer_json_path)) { + std::cerr << "Failed to open file: " << tokenizer_json_path + << std::endl; + assert(false); + } + this->tokenizer_ = Tokenizer::FromBlobJSON( + LoadBytesFromFile(tokenizer_json_path.string())); + } } else if (model_type == ModelType::OPT) { - std::string vocab_file = tokenizer_folder + "vocab.json"; - std::string merges_file = tokenizer_folder + "merges.txt"; - std::string added_tokens_file = - tokenizer_folder + "special_tokens_map.json"; - std::filesystem::path path1(vocab_file); - std::filesystem::path path2(merges_file); - std::filesystem::path path3(added_tokens_file); - assert(std::filesystem::exists(path1) && + std::filesystem::path vocab_file = tokenizer_folder / "vocab.json"; + std::filesystem::path merges_file = tokenizer_folder / "merges.txt"; + std::filesystem::path added_tokens_file = + tokenizer_folder / "special_tokens_map.json"; + assert(std::filesystem::exists(vocab_file) && "Vocab file vocab.json does not exist at the specified path"); - assert(std::filesystem::exists(path2) && + assert(std::filesystem::exists(merges_file) && "Merge file merges.txt does not exist at the specified path"); // opt_tokenizer = new OptTokenizer(vocab_file, merges_file); - std::string 
vocab = LoadBytesFromFile(path1.string()); - std::string merges = LoadBytesFromFile(path2.string()); - std::string added_tokens = LoadBytesFromFile(path3.string()); + std::string vocab = LoadBytesFromFile(vocab_file.string()); + std::string merges = LoadBytesFromFile(merges_file.string()); + std::string added_tokens = LoadBytesFromFile(added_tokens_file.string()); this->tokenizer_ = Tokenizer::FromBlobByteLevelBPE(vocab, merges, added_tokens); @@ -182,28 +255,40 @@ size_t RequestManager::get_num_ssms() { } RequestManager::RequestGuid - RequestManager::register_new_request(std::vector const &prompt, - int max_sequence_length) { + RequestManager::register_new_request(Request const &request_) { const std::lock_guard lock(request_queue_mutex); - // Add a new request Request request; request.status = Request::PENDING; request.guid = next_available_guid++; - request.max_sequence_length = max_sequence_length; - - if (prompt.size() >= get_max_sequence_length()) { - std::cout << "Warning: too many tokens in prompt, only load up to " - << get_max_sequence_length() << " tokens, but got " - << prompt.size() << ".\n"; - - printf("tokens size: %zu\n", request.tokens.size()); - return INVALID_GUID; + request.max_sequence_length = request_.max_sequence_length; + request.peft_model_id = request_.peft_model_id; + request.warmup = request_.warmup; + if (bos_token_id >= 0 && model_type != ModelType::FALCON) { + request.tokens.push_back(bos_token_id); + } + if (request_.benchmarking_tokens >= 0) { + assert(request_.benchmarking_tokens < get_max_sequence_length()); + request.benchmarking_tokens = request_.benchmarking_tokens; + request.tokens.insert(request.tokens.end(), + request_.benchmarking_tokens, + 15); // insert random number } else { - request.initial_len = prompt.size(); - request.tokens = prompt; + std::vector tokens = this->tokenizer_->Encode(request_.prompt); + if (tokens.size() >= get_max_sequence_length()) { + std::cout << "Warning: too many tokens in prompt, only load up to " + << get_max_sequence_length() << " tokens, but got " + << tokens.size() << ".\n"; + return INVALID_GUID; + } + for (int i = 0; i < tokens.size(); i++) { + std::cout << "[" << i << "]" << tokens.at(i) << "\n"; + } + request.tokens.insert(request.tokens.end(), tokens.begin(), tokens.end()); } + request.initial_len = request.tokens.size(); + if (get_num_ssms() == 0) { std::cout << "No small speculative model registered, using incremental " "decoding." 
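
For reference, a minimal caller-side sketch of the new Request-based registration path shown in the hunk above. It relies only on the fields the new code reads (prompt, max_sequence_length, benchmarking_tokens) plus RequestManager::get_request_manager() and INVALID_GUID, all of which appear in this diff; the prompt text, the numeric values, and the assumption that unset Request fields keep plain-inference defaults are illustrative, not taken from the patch.

    // Hedged sketch: exercising RequestManager::register_new_request(Request const &).
    // Assumes a default-constructed Request behaves as an ordinary inference request.
    RequestManager *rm = RequestManager::get_request_manager();

    Request req;
    req.prompt = "What is the capital of France?"; // encoded internally via tokenizer_->Encode()
    req.max_sequence_length = 128;                 // decoding stops once request.tokens reaches this length
    req.benchmarking_tokens = -1;                  // >= 0 would insert dummy tokens instead of encoding the prompt

    RequestManager::RequestGuid guid = rm->register_new_request(req);
    if (guid == RequestManager::INVALID_GUID) {
      // Rejected: the encoded prompt exceeded the configured maximum sequence length.
    }
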
@@ -216,58 +301,111 @@ RequestManager::RequestGuid } } - pending_request_queue.push(request); + pending_infr_request_queue.push(request); all_requests[request.guid] = request; { const std::lock_guard lock(request_to_promise_mutex); request_to_promise[request.guid] = new std::promise(); } - if (verbose) { - std::cout << "new req: " << request.tokens.size() << std::endl; + { + std::string output = "New request tokens:"; + output = "[" + std::to_string(request.guid) + "]" + output; for (int i = 0; i < request.tokens.size(); i++) { - std::cout << i << " : " << request.tokens[i] << std::endl; + output = output + " " + std::to_string(request.tokens[i]); } + log_req_mgr.print("%s", output.c_str()); } GenerationResult gr; gr.guid = request.guid; - gr.input_text = ""; - gr.input_tokens = prompt; - gr.output_text = ""; - gr.output_tokens = prompt; + gr.input_text = request_.prompt; + gr.input_tokens = request.tokens; + gr.output_text = request_.prompt; + gr.output_tokens = request.tokens; request_generation_results[request.guid] = gr; + ProfileInfo profile_info; + profile_info.registration_time = Realm::Clock::current_time_in_microseconds(); + profiling_requests[request.guid] = profile_info; + return request.guid; } RequestManager::RequestGuid - RequestManager::register_new_request(std::string const &prompt, - int max_sequence_length) { + RequestManager::register_new_peft_request(Request const &request_) { + assert(enable_peft_finetuning && "PEFT finetuning is not enabled"); const std::lock_guard lock(request_queue_mutex); // Add a new request Request request; request.status = Request::PENDING; request.guid = next_available_guid++; - request.max_sequence_length = max_sequence_length; - if (bos_token_id >= 0 && model_type != ModelType::FALCON) { - request.tokens.push_back(bos_token_id); + request.initial_len = 0; + request.max_sequence_length = request_.max_sequence_length; + request.peft_model_id = request_.peft_model_id; + request.req_type = RequestType::REQ_FINETUNING; + request.completed_training_steps = 0; + request.gradient_accumulation_steps = request_.gradient_accumulation_steps; + request.max_training_steps = request_.max_training_steps; + request.dataset_filepath = request_.dataset_filepath; + request.warmup = request_.warmup; + + // Load dataset + if (request_.benchmarking_tokens >= 0) { + assert(request_.benchmarking_tokens <= get_max_sequence_length()); + request.benchmarking_tokens = request_.benchmarking_tokens; + std::vector input_tokens; + std::vector output_tokens; + bool bos_added = (bos_token_id >= 0 && model_type != ModelType::FALCON); + if (bos_added) { + input_tokens.push_back(bos_token_id); + } + input_tokens.insert(input_tokens.end(), + request_.benchmarking_tokens - (int)bos_added, + 15); // insert random number + request.dataset.push_back(std::make_pair(input_tokens, output_tokens)); + } else { + using json = nlohmann::json; + std::ifstream file_handle(request.dataset_filepath); + assert(file_handle.good() && "Dataset file does not exist."); + json dataset_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + + for (auto &prompt : dataset_json) { + std::string text = prompt.get(); + std::string output_text(""); + std::vector input_tokens; + input_tokens = this->tokenizer_->Encode(text); + if (bos_token_id >= 0 && model_type != ModelType::FALCON) { + input_tokens.insert(input_tokens.begin(), bos_token_id); + } + std::vector output_tokens = + this->tokenizer_->Encode(output_text); + if 
(input_tokens.size() + output_tokens.size() > + get_max_sequence_length()) { + std::cout << "Warning: too many tokens in sample, only load up to " + << get_max_sequence_length() << " tokens, but got " + << input_tokens.size() + output_tokens.size() << ".\n"; + return INVALID_GUID; + } else { + request.dataset.push_back(std::make_pair(input_tokens, output_tokens)); + } + } } - std::vector tokens = this->tokenizer_->Encode(prompt); - if (tokens.size() >= get_max_sequence_length()) { - std::cout << "Warning: too many tokens in prompt, only load up to " - << get_max_sequence_length() << " tokens, but got " - << tokens.size() << ".\n"; - printf("tokens size: %zu\n", tokens.size()); - return INVALID_GUID; + if (request.gradient_accumulation_steps == -1) { + request.gradient_accumulation_steps = request.dataset.size(); } - for (int i = 0; i < tokens.size(); i++) { - std::cout << "[" << i << "]" << tokens.at(i) << "\n"; - } - request.tokens.insert(request.tokens.end(), tokens.begin(), tokens.end()); - request.initial_len = request.tokens.size(); + assert(request.gradient_accumulation_steps > 0 && + "Invalid gradient accumulation steps"); + assert(request.gradient_accumulation_steps <= request.max_training_steps && + "Gradient accumulation steps should be less than or equal to max " + "training steps"); + // Currently don't support speculative inference for PEFT + assert(get_num_ssms() == 0); if (get_num_ssms() == 0) { std::cout << "No small speculative model registered, using incremental " "decoding." @@ -280,29 +418,38 @@ RequestManager::RequestGuid } } - pending_request_queue.push(request); + pending_peft_request_queue.push(request); all_requests[request.guid] = request; { const std::lock_guard lock(request_to_promise_mutex); request_to_promise[request.guid] = new std::promise(); } - { - std::string output = "New request tokens:"; - output = "[" + std::to_string(request.guid) + "]" + output; - for (int i = 0; i < request.tokens.size(); i++) { - output = output + " " + std::to_string(request.tokens[i]); + for (size_t r = 0; r < request.dataset.size(); r++) { + std::string input = "[" + std::to_string(r) + "] input:"; + std::string output = "[" + std::to_string(r) + "] output:"; + for (size_t i = 0; i < request.dataset[r].first.size(); i++) { + input = input + " " + std::to_string(request.dataset[r].first[i]); } + for (size_t i = 0; i < request.dataset[r].second.size(); i++) { + output = output + " " + std::to_string(request.dataset[r].second[i]); + } + log_req_mgr.print("%s", input.c_str()); log_req_mgr.print("%s", output.c_str()); } GenerationResult gr; gr.guid = request.guid; - gr.input_text = prompt; - gr.input_tokens = request.tokens; - gr.output_text = prompt; - gr.output_tokens = request.tokens; + // gr.input_text = prompt; + // gr.input_tokens = request.tokens; + // gr.output_text = prompt; + // gr.output_tokens = request.tokens; request_generation_results[request.guid] = gr; + + ProfileInfo profile_info; + profile_info.registration_time = Realm::Clock::current_time_in_microseconds(); + profiling_requests[request.guid] = profile_info; + return request.guid; } @@ -363,51 +510,117 @@ BatchConfig RequestManager::prepare_next_batch_task( return rm->prepare_next_batch(*bc, result); } +bool RequestManager::check_inf_req_completion(BatchConfig const &old_bc, + int i) { + Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; + bool request_completed = false; + // printf("model_type = %d\n", this->model_type); + if (request.tokens.size() >= 
old_bc.requestsInfo[i].max_sequence_length) { + request_completed = true; + } else if (request.tokens.back() == eos_token_id) { + // Encounter EOS token id + request_completed = true; + } + return request_completed; +} + +void RequestManager::check_batch(BatchConfig const &old_bc, + BatchConfig const &new_bc) { + int num_incomplete_prompts = 0; + for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { + if (new_bc.request_completed[i]) { + continue; + } + // ensure there is no request with zero tokens + assert(new_bc.requestsInfo[i].num_tokens_in_batch > 0); + // ensure there is no more than one incomplete prompt + if (new_bc.requestsInfo[i].prompt_phase && + new_bc.requestsInfo[i].num_tokens_in_batch + + new_bc.requestsInfo[i].first_token_depth_in_request < + all_requests[new_bc.requestsInfo[i].request_guid].tokens.size()) { + num_incomplete_prompts++; + } + } + if (num_incomplete_prompts > 1) { + std::cout << "Error: more than one incomplete prompt in the batch\n"; + pid_t pid = getpid(); + std::string filenamen = "new_bc_" + std::to_string(pid) + ".txt"; + std::ofstream filen(filenamen); + if (filen.is_open()) { + filen << new_bc << std::endl; + filen.close(); + std::cout << "String written to file: " << filenamen << std::endl; + } else { + std::cout << "Unable to open file: " << filenamen << std::endl; + } + std::string filenameo = "old_bc_" + std::to_string(pid) + ".txt"; + std::ofstream fileo(filenameo); + if (fileo.is_open()) { + fileo << old_bc << std::endl; + fileo.close(); + std::cout << "String written to file: " << filenameo << std::endl; + } else { + std::cout << "Unable to open file: " << filenameo << std::endl; + } + assert(false); + } +} + BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, InferenceResult const &result) { const std::lock_guard lock(request_queue_mutex); - // Step 1: append result from previous iteration to request's tokens - for (int i = 0; i < old_bc.num_tokens; i++) { + for (int i = 0; i < old_bc.num_active_tokens(); i++) { size_t guid = old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid; Request &request = all_requests[guid]; + if (request.req_type == RequestType::REQ_FINETUNING) { + continue; + } if (old_bc.tokensInfo[i].abs_depth_in_request + 1 < request.tokens.size()) { // This is a prompt token continue; } else { + // This is a decoding token assert(old_bc.tokensInfo[i].abs_depth_in_request + 1 == request.tokens.size()); - // This is a decoding token + if (!profiling_requests[guid].first_token_time_set) { + profiling_requests[guid].first_token_time = + Realm::Clock::current_time_in_microseconds(); + profiling_requests[guid].first_token_time_set = true; + } log_req_mgr.print("Output token is: %d", result.token_ids[i]); request.tokens.push_back(result.token_ids[i]); // std::string output = this->tokenizer_->Decode(request.tokens); // log_req_mgr.print("Output: %s", output.c_str()); } } + int num_generation_tokens = 0; int num_active_req = -1; - // Step 2: prepare the next batch for existing requests + // when finetuning is enabled, the last entry in the batch cannot be used for + // inference + int inference_batch_size = + BatchConfig::max_requests_per_batch() - (int)enable_peft_finetuning; + + // Step 2: prepare the next batch for existing inference requests BatchConfig new_bc; - for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { - if (old_bc.request_completed[i]) { // add new requests to the next batch + for (int i = 0; i < inference_batch_size; i++) { + if 
(old_bc.request_completed[i]) { + // no need to carry over tokens to new batch for this request continue; } else { assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; + assert(request.req_type == RequestType::REQ_INFERENCE && + "Found misplaced finetuning request"); + int processed_tokens = old_bc.requestsInfo[i].first_token_depth_in_request + old_bc.requestsInfo[i].num_tokens_in_batch; assert(processed_tokens < request.tokens.size()); - bool request_completed = false; - // printf("model_type = %d\n", this->model_type); - if (request.tokens.size() >= old_bc.requestsInfo[i].max_sequence_length) { - request_completed = true; - } else if (request.tokens.back() == eos_token_id) { - // Encounter EOS token id - request_completed = true; - } + bool request_completed = check_inf_req_completion(old_bc, i); if (request_completed) { std::string output = this->tokenizer_->Decode(request.tokens); // Unlike Huggingface, the sentencepiece C++ library automatically @@ -435,32 +648,40 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, total_request_run_time += profile_info.finish_time - profile_info.start_time; profiling_requests[request.guid] = profile_info; - log_req_mgr.print( - "[Profile] guid(%zu) llm_decoding_steps(%d) start(%.1lf) " - "finish(%.1lf) latency(%.1lf)", - request.guid, - profile_info.llm_decoding_steps, - profile_info.start_time, - profile_info.finish_time, - profile_info.finish_time - profile_info.start_time); + log_req_mgr.print("[%s] guid(%zu) llm_decoding_steps(%d) start(%.1lf) " + "finish(%.1lf) latency(%.1lf) ttft(%.1lf)", + request.warmup ? "Warmup" : "Profile", + request.guid, + profile_info.llm_decoding_steps, + profile_info.start_time, + profile_info.finish_time, + profile_info.finish_time - profile_info.start_time, + profile_info.first_token_time - + profile_info.registration_time); // Write output to file if needed: if (!output_filepath.empty()) { std::ofstream outputFile(output_filepath, std::ios::app); if (outputFile.is_open()) { - outputFile << "end-to-end latency: " << std::fixed - << std::setprecision(3) << total_request_run_time - << std::endl; - outputFile << "num decoding steps: " - << profile_info.llm_decoding_steps << std::endl; - outputFile << "token IDs: "; - for (int i = 0; i < request.tokens.size(); i++) { - outputFile << request.tokens[i]; - if (i < request.tokens.size() - 1) { - outputFile << ","; + outputFile << "[" << (request.warmup ? 
"Warmup" : "Profile") + << "] guid(" << request.guid << ") llm_decoding_steps(" + << profile_info.llm_decoding_steps << ") latency(" + << std::fixed << std::setprecision(3) + << (profile_info.finish_time - profile_info.start_time) + << ") ttft(" << std::fixed << std::setprecision(3) + << (profile_info.first_token_time - + profile_info.registration_time) + << ")\n"; + if (request.benchmarking_tokens <= 0) { + outputFile << "token IDs: "; + for (int i = 0; i < request.tokens.size(); i++) { + outputFile << request.tokens[i]; + if (i < request.tokens.size() - 1) { + outputFile << ","; + } } + outputFile << std::endl; + outputFile << output; } - outputFile << std::endl; - outputFile << output; outputFile.close(); } else { std::cout << "Unable to open the output file: " << output_filepath @@ -468,13 +689,15 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, assert(false); } } - } else { new_bc.request_completed[i] = false; new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens; new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; + new_bc.requestsInfo[i].peft_model_id = + old_bc.requestsInfo[i].peft_model_id; + new_bc.requestsInfo[i].peft_bwd = old_bc.requestsInfo[i].peft_bwd; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; num_active_req++; @@ -487,8 +710,25 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].prompt_phase = false; } else { // Prompt phase + assert(old_bc.requestsInfo[i].prompt_phase == true); + int space_for_incr_dec_requests = 0; + // If the prompt can't fit in the batch, compute how much space we + // need to leave out for incomplete requests in decoding phase at + // higher indices. 
+ for (int ii = i + 1; ii < inference_batch_size; ii++) { + if (old_bc.request_completed[ii]) { + continue; + } + Request &old_request = + all_requests[old_bc.requestsInfo[ii].request_guid]; + bool req_completed = check_inf_req_completion(old_bc, ii); + if (!req_completed) { + space_for_incr_dec_requests++; + } + } new_bc.requestsInfo[i].num_tokens_in_batch = - std::min(get_max_tokens_per_batch() - new_bc.num_tokens, + std::min(get_max_tokens_per_batch() - new_bc.num_tokens - + space_for_incr_dec_requests, (int)request.tokens.size() - new_bc.requestsInfo[i].first_token_depth_in_request); new_bc.requestsInfo[i].prompt_phase = true; @@ -509,13 +749,14 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } new_bc.num_generation_tokens = num_generation_tokens; - // Step 3: add new requests to the next batch - for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { + // Step 3: add new inference requests to the next batch if there is space + for (int i = 0; i < inference_batch_size; i++) { if (new_bc.request_completed[i]) { - if (!pending_request_queue.empty() && + if (!pending_infr_request_queue.empty() && new_bc.num_tokens < get_max_tokens_per_batch()) { - Request new_request = pending_request_queue.front(); - pending_request_queue.pop(); + Request new_request = pending_infr_request_queue.front(); + assert(new_request.req_type == RequestType::REQ_INFERENCE); + pending_infr_request_queue.pop(); // all_requests[new_request.guid] = new_request; new_bc.requestsInfo[i].first_token_depth_in_request = 0; @@ -526,15 +767,16 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, (int)new_request.tokens.size()); new_bc.requestsInfo[i].max_sequence_length = new_request.max_sequence_length; + new_bc.requestsInfo[i].peft_model_id = new_request.peft_model_id; + new_bc.requestsInfo[i].peft_bwd = false; new_bc.request_completed[i] = false; new_bc.requestsInfo[i].prompt_phase = true; num_active_req++; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; - // add profile_info for the new request - ProfileInfo profile_info; - profile_info.llm_decoding_steps = 1; - profile_info.start_time = Realm::Clock::current_time_in_microseconds(); - profiling_requests[new_request.guid] = profile_info; + // add start time to profile_info for the new request + profiling_requests[new_request.guid].llm_decoding_steps = 1; + profiling_requests[new_request.guid].start_time = + Realm::Clock::current_time_in_microseconds(); for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; new_bc.tokensInfo[new_bc.num_tokens].request_index = i; @@ -551,6 +793,170 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } } + if (enable_peft_finetuning && + !old_bc.request_completed[inference_batch_size]) { + assert(old_bc.requestsInfo[inference_batch_size].num_tokens_in_batch > 0); + Request &request = + all_requests[old_bc.requestsInfo[inference_batch_size].request_guid]; + assert(request.req_type == RequestType::REQ_FINETUNING && + "Found misplaced inference request"); + + request.finetuning_losses.push_back(result.finetuning_loss); + + request.dataset_entry_processed_tokens += + old_bc.requestsInfo[inference_batch_size].num_tokens_in_batch; + request.processed_finetuning_tokens += + old_bc.requestsInfo[inference_batch_size].num_tokens_in_batch; + request.finetuning_tokens_per_batch.push_back( + old_bc.requestsInfo[inference_batch_size].num_tokens_in_batch); + int 
dataset_entry = + request.completed_training_steps % request.dataset.size(); + if (old_bc.requestsInfo[inference_batch_size].first_token_depth_in_request + + old_bc.requestsInfo[inference_batch_size].num_tokens_in_batch == + request.dataset[dataset_entry].first.size()) { + // completed the current dataset entry + assert(request.dataset_entry_processed_tokens == + request.dataset[dataset_entry].first.size()); + request.completed_training_steps += 1; + request.dataset_entry_processed_tokens = 0; + } + + assert(request.completed_training_steps <= request.max_training_steps); + if (request.completed_training_steps == request.max_training_steps || + inference_finished) { + // check if the fine tuning request has completed + request.status = Request::COMPLETED; + + GenerationResult &gr = request_generation_results[request.guid]; + assert(gr.guid == request.guid); + gr.finetuning_losses = request.finetuning_losses; + trigger_request_completion_future(request.guid); + num_processed_requests++; + + ProfileInfo profile_info = profiling_requests[request.guid]; + profile_info.finish_time = Realm::Clock::current_time_in_microseconds(); + total_request_run_time += + profile_info.finish_time - profile_info.start_time; + profiling_requests[request.guid] = profile_info; + log_req_mgr.print("[%s] guid(%zu) completed_training_steps(%d) " + "processed_finetuning_tokens(%lu) latency(%.1lf)", + request.warmup ? "Warmup" : "Finetuning", + request.guid, + request.completed_training_steps, + request.processed_finetuning_tokens, + profile_info.finish_time - profile_info.start_time); + if (!output_filepath.empty()) { + std::ofstream outputFile(output_filepath, std::ios::app); + if (outputFile.is_open()) { + std::string tokens_str = "["; + for (size_t i = 0; i < request.finetuning_tokens_per_batch.size(); + i++) { + tokens_str += + std::to_string(request.finetuning_tokens_per_batch[i]); + if (i != request.finetuning_tokens_per_batch.size() - 1) { + tokens_str += ", "; + } + } + tokens_str += "]"; + outputFile << "[" << (request.warmup ? 
"Warmup" : "Finetuning") + << "] guid(" << request.guid + << ") completed_training_steps(" + << request.completed_training_steps + << ") processed_finetuning_tokens(" + << request.processed_finetuning_tokens << ") latency(" + << std::fixed << std::setprecision(3) + << (profile_info.finish_time - profile_info.start_time) + << ") tokens_per_batch(" << tokens_str << ")\n"; + outputFile.close(); + } else { + std::cout << "Unable to open the output file: " << output_filepath + << std::endl; + assert(false); + } + } + } + } + + // Step 4: add PEFT bwd requests, if there is additional space + while (pending_peft_request_queue.size() > 0) { + Request &request = pending_peft_request_queue.front(); + // assert(request.req_type = RequestType::REQ_FINETUNING); + Request &all_req_handle = all_requests[request.guid]; + // assert(all_req_handle.req_type = RequestType::REQ_FINETUNING); + if (all_req_handle.status == Request::COMPLETED) { + pending_peft_request_queue.pop(); + } else { + break; + } + } + + if (pending_peft_request_queue.size() > 0 && !inference_finished) { + Request &request = pending_peft_request_queue.front(); + assert(request.req_type = RequestType::REQ_FINETUNING); + assert(request.dataset.size() > 0); + // update status and training steps + Request &all_req_handle = all_requests[request.guid]; + assert(all_req_handle.req_type = RequestType::REQ_FINETUNING); + + request.completed_training_steps = all_req_handle.completed_training_steps; + request.processed_finetuning_tokens = + all_req_handle.processed_finetuning_tokens; + request.status = all_req_handle.status; + int dataset_entry = + request.completed_training_steps % request.dataset.size(); + request.dataset_entry_processed_tokens = + all_req_handle.dataset_entry_processed_tokens; + request.gradient_accumulation_steps = + all_req_handle.gradient_accumulation_steps; + + assert(request.status != Request::COMPLETED); + assert(request.max_training_steps > 0 && + request.completed_training_steps < request.max_training_steps); + assert(request.dataset_entry_processed_tokens <= + request.dataset[dataset_entry].first.size()); + + int num_peft_tokens = + min((int)request.dataset[dataset_entry].first.size() - + request.dataset_entry_processed_tokens, + get_max_tokens_per_batch() - new_bc.num_active_infr_tokens()); + int num_peft_label_tokens = request.dataset[dataset_entry].second.size(); + assert(num_peft_label_tokens == 0); + + if (num_peft_tokens > 0) { + assert(new_bc.request_completed[inference_batch_size]); + // request info + new_bc.request_completed[inference_batch_size] = false; + new_bc.requestsInfo[inference_batch_size].first_token_depth_in_request = + request.dataset_entry_processed_tokens; + new_bc.requestsInfo[inference_batch_size].first_token_offset_in_batch = + new_bc.num_active_infr_tokens(); + new_bc.requestsInfo[inference_batch_size].num_tokens_in_batch = + num_peft_tokens; + new_bc.requestsInfo[inference_batch_size].max_sequence_length = + request.max_sequence_length; + new_bc.requestsInfo[inference_batch_size].request_guid = request.guid; + new_bc.requestsInfo[inference_batch_size].peft_model_id = + request.peft_model_id; + new_bc.requestsInfo[inference_batch_size].peft_bwd = true; + set_optimizer_tasks( + new_bc.requestsInfo[inference_batch_size].optimizer_tasks, + request.max_training_steps, + request.completed_training_steps, + request.gradient_accumulation_steps); + // tokens info + for (size_t i = request.dataset_entry_processed_tokens; + i < request.dataset_entry_processed_tokens + num_peft_tokens; + i++) { + 
new_bc.tokensInfo[new_bc.num_tokens].token_id = + request.dataset[dataset_entry].first[i]; + new_bc.tokensInfo[new_bc.num_tokens].request_index = + inference_batch_size; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = i; + new_bc.num_tokens++; + new_bc.num_peft_tokens++; + } + } + } return new_bc; } @@ -722,11 +1128,17 @@ BeamSearchBatchConfig if (!output_filepath.empty()) { std::ofstream outputFile(output_filepath, std::ios::app); if (outputFile.is_open()) { - outputFile << "end-to-end latency: " << std::fixed - << std::setprecision(3) << total_request_run_time - << std::endl; - outputFile << "num decoding steps: " - << profile_info.llm_decoding_steps << std::endl; + outputFile << "[Profile] guid(" << request.guid + << ") llm_decoding_steps(" + << profile_info.llm_decoding_steps << ") latency(" + << std::fixed << std::setprecision(3) + << (profile_info.finish_time - profile_info.start_time) + << ")\n"; + // outputFile << "end-to-end latency: " << std::fixed + // << std::setprecision(3) << total_request_run_time + // << std::endl; + // outputFile << "num decoding steps: " + // << profile_info.llm_decoding_steps << std::endl; outputFile << "token IDs: "; for (int i = 0; i < request.tokens.size(); i++) { outputFile << request.tokens[i]; @@ -736,7 +1148,6 @@ BeamSearchBatchConfig } outputFile << std::endl; outputFile << output; - outputFile.close(); } else { std::cout << "Unable to open the output file: " << output_filepath @@ -884,10 +1295,10 @@ BeamSearchBatchConfig // Step 2: Initialize new request for (int i = 0; i < BeamSearchBatchConfig::max_requests_per_batch(); i++) { if (new_bc.request_completed[i]) { - if (!pending_request_queue.empty() && + if (!pending_infr_request_queue.empty() && new_bc.num_tokens < get_max_tokens_per_batch()) { - Request new_request = pending_request_queue.front(); - pending_request_queue.pop(); + Request new_request = pending_infr_request_queue.front(); + pending_infr_request_queue.pop(); // all_requests[new_request.guid] = new_request; num_active_req++; new_bc.requestsInfo[i].first_token_depth_in_request = 0; @@ -901,13 +1312,13 @@ BeamSearchBatchConfig new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // add profile_info for the new request - ProfileInfo profile_info; - profile_info.llm_decoding_steps = 0; - profile_info.ssm_decoding_steps = 0; - profile_info.start_time = Realm::Clock::current_time_in_microseconds(); - profiling_requests[new_request.guid] = profile_info; + profiling_requests[new_request.guid].llm_decoding_steps = 0; + profiling_requests[new_request.guid].ssm_decoding_steps = 0; + profiling_requests[new_request.guid].start_time = + Realm::Clock::current_time_in_microseconds(); // init the beam search metadata per request - int ssm_decoding_steps = profile_info.ssm_decoding_steps; + int ssm_decoding_steps = + profiling_requests[new_request.guid].ssm_decoding_steps; new_bc.beamRequestsInfo[i].beam_size = spec_infer_tree_width.size() > ssm_decoding_steps @@ -1552,7 +1963,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.requestsInfo[num_active_req].batch_config_request_id = i; new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].num_tokens_in_batch = std::min(max_prompt_load_size, (int)request.initial_len - @@ -2105,7 +2515,7 @@ std::vector> // must in this branch. 
int layer_slot = i - processed_whole_layer_tokens; int layer_slot_total = treeLayers[layer_num]; - if ((first_layer_slot == layer_slot)) { + if (first_layer_slot == layer_slot) { verifiedTree.push_back(output); new_committed_tokens.push_back(std::make_pair( input.second, committed_tokens.at(guid).at(i).second)); @@ -2297,19 +2707,34 @@ std::vector> } std::vector - FFModel::generate(std::vector &prompts, int max_seq_length) { + FFModel::generate(std::vector const &requests) { RequestManager *rm = RequestManager::get_request_manager(); - std::vector guids; - for (int i = 0; i < prompts.size(); i++) { - RequestManager::RequestGuid guid = - rm->register_new_request(prompts.at(i), max_seq_length); - if (guid != RequestManager::INVALID_GUID) { - guids.push_back(guid); + // reset inference_finished flag + rm->set_inference_finished(false); + std::vector inf_guids, peft_guids; + for (int i = 0; i < requests.size(); i++) { + RequestManager::RequestGuid guid; + if (requests.at(i).req_type == RequestType::REQ_INFERENCE) { + guid = rm->register_new_request(requests.at(i)); + if (guid != RequestManager::INVALID_GUID) { + inf_guids.push_back(guid); + } + } else { + guid = rm->register_new_peft_request(requests.at(i)); + if (guid != RequestManager::INVALID_GUID) { + peft_guids.push_back(guid); + } } } std::vector results; - for (int i = 0; i < guids.size(); i++) { - results.push_back(rm->get_generation_result(guids[i])); + for (int i = 0; i < inf_guids.size(); i++) { + results.push_back(rm->get_generation_result(inf_guids[i])); + } + if (inf_guids.size() > 0) { + rm->set_inference_finished(); + } + for (int i = 0; i < peft_guids.size(); i++) { + results.push_back(rm->get_generation_result(peft_guids[i])); } return results; } @@ -2342,6 +2767,18 @@ void RequestManager::background_serving_task( std::vector const ®ions, Context ctx, Runtime *runtime) { + + auto print_timestamped_message = [](std::string const &message) { + auto now = + std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); + std::cout << std::put_time(std::localtime(&now), "%Y-%m-%d %X") << " - " + << message << std::endl; + }; + + // Print at the start of the task + print_timestamped_message( + "###PEFT DEBUGGING### Starting background serving task."); + RequestManager *rm = RequestManager::get_request_manager(); FFModel *llm = *(FFModel **)task->args; { @@ -2358,6 +2795,11 @@ void RequestManager::background_serving_task( ssm->config.lg_ctx = ctx; } } + + // Checkpoint print + print_timestamped_message( + "###PEFT DEBUGGING### Updated models' configuration."); + if (rm->get_num_ssms() == 0) { // No SSMs: perform incremental decoding rm->serve_incr_decoding(llm); @@ -2365,13 +2807,48 @@ void RequestManager::background_serving_task( // Registered SSMs: perform speculative inference rm->serve_spec_infer(llm); } + #ifdef FF_USE_NCCL llm->finish_nccl_comms(); #endif + + // Print at the end of the task + print_timestamped_message( + "###PEFT DEBUGGING### Background serving task completed."); +} + +std::string find_layer_name_from_guid(FFModel *model, LayerID guid) { + for (size_t i = 0; i < model->layers.size(); i++) { + if (model->layers[i]->layer_guid == guid) { + std::string layer_name(model->layers[i]->name); + return layer_name; + } + } + assert(false); + return "invalid_layer_name"; +} + +bool is_peft_operator_type(OperatorType type) { + switch (type) { + case OP_LORA: + return true; + default: + return false; + } } /*static*/ void RequestManager::serve_incr_decoding(FFModel *llm) { + + // Check if the model object exists 
+ if (llm == nullptr) { + std::cout << "###PEFT DEBUGGING### LLM Model object does not exist." + << std::endl; + return; // Early return to prevent further operations on a nullptr + } else { + std::cout << "###PEFT DEBUGGING### LLM Model object exists." << std::endl; + } + Context ctx = llm->config.lg_ctx; Runtime *runtime = llm->config.lg_hlr; // Compile the llm @@ -2419,6 +2896,9 @@ void RequestManager::serve_incr_decoding(FFModel *llm) { BatchConfigFuture bcf = prepare_next_batch(next_batch.first, next_batch.second, ctx, runtime); FutureMap fm = im->inference(llm, 0, bcf); + if (llm->config.enable_peft) { + im->peft_bwd(llm, 0, bcf); + } assert(fm.get_future_map_domain().get_volume() == 1); InferenceResultFuture irf = fm.get_future(0); batch_pipeline.push(std::make_pair(bcf, irf)); diff --git a/src/runtime/request_manager.cpp b/src/runtime/request_manager.cpp index fadbf80d6d..8e5f302466 100644 --- a/src/runtime/request_manager.cpp +++ b/src/runtime/request_manager.cpp @@ -73,74 +73,69 @@ void RequestManager::load_batch_config_task( // copy meta data to workSpace FFHandler handle = *((FFHandler const *)task->local_args); - size_t total_copy_size = 0; - checkCUDA(hipMemcpyAsync(handle.batch_config_metadata, + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->tokens_info, &(batch_config->tokensInfo), sizeof(BatchConfig::tokensInfo), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::tokensInfo); - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->requestsInfo, &(batch_config->requestsInfo), sizeof(BatchConfig::requestsInfo), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::requestsInfo); // load speculative metadata if (batch_config->get_mode() == BEAM_SEARCH_MODE) { BeamSearchBatchConfig const *beam_batch_config = static_cast(batch_config); - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->beamTokenInfo, &(beam_batch_config->beamTokenInfo), sizeof(BeamSearchBatchConfig::beamTokenInfo), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BeamSearchBatchConfig::beamTokenInfo); - - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->beamRequestsInfo, &(beam_batch_config->beamRequestsInfo), sizeof(BeamSearchBatchConfig::beamRequestsInfo), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BeamSearchBatchConfig::beamRequestsInfo); - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->causalMask, &(beam_batch_config->causalMask), sizeof(BatchConfig::causalMask), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::causalMask); + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->request_completed, + &(batch_config->request_completed), + sizeof(BatchConfig::request_completed), + hipMemcpyHostToDevice, + stream)); + } else if (batch_config->get_mode() == TREE_VERIFY_MODE) { TreeVerifyBatchConfig const *tree_batch_config = static_cast(batch_config); - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->causalMask, &(tree_batch_config->causalMask), sizeof(BatchConfig::causalMask), hipMemcpyHostToDevice, stream)); - total_copy_size += 
sizeof(BatchConfig::causalMask); - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->committed_tokens, &(tree_batch_config->committed_tokens), sizeof(TreeVerifyBatchConfig::committed_tokens), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(TreeVerifyBatchConfig::committed_tokens); - } - // add a size check - assert(total_copy_size <= handle.batch_config_metadata_size); + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->request_completed, + &(batch_config->request_completed), + sizeof(BatchConfig::request_completed), + hipMemcpyHostToDevice, + stream)); + } } void RequestManager::load_positions_task( diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu index 8380d6be73..343f1dd6e6 100644 --- a/src/runtime/request_manager.cu +++ b/src/runtime/request_manager.cu @@ -40,8 +40,21 @@ void RequestManager::load_tokens_task( printf("Warning: too many tokens in prompt, only load up to %d tokens\n", BatchConfig::max_tokens_per_batch()); printf("Got: %d tokens\n", batch_config->num_tokens); + + // pid_t pid = getpid(); + // std::string filename = "bc_" + std::to_string(pid) + ".txt"; + // std::ofstream file(filename); + // if (file.is_open()) { + // file << *batch_config << std::endl; + // file.close(); + // std::cout << "String written to file: " << filename << std::endl; + // } else { + // std::cout << "Unable to open file: " << filename << std::endl; + // } + } else if (batch_config->num_tokens > - BatchConfig::max_verify_tokens_per_batch()) { + BatchConfig::max_verify_tokens_per_batch() && + batch_config->get_mode() != INC_DECODING_MODE) { printf("Warning: Speculative decoding. too many tokens in prompt, only " "load up to %d tokens\n", BatchConfig::max_verify_tokens_per_batch()); @@ -80,91 +93,69 @@ void RequestManager::load_batch_config_task( // copy meta data to workSpace FFHandler handle = *((FFHandler const *)task->local_args); - size_t total_copy_size = 0; - checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata, + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->tokens_info, &(batch_config->tokensInfo), sizeof(BatchConfig::tokensInfo), cudaMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::tokensInfo); - checkCUDA(cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->requestsInfo, &(batch_config->requestsInfo), sizeof(BatchConfig::requestsInfo), cudaMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::requestsInfo); // load speculative metadata if (batch_config->get_mode() == BEAM_SEARCH_MODE) { BeamSearchBatchConfig const *beam_batch_config = static_cast(batch_config); - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(beam_batch_config->beamTokenInfo), - sizeof(BeamSearchBatchConfig::beamTokenInfo), - cudaMemcpyHostToDevice, - stream)); - - total_copy_size += sizeof(BeamSearchBatchConfig::beamTokenInfo); - - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(beam_batch_config->beamRequestsInfo), - sizeof(BeamSearchBatchConfig::beamRequestsInfo), - cudaMemcpyHostToDevice, - stream)); - total_copy_size += sizeof(BeamSearchBatchConfig::beamRequestsInfo); - - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(beam_batch_config->causalMask), - sizeof(BatchConfig::causalMask), - cudaMemcpyHostToDevice, 
- stream)); - total_copy_size += sizeof(BatchConfig::causalMask); - - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(batch_config->request_completed), - sizeof(BatchConfig::request_completed), - cudaMemcpyHostToDevice, - stream)); - - total_copy_size += sizeof(BatchConfig::request_completed); + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->beamTokenInfo, + &(beam_batch_config->beamTokenInfo), + sizeof(BeamSearchBatchConfig::beamTokenInfo), + cudaMemcpyHostToDevice, + stream)); + + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->beamRequestsInfo, + &(beam_batch_config->beamRequestsInfo), + sizeof(BeamSearchBatchConfig::beamRequestsInfo), + cudaMemcpyHostToDevice, + stream)); + + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->causalMask, + &(beam_batch_config->causalMask), + sizeof(BatchConfig::causalMask), + cudaMemcpyHostToDevice, + stream)); + + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->request_completed, + &(batch_config->request_completed), + sizeof(BatchConfig::request_completed), + cudaMemcpyHostToDevice, + stream)); + } else if (batch_config->get_mode() == TREE_VERIFY_MODE) { TreeVerifyBatchConfig const *tree_batch_config = static_cast(batch_config); - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(tree_batch_config->causalMask), - sizeof(BatchConfig::causalMask), - cudaMemcpyHostToDevice, - stream)); - total_copy_size += sizeof(BatchConfig::causalMask); - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(tree_batch_config->committed_tokens), - sizeof(TreeVerifyBatchConfig::committed_tokens), - cudaMemcpyHostToDevice, - stream)); - total_copy_size += sizeof(TreeVerifyBatchConfig::committed_tokens); - - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(batch_config->request_completed), - sizeof(BatchConfig::request_completed), - cudaMemcpyHostToDevice, - stream)); - - total_copy_size += sizeof(BatchConfig::request_completed); + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->causalMask, + &(tree_batch_config->causalMask), + sizeof(BatchConfig::causalMask), + cudaMemcpyHostToDevice, + stream)); + + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->committed_tokens, + &(tree_batch_config->committed_tokens), + sizeof(TreeVerifyBatchConfig::committed_tokens), + cudaMemcpyHostToDevice, + stream)); + + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->request_completed, + &(batch_config->request_completed), + sizeof(BatchConfig::request_completed), + cudaMemcpyHostToDevice, + stream)); } - - // add a size check - assert(total_copy_size <= handle.batch_config_metadata_size); } void RequestManager::load_positions_task( diff --git a/src/runtime/simulator.cpp b/src/runtime/simulator.cpp index 0daf151d2c..56931e0dc7 100644 --- a/src/runtime/simulator.cpp +++ b/src/runtime/simulator.cpp @@ -82,17 +82,17 @@ Simulator::Simulator(FFModel const *model, checkCUDA(hipEventCreate(&start_event)); checkCUDA(hipEventCreate(&end_event)); - conv2d_meta = new Conv2DMeta(handler); - // linear_meta = new LinearMeta(handler, 4096); - pool2d_meta = new Pool2DMeta(handler); - ele_unary_meta = new ElementUnaryMeta(handler); - // ele_binary_meta = new ElementBinaryMeta(handler); - // embedding_meta = new EmbeddingMeta(handler); - // softmax_meta = new SoftmaxMeta(handler); - batch_matmul_meta = new BatchMatmulMeta(handler); - concat_meta = new ConcatMeta(handler); - // 
dropout_meta = new DropoutMeta(handler); - transpose_meta = new TransposeMeta(handler); + // conv2d_meta = new Conv2DMeta(handler); + // linear_meta = new LinearMeta(handler, 4096); + // pool2d_meta = new Pool2DMeta(handler); + // ele_unary_meta = new ElementUnaryMeta(handler); + // ele_binary_meta = new ElementBinaryMeta(handler); + // embedding_meta = new EmbeddingMeta(handler); + // softmax_meta = new SoftmaxMeta(handler); + // batch_matmul_meta = new BatchMatmulMeta(handler); + // concat_meta = new ConcatMeta(handler); + // dropout_meta = new DropoutMeta(handler); + // transpose_meta = new TransposeMeta(handler); this->machine = machine; segment_size = model->config.simulator_segment_size; max_num_segments = model->config.simulator_max_num_segments; diff --git a/src/runtime/simulator.cu b/src/runtime/simulator.cu index b44ce1690a..056781f73d 100644 --- a/src/runtime/simulator.cu +++ b/src/runtime/simulator.cu @@ -81,17 +81,17 @@ Simulator::Simulator(FFModel const *model, cudaEventCreate(&start_event); cudaEventCreate(&end_event); - conv2d_meta = new Conv2DMeta(handler); + // conv2d_meta = new Conv2DMeta(handler); // linear_meta = new LinearMeta(handler, 4096); - pool2d_meta = new Pool2DMeta(handler); - ele_unary_meta = new ElementUnaryMeta(handler); + // pool2d_meta = new Pool2DMeta(handler); + // ele_unary_meta = new ElementUnaryMeta(handler); // ele_binary_meta = new ElementBinaryMeta(handler); // embedding_meta = new EmbeddingMeta(handler); // softmax_meta = new SoftmaxMeta(handler); - batch_matmul_meta = new BatchMatmulMeta(handler); - concat_meta = new ConcatMeta(handler); + // batch_matmul_meta = new BatchMatmulMeta(handler); + // concat_meta = new ConcatMeta(handler); // dropout_meta = new DropoutMeta(handler); - transpose_meta = new TransposeMeta(handler); + // transpose_meta = new TransposeMeta(handler); this->machine = machine; segment_size = model->config.simulator_segment_size; max_num_segments = model->config.simulator_max_num_segments; @@ -103,13 +103,13 @@ Simulator::~Simulator(void) { simulatorInst.destroy(); cudaEventDestroy(start_event); cudaEventDestroy(end_event); - delete conv2d_meta; - delete pool2d_meta; - delete ele_unary_meta; - delete batch_matmul_meta; - delete concat_meta; - delete transpose_meta; - delete task_manager; + // delete conv2d_meta; + // delete pool2d_meta; + // delete ele_unary_meta; + // delete batch_matmul_meta; + // delete concat_meta; + // delete transpose_meta; + // delete task_manager; } __host__ void diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index b86964049d..9b6510fe5e 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -43,6 +43,7 @@ #include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/fused_parallel_op.h" +#include "flexflow/parallel_ops/parallel_identity.h" #include "flexflow/parallel_ops/partition.h" #include "flexflow/parallel_ops/reduction.h" #include "flexflow/parallel_ops/replicate.h" @@ -3754,14 +3755,17 @@ bool FFModel::convert_graph_to_operators( assert(inList.size() == 1); Softmax *softmax = (Softmax *)node.ptr; new_op = new Softmax( - *this, softmax->layer_guid, inputs[0], softmax->dim, NULL); + *this, softmax->layer_guid, inputs[0], softmax->dim, softmax->name); break; } case OP_COMBINE: { assert(inList.size() == 1); Combine *combine = (Combine *)node.ptr; - new_op = new Combine( - *this, inputs[0], combine->combine_dim, combine->combine_degree); + new_op = new Combine(*this, + inputs[0], + 
combine->combine_dim, + combine->combine_degree, + combine->name); break; } case OP_REPARTITION: { @@ -3770,7 +3774,8 @@ bool FFModel::convert_graph_to_operators( new_op = new Repartition(*this, inputs[0], repart->repartition_dim, - repart->repartition_degree); + repart->repartition_degree, + repart->name); break; } case OP_REPLICATE: { @@ -3779,7 +3784,8 @@ bool FFModel::convert_graph_to_operators( new_op = new Replicate(*this, inputs[0], replicate->replicate_dim, - replicate->replicate_degree); + replicate->replicate_degree, + replicate->name); break; } case OP_REDUCTION: { @@ -3788,13 +3794,24 @@ bool FFModel::convert_graph_to_operators( new_op = new Reduction(*this, inputs[0], reduction->reduction_dim, - reduction->reduction_degree); + reduction->reduction_degree, + reduction->name); break; } case OP_ALLREDUCE: { assert(inList.size() == 1); AllReduce *allreduce = (AllReduce *)node.ptr; - new_op = new AllReduce(*this, inputs[0], allreduce->allreduce_dim); + new_op = new AllReduce( + *this, inputs[0], allreduce->allreduce_dim, allreduce->name); + break; + } + case OP_PARALLEL_IDENTITY: { + assert(inList.size() == 1); + ParallelIdentity *parallel_identity = (ParallelIdentity *)node.ptr; + new_op = new ParallelIdentity(*this, + inputs[0], + parallel_identity->parallel_identity_dim, + parallel_identity->name); break; } case OP_FUSED_PARALLEL: { @@ -3819,8 +3836,9 @@ bool FFModel::convert_graph_to_operators( abr_ln->elementwise_affine, abr_ln->use_bias, abr_ln->eps, + abr_ln->inplace_residual, true, - NULL); + abr_ln->name); break; } case OP_SIGMOID_SILU_MULTI: { @@ -3828,7 +3846,7 @@ bool FFModel::convert_graph_to_operators( SigmoidSiluMulti *ssm = (SigmoidSiluMulti *)node.ptr; SigmoidSiluMultiParams params = ssm->get_params(); new_op = new SigmoidSiluMulti( - *this, ssm->layer_guid, inputs[0], inputs[1], NULL); + *this, ssm->layer_guid, inputs[0], inputs[1], ssm->name); break; } default: { diff --git a/src/runtime/tree_verify_batch_config.cc b/src/runtime/tree_verify_batch_config.cc index 49d42bb6dd..a71b1070b2 100644 --- a/src/runtime/tree_verify_batch_config.cc +++ b/src/runtime/tree_verify_batch_config.cc @@ -54,6 +54,10 @@ std::ostream &operator<<(std::ostream &os, TreeVerifyBatchConfig const &bc) { os << " Number of tokens in batch: " << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; + // PEFT values + os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id + << std::endl; + os << " PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; os << " Max sequence length: " << bc.requestsInfo[i].max_sequence_length << std::endl; os << " Request completed: " << bc.request_completed[i] << std::endl; diff --git a/tests/.gitignore b/tests/.gitignore deleted file mode 100644 index f3732d54f4..0000000000 --- a/tests/.gitignore +++ /dev/null @@ -1 +0,0 @@ -inference/python_test_configs/*.json diff --git a/tests/align/test_all_operators.sh b/tests/align/test_all_operators.sh index 3fb361f25c..73b0cb30dc 100755 --- a/tests/align/test_all_operators.sh +++ b/tests/align/test_all_operators.sh @@ -11,7 +11,7 @@ function generate_torch_tensor(){ python tests/align/align_create_tensor_torch.py -o "$1" } -ops=(add concat conv2d cos embedding exp flat getitem identity multiply pool2d reducesum relu reshape scalar_add scalar_multiply scalar_sub scalar_truediv sigmoid sin subtract tanh transpose view_embedding max min linear gather) +ops=(add concat conv2d cos embedding exp flat getitem identity multiply pool2d reducesum relu reshape 
scalar_add scalar_multiply scalar_sub scalar_truediv sigmoid sin subtract tanh transpose view_embedding max min linear layernorm gather) #create flexflow tensors conda activate flexflow diff --git a/tests/cpp_gpu_tests.sh b/tests/cpp_gpu_tests.sh index 1e8dd4298f..c7206eac93 100755 --- a/tests/cpp_gpu_tests.sh +++ b/tests/cpp_gpu_tests.sh @@ -23,8 +23,8 @@ remove_mnist() { download_mnist() { if [[ ! -f train-images-idx3-ubyte || ! -f train-labels-idx1-ubyte ]]; then remove_mnist - wget http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz - wget http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz + wget https://mnist-backup.s3.us-east-2.amazonaws.com/train-images-idx3-ubyte.gz + wget https://mnist-backup.s3.us-east-2.amazonaws.com/train-labels-idx1-ubyte.gz gzip -d train-images-idx3-ubyte.gz gzip -d train-labels-idx1-ubyte.gz fi diff --git a/tests/inference/cpp_inference_tests.sh b/tests/inference/cpp_inference_tests.sh index 8beea55999..a9dd8809ba 100755 --- a/tests/inference/cpp_inference_tests.sh +++ b/tests/inference/cpp_inference_tests.sh @@ -10,26 +10,26 @@ cd "${BASH_SOURCE[0]%/*}" ############################################################################################### # LLAMA -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama.txt -pipeline-parallelism-degree 4 # LLAMA (half precision) -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half.txt -pipeline-parallelism-degree 4 # OPT -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt.txt -pipeline-parallelism-degree 4 # OPT (half precision) -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_half.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 
-ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_half.txt -pipeline-parallelism-degree 4 # Tensor parallelism tests if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then # LLAMA - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # LLAMA (half precision) - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT (half precision) - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 fi ############################################################################################### @@ -37,63 +37,63 @@ fi ############################################################################################### # LLAMA (small model) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt 
../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 4 ../../build/inference/incr_decoding/incr_decoding -ll:gpu 1 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 1 # LLAMA (small model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half.txt -pipeline-parallelism-degree 4 # LLAMA (big model) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B.txt -pipeline-parallelism-degree 4 # LLAMA (big model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half.txt -pipeline-parallelism-degree 4 # OPT (small model) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M.txt -pipeline-parallelism-degree 4 # OPT (small model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half.txt -pipeline-parallelism-degree 4 
+../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half.txt -pipeline-parallelism-degree 4 # OPT (big model) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B.txt -pipeline-parallelism-degree 4 # OPT (big model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half.txt -pipeline-parallelism-degree 4 # Falcon (full precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 40000 --fusion --use-full-precision -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 40000 --fusion --use-full-precision -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 # Falcon (half precision) -# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 +# ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 # # StarCoder (full precision) -# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model bigcode/starcoderbase-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_starcoder_7B.txt -pipeline-parallelism-degree 4 +# ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model bigcode/starcoderbase-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_starcoder_7B.txt -pipeline-parallelism-degree 4 # # StarCoder (half precision) -# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model bigcode/starcoderbase-7b -prompt 
../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_starcoder_7B_half.txt -pipeline-parallelism-degree 4 +# ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model bigcode/starcoderbase-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_starcoder_7B_half.txt -pipeline-parallelism-degree 4 # Tensor parallelism tests if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then # LLAMA (small model) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # LLAMA (small model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # LLAMA (big model) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + 
../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # LLAMA (big model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT (small model) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # OPT (small model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file 
../../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # OPT (big model) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT (big model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 fi ############################################################################################### diff --git a/tests/inference/huggingface_inference.py b/tests/inference/huggingface_inference.py index 6857b5cbc1..5e563c9974 100644 --- a/tests/inference/huggingface_inference.py +++ b/tests/inference/huggingface_inference.py @@ -77,20 +77,18 @@ def main(): # Set default tensor type depending on argument indicating the float type to use if not args.use_full_precision: - torch.set_default_tensor_type(torch.HalfTensor) - + torch.set_default_dtype(torch.float16) + else: + torch.set_default_dtype(torch.float32) + # Run huggingface model cuda_availble = torch.cuda.is_available() device = "cuda" if args.gpu and cuda_availble else "cpu" # Get Model - model = AutoModelForCausalLM.from_pretrained(args.model_name).to(device) + model = AutoModelForCausalLM.from_pretrained(args.model_name, trust_remote_code=True).to(device) # Get Tokenizer hf_config = AutoConfig.from_pretrained(args.model_name, trust_remote_code=True) - hf_arch = getattr(hf_config, "architectures")[0] - if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM": - tokenizer = LlamaTokenizer.from_pretrained(args.model_name, use_fast=True) - else: - tokenizer = AutoTokenizer.from_pretrained(args.model_name) + tokenizer = AutoTokenizer.from_pretrained(args.model_name, trust_remote_code=True) generation_config = GenerationConfig.from_pretrained(args.model_name) generation_config.do_sample = args.do_sample ################# debugging ################# diff --git a/tests/inference/python_inference_tests.sh b/tests/inference/python_inference_tests.sh index a1ee281914..a83464754f 100755 --- a/tests/inference/python_inference_tests.sh +++ b/tests/inference/python_inference_tests.sh @@ -84,12 +84,13 @@ function compare_decoding_steps_spec_infer_incr_decoding { local specInf_file="$2" # Read the number of decoding steps from the second line of the files - second_line=$(sed -n '2p' "$incrDec_file") - read -r line <<< "$second_line" - incrDec=${line#*: } - second_line=$(sed -n '2p' "$specInf_file") - read -r line <<< 
"$second_line" - specInf=${line#*: } + first_line=$(sed -n '1p' "$incrDec_file") + incr_dec_steps="${first_line##*llm_decoding_steps(}" + incr_dec_steps="${incr_dec_steps%%)*}" + + first_line=$(sed -n '1p' "$specInf_file") + spec_inf_steps="${first_line##*llm_decoding_steps(}" + spec_inf_steps="${spec_inf_steps%%)*}" if ! command -v bc &> /dev/null; then echo "bc is not installed. Installing..." @@ -97,8 +98,8 @@ function compare_decoding_steps_spec_infer_incr_decoding { fi # Perform the comparison - threshold=$(bc <<< "$specInf * 1.5") - if (( $(echo "$incrDec >= $threshold" | bc -l) )); then + threshold=$(bc <<< "$spec_inf_steps * 1.5") + if (( $(echo "$incr_dec_steps >= $threshold" | bc -l) )); then #echo "The decoding steps in $specInf_file are at least 1.5x less than those in $incrDec_file." : else @@ -184,13 +185,13 @@ python3 ./huggingface_inference.py --model-name "facebook/opt-6.7b" --use-full-p # Falcon (full precision) python3 ./huggingface_inference.py --model-name "tiiuae/falcon-7b" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_falcon_7B.txt" --max-length 128 -diff "../../inference/output/huggingface_llama_160M.txt" <(tail -n +4 "../../inference/output/incr_dec-python-llama-160m-full_prec-1_tp_4_pp.txt") -diff <( < ../../inference/output/huggingface_llama_160M_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +4 "../../inference/output/incr_dec-python-llama-160m-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) -diff "../../inference/output/huggingface_llama_7B.txt" <(tail -n +4 "../../inference/output/incr_dec-python-llama-2-7b-hf-full_prec-1_tp_4_pp.txt") -diff <( < ../../inference/output/huggingface_llama_7B_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +4 "../../inference/output/incr_dec-python-llama-2-7b-hf-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) +diff "../../inference/output/huggingface_llama_160M.txt" <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-full_prec-1_tp_4_pp.txt") +diff <( < ../../inference/output/huggingface_llama_160M_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) +diff "../../inference/output/huggingface_llama_7B.txt" <(tail -n +3 "../../inference/output/incr_dec-python-llama-2-7b-hf-full_prec-1_tp_4_pp.txt") +diff <( < ../../inference/output/huggingface_llama_7B_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +3 "../../inference/output/incr_dec-python-llama-2-7b-hf-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) -diff "../../inference/output/huggingface_opt_125M.txt" <(tail -n +4 "../../inference/output/incr_dec-python-opt-125m-full_prec-1_tp_4_pp.txt") -diff <( < ../../inference/output/huggingface_opt_125M_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +4 "../../inference/output/incr_dec-python-opt-125m-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) -diff "../../inference/output/huggingface_opt_6B.txt" <(tail -n +4 "../../inference/output/incr_dec-python-opt-6.7b-full_prec-1_tp_4_pp.txt") -#diff "../../inference/output/huggingface_opt_6B_half.txt" <(tail -n +4 "../../inference/output/incr_dec-python-opt-6.7b-half_prec-1_tp_4_pp.txt") -diff "../../inference/output/huggingface_falcon_7B.txt" <(tail -n +4 "../../inference/output/incr_dec-python-falcon-7b-full_prec-1_tp_4_pp.txt") +diff "../../inference/output/huggingface_opt_125M.txt" <(tail 
-n +3 "../../inference/output/incr_dec-python-opt-125m-full_prec-1_tp_4_pp.txt") +diff <( < ../../inference/output/huggingface_opt_125M_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +3 "../../inference/output/incr_dec-python-opt-125m-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) +diff "../../inference/output/huggingface_opt_6B.txt" <(tail -n +3 "../../inference/output/incr_dec-python-opt-6.7b-full_prec-1_tp_4_pp.txt") +#diff "../../inference/output/huggingface_opt_6B_half.txt" <(tail -n +3 "../../inference/output/incr_dec-python-opt-6.7b-half_prec-1_tp_4_pp.txt") +diff "../../inference/output/huggingface_falcon_7B.txt" <(tail -n +3 "../../inference/output/incr_dec-python-falcon-7b-full_prec-1_tp_4_pp.txt") diff --git a/tests/inference/python_test_configs/generate_configs.py b/tests/inference/python_test_configs/generate_configs.py index 41703cf431..0a745c7984 100644 --- a/tests/inference/python_test_configs/generate_configs.py +++ b/tests/inference/python_test_configs/generate_configs.py @@ -14,9 +14,12 @@ "tensor_parallelism_degree": 1, "pipeline_parallelism_degree": 4, "offload": False, - "offload_reserve_space_size": 1024**2, + "offload_reserve_space_size": 8 * 1024, # 8 GB "use_4bit_quantization": False, "use_8bit_quantization": False, + "enable_peft": False, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "benchmarking": False, "inference_debugging": False, diff --git a/tests/peft/alignment/align_test_utils.py b/tests/peft/alignment/align_test_utils.py new file mode 100644 index 0000000000..93727bdc89 --- /dev/null +++ b/tests/peft/alignment/align_test_utils.py @@ -0,0 +1,510 @@ +import os, re, torch +import numpy as np +from typing import List +from enum import Enum +from dataclasses import dataclass + +abs_dirname = os.path.dirname(os.path.abspath(__file__)) +cache_folder = os.path.expanduser(os.getenv("FF_CACHE_PATH", "~/.cache/flexflow")) +hf_path = os.path.join(cache_folder, "debug/huggingface") +ff_path = os.path.join(cache_folder, "debug/flexflow") + + +def print_unique_files_list(dirname): + files_list = os.listdir(dirname) + for f in sorted(files_list): + match = re.search(r"layers.\d+", f) + if match: + if "layers." in match[0]: + layer_num = int(match[0].split(".")[1]) + if layer_num > 0: + files_list.remove(f) + elif "layers_" in match[0]: + layer_num = int(match[0].split("_")[1]) + if layer_num > 0 and layer_num != 100: + files_list.remove(f) + return sorted(files_list) + + +def compare_tensors(hf_tensor_filepath: str, ff_tensor_filepath: str, tolerance=1e-2): + """Check whether a HuggingFace tensor and a FlexFlow tensor are equal + + Args: + hf_tensor_filepath (str): The file path of the HuggingFace tensor + ff_tensor_filepath (str): The file path of the FlexFlow tensor + tolerance (float, optional): Floating-point error tolerance for the checks. Defaults to 1e-2. 
+ + Raises: + FileNotFoundError: _description_ + FileNotFoundError: _description_ + """ + if not os.path.exists(hf_tensor_filepath): + raise FileNotFoundError(f"HF tensor file: {hf_tensor_filepath} not found") + if not os.path.exists(ff_tensor_filepath): + raise FileNotFoundError(f"FF tensor file {ff_tensor_filepath} not found") + hf_tensor = torch.load(hf_tensor_filepath) + if type(hf_tensor) == tuple or type(hf_tensor) == list: + assert len(hf_tensor) == 1 + hf_tensor = hf_tensor[0] + hf_tensor = torch.nan_to_num(hf_tensor) + hf_tensor = hf_tensor.flatten().detach().cpu().numpy() + ff_tensor = np.loadtxt(ff_tensor_filepath, delimiter=",") + + len_hf_tensor = hf_tensor.shape[0] + ff_tensor = ff_tensor[:len_hf_tensor] + + mismatches = [] + if not np.allclose(ff_tensor, hf_tensor, atol=tolerance): + print(f"mismatch between {hf_tensor_filepath} and {ff_tensor_filepath}") + print(f"HF: {hf_tensor}\nFF:{ff_tensor}") + print(np.isclose(ff_tensor, hf_tensor, atol=tolerance)) + mismatches = np.where(~np.isclose(ff_tensor, hf_tensor, atol=tolerance))[0] + print(mismatches) + # print(np.nonzero(hf_tensor)[0]) + # print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0]) + # print(ff_tensor[36], hf_tensor[36]) + # assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance)) + assert len(mismatches) <= 0.05 * len_hf_tensor + print("Ok!") + + +def compare_tensors_difference( + hf_tensor_filepath: str, + ff_tensor1_filepath: str, + ff_tensor2_filepath: str, + tolerance: float = 1e-2, +): + """Check whether a HuggingFace tensor is equal to the difference between two FlexFlow tensors + + Args: + hf_tensor_filepath (str): The file path of the HuggingFace tensor + ff_tensor1_filepath (str): The file path of the first FlexFlow tensor + ff_tensor2_filepath (str): The file path of the second FlexFlow tensor + tolerance (float, optional): The floating-point error tolerance for the equality check. Defaults to 1e-2. 
+ """ + assert os.path.exists(hf_tensor_filepath) + assert os.path.exists(ff_tensor1_filepath) + assert os.path.exists(ff_tensor2_filepath) + hf_tensor = torch.load(hf_tensor_filepath) + if type(hf_tensor) == tuple or type(hf_tensor) == list: + assert len(hf_tensor) == 1 + hf_tensor = hf_tensor[0] + hf_tensor = torch.nan_to_num(hf_tensor) + hf_tensor = hf_tensor.flatten().detach().cpu().numpy() + ff_tensor1 = np.loadtxt(ff_tensor1_filepath, delimiter=",") + ff_tensor2 = np.loadtxt(ff_tensor2_filepath, delimiter=",") + + len_hf_tensor = hf_tensor.shape[0] + ff_tensor1 = ff_tensor1[:len_hf_tensor] + ff_tensor2 = ff_tensor2[:len_hf_tensor] + ff_tensor = ff_tensor1 - ff_tensor2 + + mismatches = [] + if not np.allclose(ff_tensor, hf_tensor, atol=tolerance): + print( + f"mismatch between {hf_tensor_filepath} and {ff_tensor1_filepath} - {ff_tensor2_filepath}" + ) + print(f"HF: {hf_tensor}\nFF:{ff_tensor}") + print(np.isclose(ff_tensor, hf_tensor, atol=tolerance)) + mismatches = np.where(~np.isclose(ff_tensor, hf_tensor, atol=tolerance))[0] + print(mismatches) + # print(np.nonzero(hf_tensor)[0]) + # print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0]) + # print(ff_tensor[36], hf_tensor[36]) + # assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance)) + assert len(mismatches) <= 0.05 * len_hf_tensor + print("Ok!") + + +def compare_hf_tensors(tensor1_fp: str, tensor2_fp: str): + """Checks whether two HuggingFace tensors are equal + + Args: + tensor1_fp (str): The file path of the first tensor + tensor2_fp (str): The file path of the second tensor + """ + if not os.path.exists(tensor1_fp): + raise FileNotFoundError(f"HF tensor file: {tensor1_fp} not found") + if not os.path.exists(tensor2_fp): + raise FileNotFoundError(f"HF tensor file {tensor2_fp} not found") + hf_tensor1 = torch.load(tensor1_fp) + hf_tensor2 = torch.load(tensor2_fp) + if type(hf_tensor1) == tuple or type(hf_tensor1) == list: + assert len(hf_tensor1) == 1 + hf_tensor1 = hf_tensor1[0] + if type(hf_tensor2) == tuple or type(hf_tensor2) == list: + assert len(hf_tensor2) == 1 + hf_tensor2 = hf_tensor2[0] + assert torch.squeeze(hf_tensor1).shape == torch.squeeze(hf_tensor2).shape + hf_tensor1 = torch.nan_to_num(hf_tensor1) + hf_tensor2 = torch.nan_to_num(hf_tensor2) + if not ( + np.allclose( + hf_tensor1.detach().cpu().numpy(), hf_tensor2.detach().cpu().numpy() + ) + ): + print(f"mismatch between {tensor1_fp} and {tensor2_fp}") + print(hf_tensor1) + print(hf_tensor2) + print( + np.isclose( + hf_tensor1.detach().cpu().numpy(), hf_tensor2.detach().cpu().numpy() + ) + ) + mismatches = np.where( + ~np.isclose( + hf_tensor1.detach().cpu().numpy(), hf_tensor2.detach().cpu().numpy() + ) + )[0] + print(mismatches) + assert False + print("Ok!") + + +def check_hf_sum_tensors(tensor_sum_fp: str, tensor1_fp: str, tensor2_fp: str): + """Checks whether a HuggingFace tensor is equal to the sum of two other HuggingFace tensors + + Args: + tensor_sum_fp (str): The file path of the sum tensor + tensor1_fp (str): The file path of the first tensor + tensor2_fp (str): The file path of the second tensor + """ + if not os.path.exists(tensor_sum_fp): + raise FileNotFoundError(f"HF tensor file: {tensor_sum_fp} not found") + if not os.path.exists(tensor1_fp): + raise FileNotFoundError(f"HF tensor file {tensor1_fp} not found") + if not os.path.exists(tensor2_fp): + raise FileNotFoundError(f"HF tensor file {tensor2_fp} not found") + hf_tensor_sum = torch.load(tensor_sum_fp) + hf_tensor1 = torch.load(tensor1_fp) + hf_tensor2 = 
torch.load(tensor2_fp)
+    if type(hf_tensor_sum) == tuple or type(hf_tensor_sum) == list:
+        assert len(hf_tensor_sum) == 1
+        hf_tensor_sum = hf_tensor_sum[0]
+    if type(hf_tensor1) == tuple or type(hf_tensor1) == list:
+        assert len(hf_tensor1) == 1
+        hf_tensor1 = hf_tensor1[0]
+    if type(hf_tensor2) == tuple or type(hf_tensor2) == list:
+        assert len(hf_tensor2) == 1
+        hf_tensor2 = hf_tensor2[0]
+    assert torch.squeeze(hf_tensor_sum).shape == torch.squeeze(hf_tensor1).shape
+    assert torch.squeeze(hf_tensor1).shape == torch.squeeze(hf_tensor2).shape
+    hf_tensor1 = torch.nan_to_num(hf_tensor1)
+    hf_tensor2 = torch.nan_to_num(hf_tensor2)
+    hf_tensor_sum = torch.nan_to_num(hf_tensor_sum)
+    sum_check_tensor = hf_tensor1 + hf_tensor2
+    if not (
+        np.allclose(
+            sum_check_tensor.detach().cpu().numpy(),
+            hf_tensor_sum.detach().cpu().numpy(),
+        )
+    ):
+        print(f"mismatch between {tensor_sum_fp} and {tensor1_fp} + {tensor2_fp}")
+        print(tensor_sum_fp)
+        print(sum_check_tensor)
+        print(hf_tensor1)
+        print(hf_tensor2)
+        print(
+            np.isclose(
+                sum_check_tensor.detach().cpu().numpy(),
+                hf_tensor_sum.detach().cpu().numpy(),
+            )
+        )
+        mismatches = np.where(
+            ~np.isclose(
+                sum_check_tensor.detach().cpu().numpy(),
+                hf_tensor_sum.detach().cpu().numpy(),
+            )
+        )[0]
+        print(mismatches)
+        assert False
+    print("Ok!")
+
+
+def check_hf_zero_tensor(hf_tensor_fp: str):
+    """Check whether a HuggingFace tensor is a zero tensor
+
+    Args:
+        hf_tensor_fp (str): The file path of the HuggingFace tensor
+    """
+    if not os.path.exists(hf_tensor_fp):
+        raise FileNotFoundError(f"HF tensor file: {hf_tensor_fp} not found")
+    hf_tensor1 = torch.load(hf_tensor_fp)
+    if type(hf_tensor1) == tuple or type(hf_tensor1) == list:
+        assert len(hf_tensor1) == 1
+        hf_tensor1 = hf_tensor1[0]
+    assert torch.count_nonzero(torch.nan_to_num(hf_tensor1)).sum() == 0
+
+
+def print_tensors(hf_tensor_filepath: str, ff_tensor_filepath: str, txt: str = ""):
+    """Print the contents of a HuggingFace tensor and a FlexFlow tensor
+
+    Args:
+        hf_tensor_filepath (str): The file path of the HuggingFace tensor
+        ff_tensor_filepath (str): The file path of the FlexFlow tensor
+        txt (str, optional): Additional text to prepend to the tensors. Defaults to "".
+    """
+    assert os.path.exists(hf_tensor_filepath) and os.path.exists(ff_tensor_filepath)
+    hf_tensor = torch.load(hf_tensor_filepath)
+    if type(hf_tensor) == tuple or type(hf_tensor) == list:
+        assert len(hf_tensor) == 1
+        hf_tensor = hf_tensor[0]
+    hf_tensor = torch.nan_to_num(hf_tensor)
+    hf_tensor = hf_tensor.flatten().detach().cpu().numpy()
+    ff_tensor = np.loadtxt(ff_tensor_filepath, delimiter=",")
+
+    len_hf_tensor = hf_tensor.shape[0]
+    ff_tensor = ff_tensor[:len_hf_tensor]
+
+    print(f"{txt} - HF tensor:")
+    print(hf_tensor)
+    print(f"{txt} - FF tensor: ")
+    print(ff_tensor)
+
+
+def compare_flexflow_tensors(
+    ff_tensor1_fp: str, ff_tensor2_fp: str, tolerance: float = 1e-5, max_len: int = -1
+):
+    """Check whether two FlexFlow tensors are equal
+
+    Args:
+        ff_tensor1_fp (str): The file path of the first FlexFlow tensor
+        ff_tensor2_fp (str): The file path of the second FlexFlow tensor
+        tolerance (float, optional): Floating-point error tolerance for the check. Defaults to 1e-5.
+        max_len (int, optional): Maximum number of elements to check (if > 0). Defaults to -1.
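+
+    Example:
+        Illustrative sketch only; the file names are hypothetical placeholders
+        for two dumps of the same logical tensor, and max_len restricts the
+        comparison to the first 1024 elements.
+
+        compare_flexflow_tensors(
+            ff_tensor1_fp="run1/fwd_step_0_layers_0_norm_shard_0_output_0",
+            ff_tensor2_fp="run2/fwd_step_0_layers_0_norm_shard_0_output_0",
+            tolerance=1e-5,
+            max_len=1024,
+        )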
+
+    Raises:
+        FileNotFoundError: If the first FlexFlow tensor file is not found
+        FileNotFoundError: If the second FlexFlow tensor file is not found
+    """
+    if not os.path.exists(ff_tensor1_fp):
+        raise FileNotFoundError(f"FF tensor file: {ff_tensor1_fp} not found")
+    if not os.path.exists(ff_tensor2_fp):
+        raise FileNotFoundError(f"FF tensor file {ff_tensor2_fp} not found")
+    assert os.path.exists(ff_tensor1_fp) and os.path.exists(ff_tensor2_fp)
+    ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=",")
+    ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=",")
+
+    if ff_tensor1.shape != ff_tensor2.shape:
+        print(ff_tensor1.shape, ff_tensor2.shape)
+    assert ff_tensor1.shape == ff_tensor2.shape
+
+    if max_len > -1:
+        ff_tensor1 = ff_tensor1[:max_len]
+        ff_tensor2 = ff_tensor2[:max_len]
+
+    mismatches = []
+    if not np.allclose(ff_tensor1, ff_tensor2, atol=tolerance):
+        print(f"mismatch between {ff_tensor1_fp} and {ff_tensor2_fp}")
+        print(f"Tensor1: {ff_tensor1}\nTensor2:{ff_tensor2}")
+        print(np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))
+        mismatches = np.where(~np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))[0]
+        print(mismatches)
+    # assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))
+    assert len(mismatches) <= 0.05 * len(ff_tensor1)
+    print("Ok!")
+
+
+def compare_flexflow_tensors_shortest(
+    ff_tensor1_fp: str, ff_tensor2_fp: str, tolerance: float = 1e-5
+):
+    """Compare two FlexFlow tensors up to the length of the shorter tensor
+
+    Args:
+        ff_tensor1_fp (str): The file path of the first FlexFlow tensor
+        ff_tensor2_fp (str): The file path of the second FlexFlow tensor
+        tolerance (float, optional): Floating-point error tolerance for the check. Defaults to 1e-5.
+
+    Raises:
+        FileNotFoundError: If the first FlexFlow tensor file is not found
+        FileNotFoundError: If the second FlexFlow tensor file is not found
+    """
+    if not os.path.exists(ff_tensor1_fp):
+        raise FileNotFoundError(f"FF tensor file: {ff_tensor1_fp} not found")
+    if not os.path.exists(ff_tensor2_fp):
+        raise FileNotFoundError(f"FF tensor file {ff_tensor2_fp} not found")
+    ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=",")
+    ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=",")
+    minlen = min(ff_tensor1.shape[0], ff_tensor2.shape[0])
+    ff_tensor1 = ff_tensor1[:minlen]
+    ff_tensor2 = ff_tensor2[:minlen]
+    mismatches = []
+    if not np.allclose(ff_tensor1, ff_tensor2, atol=tolerance):
+        print(f"mismatch between {ff_tensor1_fp} and {ff_tensor2_fp}")
+        print(f"Tensor1: {ff_tensor1}\nTensor2:{ff_tensor2}")
+        print(np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))
+        mismatches = np.where(~np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))[0]
+        print(mismatches)
+    # assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))
+    assert len(mismatches) <= 0.05 * len(ff_tensor1)
+    print("Ok!")
+
+
+def check_flexflow_tensors_sum(
+    ff_tensor_sum_fp: str, ff_tensor1_fp: str, ff_tensor2_fp: str, tolerance=1e-5
+):
+    """Check whether a FlexFlow tensor is equal to the sum of two other FlexFlow tensors
+
+    Args:
+        ff_tensor_sum_fp (str): The file path of the FlexFlow sum tensor
+        ff_tensor1_fp (str): The file path of the first FlexFlow tensor
+        ff_tensor2_fp (str): The file path of the second FlexFlow tensor
+        tolerance (float, optional): Floating-point error tolerance for the check. Defaults to 1e-5.
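+
+    Example:
+        Illustrative sketch only; the file names below are hypothetical
+        placeholders. The typical use is verifying that a fused output dump
+        equals the element-wise sum of two partial output dumps.
+
+        check_flexflow_tensors_sum(
+            ff_tensor_sum_fp="dumps/fused_output",
+            ff_tensor1_fp="dumps/base_output",
+            ff_tensor2_fp="dumps/lora_output",
+            tolerance=1e-5,
+        )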
+
+    Raises:
+        FileNotFoundError: If the first FlexFlow tensor file is not found
+        FileNotFoundError: If the second FlexFlow tensor file is not found
+    """
+    if not os.path.exists(ff_tensor1_fp):
+        raise FileNotFoundError(f"FF tensor file: {ff_tensor1_fp} not found")
+    if not os.path.exists(ff_tensor2_fp):
+        raise FileNotFoundError(f"FF tensor file {ff_tensor2_fp} not found")
+    ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=",")
+    ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=",")
+    ff_tensor_sum = np.loadtxt(ff_tensor_sum_fp, delimiter=",")
+
+    ff_sum = ff_tensor1 + ff_tensor2
+    assert ff_tensor1.shape == ff_tensor2.shape
+
+    mismatches = []
+    if not np.allclose(ff_tensor_sum, ff_sum, atol=tolerance):
+        print(
+            f"mismatch between {ff_tensor_sum_fp} and sum of {ff_tensor1_fp} + {ff_tensor2_fp}"
+        )
+        print(f"Tensor1: {ff_tensor1}\nTensor2:{ff_tensor2}")
+        print(f"Sum Tensor: {ff_tensor_sum}\nActual sum:{ff_sum}")
+        print(np.isclose(ff_tensor_sum, ff_sum, atol=tolerance))
+        mismatches = np.where(~np.isclose(ff_tensor_sum, ff_sum, atol=tolerance))[0]
+        print(mismatches)
+    # assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))
+    assert len(mismatches) <= 0.05 * len(ff_tensor1)
+    print("Ok!")
+
+
+def load_ff_tensor(filename: str, shape: List[int]):
+    """Load a FlexFlow tensor from a file as a numpy array
+
+    Args:
+        filename (str): The file path of the FF tensor
+        shape (List[int]): The shape of the FF tensor
+
+    Returns:
+        np.ndarray: The FF tensor as a numpy array
+    """
+    if ff_path not in filename:
+        filename = os.path.join(ff_path, filename)
+    ff_tensor = np.loadtxt(filename, delimiter=",").reshape(shape, order="F")
+    return ff_tensor
+
+
+def load_hf_tensor(filename: str):
+    """Load a HuggingFace tensor from a file as a numpy array
+
+    Args:
+        filename (str): The file path of the HF tensor
+
+    Returns:
+        np.ndarray: The HF tensor as a numpy array
+    """
+    if hf_path not in filename:
+        filename = os.path.join(hf_path, filename)
+    hf_tensor = torch.load(filename)
+    hf_tensor = hf_tensor.detach().cpu().numpy()
+    return hf_tensor
+
+
+def compare_loaded_tensors(hf_tensor, ff_tensor, tolerance=1e-2):
+    """Check whether a HuggingFace tensor and a FlexFlow tensor, both already loaded into memory as numpy arrays, are equal
+
+    Args:
+        hf_tensor (np.ndarray): The HuggingFace tensor (in numpy array form)
+        ff_tensor (np.ndarray): The FlexFlow tensor (in numpy array form)
+        tolerance (float, optional): The floating-point error tolerance for the check. Defaults to 1e-2.
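+
+    Example:
+        Illustrative sketch only; the file names and the [768, 24] shape are
+        hypothetical placeholders. load_hf_tensor/load_ff_tensor resolve paths
+        relative to hf_path/ff_path, and the HF activation is transposed here
+        under the assumption that the FlexFlow dump stores the hidden dimension
+        first; the two arrays must have identical shapes before the comparison.
+
+        hf_t = load_hf_tensor("fwd_step_0_layers.0.self_attn.o_proj.output_0").squeeze().T
+        ff_t = load_ff_tensor("fwd_step_0_layers_0_layers.0.self_attn_shard_0_output_0", [768, 24])
+        compare_loaded_tensors(hf_t, ff_t, tolerance=1e-2)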
+ """ + assert hf_tensor.shape == ff_tensor.shape + mismatches = [] + if not np.allclose(hf_tensor, ff_tensor, atol=tolerance): + print(f"mismatch between hf_tensor and ff_tensor") + print(f"HF: {hf_tensor}\nFF:{ff_tensor}") + print(np.isclose(hf_tensor, ff_tensor, atol=tolerance)) + mismatches = np.where(~np.isclose(hf_tensor, ff_tensor, atol=tolerance))[0] + print(mismatches) + len_hf_tensor = hf_tensor.flatten().shape[0] + assert len(mismatches) <= 0.05 * len_hf_tensor + print("Ok!") + + +def are_np_arrays_identical(*np_arrays): + if len(np_arrays) < 2: + return True + + first = np_arrays[0] + + # Check shapes and dtypes + if not all( + t.shape == first.shape and t.dtype == first.dtype for t in np_arrays[1:] + ): + return False + + # Stack all tensors along a new axis + stacked = np.stack(np_arrays) + + # Check if all elements along the new axis are equal + return np.all(stacked == stacked[0]) + + +class TPType(Enum): + REPLICATE = 0 + PARTITION = 1 + TO_REDUCE = 2 + + +@dataclass +class TensorComparisonIdxs: + hf_tensor_type: str + ff_tensor_type: str + hf_tensor_idx: int + ff_tensor_idx: int + + +def replace_value(lst, old_value, new_value): + occurrences = lst.count(old_value) + if occurrences == 0: + raise ValueError(f"Value {old_value} not found in the list.") + elif occurrences > 1: + raise ValueError(f"Multiple instances of {old_value} found in the list.") + else: + index = lst.index(old_value) + lst[index] = new_value + return lst + + +def truncate_dimension(tensor, old_dim, new_dim): + # Check if old_dim appears exactly once in the tensor's shape + shape = tensor.shape + dim_occurrences = shape.count(old_dim) + + if dim_occurrences == 0: + raise ValueError(f"Dimension {old_dim} not found in the tensor shape.") + elif dim_occurrences > 1: + raise ValueError( + f"Multiple instances of dimension {old_dim} found in the tensor shape." + ) + + # Check if new_dim is less than or equal to old_dim + if new_dim > old_dim: + raise ValueError( + f"New dimension ({new_dim}) must be less than or equal to old dimension ({old_dim})." 
+ ) + + # Find the index of the dimension to truncate + dim_index = shape.index(old_dim) + + # Create a slice object for truncation + slices = [slice(None)] * len(shape) + slices[dim_index] = slice(0, new_dim) + + # Truncate the tensor + truncated_tensor = tensor[tuple(slices)] + + return truncated_tensor diff --git a/tests/peft/alignment/llama_alignment_tests.ipynb b/tests/peft/alignment/llama_alignment_tests.ipynb new file mode 100644 index 0000000000..86a4ef76c4 --- /dev/null +++ b/tests/peft/alignment/llama_alignment_tests.ipynb @@ -0,0 +1,2651 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import os, torch\n", + "from align_test_utils import *" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/usr/FlexFlow/tests/peft/hf_peft_tensors /usr/FlexFlow/build/inference_tensors\n" + ] + } + ], + "source": [ + "print(hf_path, ff_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Check weights (semi-automatically)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "def convert_hf_filename_to_ff_filename(f, num_layers=12):\n", + " if f.endswith(\".lm_head.weight\"):\n", + " f_version = f\"fwd_step_0_layers_{num_layers-1}_lm_head_shard_0_weight_0\"\n", + " elif f == \"norm.weight\":\n", + " f_version = f\"fwd_step_0_layers_{num_layers-1}_norm_shard_0_weight_0\"\n", + " else:\n", + " f_version = \"fwd_step_0_\"\n", + " if f.startswith(\"layers.\"):\n", + " layernum = f.split(\"layers.\")[1].split(\".\")[0]\n", + " f_version += f\"layers_{layernum}_\"\n", + " f_version += f.split(\".weight\")[0].replace(\".base_layer\", \"\").replace(\".default\", \"\")\n", + " weight_index=\"0\"\n", + " if \"lora_A\" in f_version:\n", + " weight_index=\"A\"\n", + " elif \"lora_B\" in f_version:\n", + " weight_index=\"B\"\n", + " f_version = f_version.replace(\"lora_A\", \"lora\").replace(\"lora_B\", \"lora\")\n", + " f_version += f\"_shard_0_weight_{weight_index}\"\n", + " return f_version\n", + "\n", + "files_list = os.listdir(hf_path)\n", + "num_layers=12\n", + "for f in sorted(files_list):\n", + " if f.endswith(\".weight\"):\n", + " if \"self_attn\" in f:\n", + " continue\n", + " f_version = convert_hf_filename_to_ff_filename(f, num_layers=num_layers)\n", + " # print(f, f_version)\n", + " hf_w_path = os.path.join(hf_path, f)\n", + " ff_w_path = os.path.join(ff_path, f_version)\n", + " assert(os.path.isfile(hf_w_path))\n", + " assert(os.path.isfile(ff_w_path))\n", + " # print(\"\\t\", os.path.isfile(hf_w_path), os.path.isfile(ff_w_path))\n", + " # print(\"\\t\", 
ff_w_path)\n", + "\n", + " # check equivalence\n", + " compare_tensors(hf_w_path, ff_w_path, tolerance=1e-5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load model for automatic check" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "/opt/conda/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "from transformers import AutoModelForCausalLM\n", + "from peft import PeftModel, PeftConfig\n", + "use_full_precision=True\n", + "peft_model_id=\"goliaro/llama-160m-lora\"\n", + "peft_config = PeftConfig.from_pretrained(peft_model_id)\n", + "if peft_config.peft_type != \"LORA\":\n", + " raise ValueError(f\"PEFT type {peft_config.peft_type} not supported yet\")\n", + "\n", + "peft_config.init_lora_weights = (\n", + " False\n", + ") # prevent HF from re-inizialing the weights randomly\n", + "model_name = peft_config.base_model_name_or_path\n", + "# Load base model, and apply the PEFT layer\n", + "model = AutoModelForCausalLM.from_pretrained(\n", + " model_name,\n", + " torch_dtype=torch.float32 if use_full_precision else torch.float16,\n", + " device_map=\"auto\",\n", + ")\n", + "model = PeftModel.from_pretrained(model, peft_model_id, config=peft_config)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "embed_tokens True True\n", + "layers.0.self_attn.q_proj True True\n", + "layers.0.self_attn.k_proj True True\n", + "layers.0.self_attn.v_proj True True\n", + "layers.0.self_attn.o_proj True True\n", + "layers.0.self_attn.rotary_emb True True\n", + "layers.0.mlp.gate_proj True True\n", + "layers.0.mlp.up_proj True True\n", + "layers.0.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.0.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.0.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.0.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.0.mlp.down_proj.lora_embedding_A False False\n", + "\t 
/usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.0.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.0.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.act_fn_shard_0_output_0\n", + "layers.0.input_layernorm True True\n", + "layers.0.post_attention_layernorm True True\n", + "layers.1.self_attn.q_proj True True\n", + "layers.1.self_attn.k_proj True True\n", + "layers.1.self_attn.v_proj True True\n", + "layers.1.self_attn.o_proj True True\n", + "layers.1.self_attn.rotary_emb True True\n", + "layers.1.mlp.gate_proj True True\n", + "layers.1.mlp.up_proj True True\n", + "layers.1.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.1.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.1.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.1.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.1.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.1.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.1.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.act_fn_shard_0_output_0\n", + "layers.1.input_layernorm True True\n", + "layers.1.post_attention_layernorm True True\n", + "layers.2.self_attn.q_proj True True\n", + "layers.2.self_attn.k_proj True True\n", + "layers.2.self_attn.v_proj True True\n", + "layers.2.self_attn.o_proj True True\n", + "layers.2.self_attn.rotary_emb True True\n", + "layers.2.mlp.gate_proj True True\n", + "layers.2.mlp.up_proj True True\n", + 
"layers.2.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.2.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.2.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.2.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.2.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.2.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.2.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.act_fn_shard_0_output_0\n", + "layers.2.input_layernorm True True\n", + "layers.2.post_attention_layernorm True True\n", + "layers.3.self_attn.q_proj True True\n", + "layers.3.self_attn.k_proj True True\n", + "layers.3.self_attn.v_proj True True\n", + "layers.3.self_attn.o_proj True True\n", + "layers.3.self_attn.rotary_emb True True\n", + "layers.3.mlp.gate_proj True True\n", + "layers.3.mlp.up_proj True True\n", + "layers.3.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.3.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.3.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.3.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_B.default_shard_0_input_0 
/usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.3.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.3.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.3.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.act_fn_shard_0_output_0\n", + "layers.3.input_layernorm True True\n", + "layers.3.post_attention_layernorm True True\n", + "layers.4.self_attn.q_proj True True\n", + "layers.4.self_attn.k_proj True True\n", + "layers.4.self_attn.v_proj True True\n", + "layers.4.self_attn.o_proj True True\n", + "layers.4.self_attn.rotary_emb True True\n", + "layers.4.mlp.gate_proj True True\n", + "layers.4.mlp.up_proj True True\n", + "layers.4.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.4.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.4.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.4.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.4.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.4.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.4.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.act_fn_shard_0_output_0\n", + "layers.4.input_layernorm True True\n", + "layers.4.post_attention_layernorm True True\n", + "layers.5.self_attn.q_proj True True\n", + "layers.5.self_attn.k_proj True True\n", + "layers.5.self_attn.v_proj 
True True\n", + "layers.5.self_attn.o_proj True True\n", + "layers.5.self_attn.rotary_emb True True\n", + "layers.5.mlp.gate_proj True True\n", + "layers.5.mlp.up_proj True True\n", + "layers.5.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.5.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.5.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.5.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.5.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.5.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.5.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.act_fn_shard_0_output_0\n", + "layers.5.input_layernorm True True\n", + "layers.5.post_attention_layernorm True True\n", + "layers.6.self_attn.q_proj True True\n", + "layers.6.self_attn.k_proj True True\n", + "layers.6.self_attn.v_proj True True\n", + "layers.6.self_attn.o_proj True True\n", + "layers.6.self_attn.rotary_emb True True\n", + "layers.6.mlp.gate_proj True True\n", + "layers.6.mlp.up_proj True True\n", + "layers.6.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.6.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.6.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.6.mlp.down_proj.lora_B.default True False\n", + "\t 
/usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.6.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.6.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.6.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.act_fn_shard_0_output_0\n", + "layers.6.input_layernorm True True\n", + "layers.6.post_attention_layernorm True True\n", + "layers.7.self_attn.q_proj True True\n", + "layers.7.self_attn.k_proj True True\n", + "layers.7.self_attn.v_proj True True\n", + "layers.7.self_attn.o_proj True True\n", + "layers.7.self_attn.rotary_emb True True\n", + "layers.7.mlp.gate_proj True True\n", + "layers.7.mlp.up_proj True True\n", + "layers.7.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.7.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.7.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.7.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.7.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.7.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.7.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.act_fn_shard_0_output_0\n", + "layers.7.input_layernorm True True\n", + "layers.7.post_attention_layernorm True True\n", + 
"layers.8.self_attn.q_proj True True\n", + "layers.8.self_attn.k_proj True True\n", + "layers.8.self_attn.v_proj True True\n", + "layers.8.self_attn.o_proj True True\n", + "layers.8.self_attn.rotary_emb True True\n", + "layers.8.mlp.gate_proj True True\n", + "layers.8.mlp.up_proj True True\n", + "layers.8.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.8.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.8.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.8.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.8.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.8.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.8.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.act_fn_shard_0_output_0\n", + "layers.8.input_layernorm True True\n", + "layers.8.post_attention_layernorm True True\n", + "layers.9.self_attn.q_proj True True\n", + "layers.9.self_attn.k_proj True True\n", + "layers.9.self_attn.v_proj True True\n", + "layers.9.self_attn.o_proj True True\n", + "layers.9.self_attn.rotary_emb True True\n", + "layers.9.mlp.gate_proj True True\n", + "layers.9.mlp.up_proj True True\n", + "layers.9.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.9.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.9.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_A.default_shard_0_input_0 
/usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.9.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.9.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.9.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.9.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.act_fn_shard_0_output_0\n", + "layers.9.input_layernorm True True\n", + "layers.9.post_attention_layernorm True True\n", + "layers.10.self_attn.q_proj True True\n", + "layers.10.self_attn.k_proj True True\n", + "layers.10.self_attn.v_proj True True\n", + "layers.10.self_attn.o_proj True True\n", + "layers.10.self_attn.rotary_emb True True\n", + "layers.10.mlp.gate_proj True True\n", + "layers.10.mlp.up_proj True True\n", + "layers.10.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.10.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.10.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.10.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.10.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.10.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.10.mlp.act_fn True False\n", + "\t 
/usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.act_fn_shard_0_output_0\n", + "layers.10.input_layernorm True True\n", + "layers.10.post_attention_layernorm True True\n", + "layers.11.self_attn.q_proj True True\n", + "layers.11.self_attn.k_proj True True\n", + "layers.11.self_attn.v_proj True True\n", + "layers.11.self_attn.o_proj True True\n", + "layers.11.self_attn.rotary_emb True True\n", + "layers.11.mlp.gate_proj True True\n", + "layers.11.mlp.up_proj True True\n", + "layers.11.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.11.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.11.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.11.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.11.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.11.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.11.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.act_fn_shard_0_output_0\n", + "layers.11.input_layernorm True True\n", + "layers.11.post_attention_layernorm True True\n", + "norm True True\n", + "lm_head True True\n" + ] + } + ], + "source": [ + "named_modules_ = [\n", + " name.replace(\"base_model.model.model.\", \"\").replace(\"base_model.model.model\", \"\").replace(\"base_model.model.\", \"\").replace(\"base_model.model\", \"\").replace(\"base_model.\", \"\").replace(\"base_model\", \"\")\n", + " for name, _ in model.named_modules()\n", + "]\n", + "\n", + "def remove_prefixes(named_modules):\n", + " i = 0\n", + " while i < len(named_modules) - 1:\n", + " if named_modules[i + 1].startswith(named_modules[i]):\n", + " named_modules.pop(i)\n", + " else:\n", + " i += 1\n", + " return named_modules\n", + "named_modules = remove_prefixes(named_modules_)\n", + "\n", + "def convert_hf_module_name_to_ff_filenames(n, num_layers=12):\n", + " if n == \"embed_tokens\":\n", + " ff_in_name = 
\"fwd_step_0_layers_0_embed_tokens_shard_0_input_0\"\n", + " ff_out_name = \"fwd_step_0_layers_0_embed_tokens_shard_0_output_0\"\n", + " elif n == \"lm_head\" or n == \"norm\":\n", + " ff_in_name = f\"fwd_step_0_layers_{num_layers-1}_{n}_shard_0_input_0\"\n", + " ff_out_name = f\"fwd_step_0_layers_{num_layers-1}_{n}_shard_0_output_0\"\n", + " elif n.startswith(\"layers.\"):\n", + " layernum = n.split(\"layers.\")[1].split(\".\")[0]\n", + " ff_in_name = f\"fwd_step_0_layers_{layernum}_{n}_shard_0_input_0\"\n", + " ff_out_name = f\"fwd_step_0_layers_{layernum}_{n}_shard_0_output_0\"\n", + " else:\n", + " assert False, f\"Module {n} not supported yet\"\n", + " return os.path.join(ff_path, ff_in_name), os.path.join(ff_path, ff_out_name)\n", + "\n", + "# Compute the hf path, check if the input and output are there\n", + "for n in named_modules:\n", + " in_name = f\"fwd_step_0_{n}.input_0\"\n", + " out_name = f\"fwd_step_0_{n}.output_0\"\n", + " if n == \"lm_head\":\n", + " in_name = f\"fwd_step_0_base_model.model.{n}.input_0\"\n", + " out_name = f\"fwd_step_0_base_model.model.{n}.output_0\"\n", + " hf_mod_in = os.path.join(hf_path, in_name)\n", + " hf_mod_out = os.path.join(hf_path, out_name)\n", + " check = os.path.exists(hf_mod_in) and os.path.exists(hf_mod_out)\n", + " \n", + " check2=True\n", + " if \"self_attn\" not in n:\n", + " ff_mod_in, ff_mod_out = convert_hf_module_name_to_ff_filenames(n, num_layers=num_layers)\n", + " check2 = os.path.exists(ff_mod_in) and os.path.exists(ff_mod_out)\n", + " print(n, check, check2)\n", + " if not check2:\n", + " print(\"\\t\", ff_mod_in, ff_mod_out)\n", + " # print(n, check)\n", + " # print(\"\\t\", )\n", + " \n", + "\n", + "# Compute the corresponding ff path, check if the input and output are there\n", + "\n", + "# for x in named_modules:\n", + "# print(x)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'down_proj'}\n" + ] + } + ], + "source": [ + "print(model.peft_config['default'].target_modules)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Manual check" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "hf_embed_input= \"/usr/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_embed_tokens.input_0\"\n", + "ff_embed_input=\"/usr/FlexFlow/tests/peft/inference_tensors/fwd_step_0_layers_0_embed_tokens_shard_0_input_0\"\n", + "compare_tensors(hf_embed_input, ff_embed_input)\n", + "hf_embed_output=\"/usr/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_embed_tokens.output_0\"\n", + "ff_embed_output=\"/usr/FlexFlow/tests/peft/inference_tensors/fwd_step_0_layers_0_embed_tokens_shard_0_output_0\"\n", + "compare_tensors(hf_embed_output, ff_embed_output)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + 
"Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_layers.10.input_layernorm.input_0 and /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.input_layernorm_shard_0_output_0\n", + "HF: [ 0. 0. 0. ... 0.06630182 6.3429456\n", + " -0.21220279]\n", + "FF:[ 0. 0. 0. ... 0.06630275 6.34293985\n", + " -0.21219885]\n", + "[ True True True ... True True True]\n", + "[15889]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_layers.11.input_layernorm.input_0 and /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.input_layernorm_shard_0_output_0\n", + "HF: [ 0. 0. 0. ... 0.14172177 9.79423\n", + " -6.2940273 ]\n", + "FF:[ 0. 0. 0. ... 0.14172006 9.79421902\n", + " -6.29402065]\n", + "[ True True True ... 
True True True]\n", + "[ 2878 3206 3367 3607 5183 5346 6257 6544 7466 7679 7805 8119\n", + " 8159 8911 9450 9897 13696 13938 14058 14599 15126 15839 16128 16195]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "for i in range(tot_num_layers):\n", + " hf_input_ln_in = f\"{hf_path}/fwd_step_0_layers.{i}.input_layernorm.input_0\"\n", + " ff_input_ln_in = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_input_0\"\n", + " if i > 0:\n", + " ff_input_ln_in = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_output_0\"\n", + " compare_tensors(hf_input_ln_in, ff_input_ln_in, tolerance=1e-5)\n", + " hf_input_ln_out = f\"{hf_path}/fwd_step_0_layers.{i}.input_layernorm.output_0\"\n", + " ff_input_ln_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_output_0\"\n", + " if i > 0:\n", + " ff_input_ln_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_output_1\"\n", + " compare_tensors(hf_input_ln_out, ff_input_ln_out, tolerance=1e-5)\n", + " hf_attn_out = f\"{hf_path}/fwd_step_0_layers.{i}.self_attn.o_proj.output_0\"\n", + " ff_attn_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.self_attn_shard_0_output_0\"\n", + " compare_tensors(hf_attn_out, ff_attn_out, tolerance=1e-5)\n", + " hf_ffn_norm_out = f\"{hf_path}/fwd_step_0_layers.{i}.post_attention_layernorm.output_0\"\n", + " ff_ffn_norm_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.post_attention_layernorm_shard_0_output_1\"\n", + " compare_tensors(hf_ffn_norm_out, ff_ffn_norm_out, tolerance=1e-5)\n", + " # w1\n", + " hf_gate_proj_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.gate_proj.output_0\"\n", + " ff_gate_proj_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.gate_proj_shard_0_output_0\"\n", + " compare_tensors(hf_gate_proj_out, ff_gate_proj_out, tolerance=1e-5)\n", + " # w3\n", + " hf_up_proj_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.up_proj.output_0\" \n", + " ff_up_proj_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.up_proj_shard_0_output_0\"\n", + " compare_tensors(hf_up_proj_out, ff_up_proj_out, tolerance=1e-5)\n", + " # w2\n", + " hf_down_proj_in = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.input_0\"\n", + " hf_down_proj_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.output_0\"\n", + " ff_down_proj_in = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj_shard_0_input_0\"\n", + " ff_down_proj_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj_shard_0_output_0\"\n", + " compare_tensors(hf_down_proj_in, ff_down_proj_in)\n", + " # compare_tensors(hf_down_proj_out, ff_down_proj_out)\n", + " # LORA input\n", + " hf_lora_A_in = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_A.default.input_0\"\n", + " ff_lora_A_in = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_input_0\"\n", + " compare_hf_tensors(hf_down_proj_in, hf_lora_A_in)\n", + " compare_tensors(hf_lora_A_in, ff_lora_A_in)\n", + " # LORA weights\n", + " hf_lora_A_weight_fp = f\"{hf_path}/layers.{i}.mlp.down_proj.lora_A.default.weight\"\n", + " ff_lora_A_weight_fp = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_weight_A\"\n", + " compare_tensors(hf_lora_A_weight_fp, ff_lora_A_weight_fp)\n", + " hf_lora_B_weight_fp = f\"{hf_path}/layers.{i}.mlp.down_proj.lora_B.default.weight\"\n", + " ff_lora_B_weight_fp = 
f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_weight_B\"\n", + " compare_tensors(hf_lora_B_weight_fp, ff_lora_B_weight_fp)\n", + " # LORA intermediate hf\n", + " hf_lora_A_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_A.default.output_0\"\n", + " hf_lora_B_in = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_B.default.input_0\"\n", + " compare_hf_tensors(hf_lora_A_out, hf_lora_B_in)\n", + " # LORA output\n", + " hf_lora_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_B.default.output_0\"\n", + " ff_lora_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_output_0\"\n", + " # compare_tensors(hf_lora_out, ff_lora_out)\n", + " # compare_flexflow_tensors(ff_down_proj_out, ff_lora_out)\n", + " # compare_tensors(hf_down_proj_out, ff_lora_out)\n", + " compare_tensors_difference(hf_lora_out, ff_lora_out, ff_down_proj_out)\n", + " \n", + "\n", + "# After last layer only\n", + "hf_norm_out = f\"{hf_path}/fwd_step_0_norm.output_0\"\n", + "ff_norm_out = f\"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_norm_shard_0_output_1\"\n", + "compare_tensors(hf_norm_out, ff_norm_out, tolerance=1e-5)\n", + "hf_lm_head_out = f\"{hf_path}/fwd_step_0_base_model.model.lm_head.output_0\"\n", + "ff_lm_head_out = f\"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_lm_head_shard_0_output_0\"\n", + "compare_tensors(hf_lm_head_out, ff_lm_head_out, tolerance=1e-5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-- LM head --\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "-- Final Norm --\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "\n", + "# ff_BWD_softmax_in = f\"{ff_path}/model_0_bwd-step_0_layer-num_100_layer-name_Softmax_shard-id_0_input_0\"\n", + "print(\"-- LM head --\")\n", + "hf_BWD_lm_head_out = f\"{hf_path}/bwd_step_0_base_model.model.lm_head.go_0\"\n", + "ff_BWD_lm_head_out = f\"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_output_shard_0_output_0\"\n", + "compare_tensors(hf_BWD_lm_head_out, ff_BWD_lm_head_out, tolerance=1e-5)\n", + "# compare weights\n", + "hf_lm_head_weight = f\"{hf_path}/base_model.model.lm_head.weight\"\n", + "ff_lm_head_weight = f\"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_output_shard_0_weight_0\"\n", + "compare_tensors(hf_lm_head_weight, ff_lm_head_weight, tolerance=1e-5)\n", + "hf_BWD_lm_head_in = f\"{hf_path}/bwd_step_0_base_model.model.lm_head.gi_0\"\n", + "ff_BWD_lm_head_in = f\"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_output_shard_0_input_0\"\n", + "compare_tensors(hf_BWD_lm_head_in, ff_BWD_lm_head_in, tolerance=1e-5)\n", + "# # Manually check the matmul\n", + "# ff_tensor_out = np.loadtxt(ff_BWD_lm_head_out, delimiter=',')\n", + "# ff_weight = np.loadtxt(ff_lm_head_weight, delimiter=',').reshape((4096,32000), order='F')\n", + "# ff_tensor_out = ff_tensor_out[:32000*24].reshape((32000,24), order='F')\n", + "# print(ff_tensor_out.shape)\n", + "# print(ff_weight.shape)\n", + "# print(np.matmul(ff_weight, ff_tensor_out))\n", + "# compare_tensors(hf_BWD_lm_head_in, ff_BWD_lm_head_in)\n", + "# ff_tensor = np.loadtxt(ff_tensor_filepath, delimiter=',')\n", + "print(\"-- Final Norm --\")\n", + "hf_BWD_norm_out = f\"{hf_path}/bwd_step_0_norm.go_0\"\n", + "ff_BWD_norm_out = f\"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_norm_shard_0_output_0\"\n", + "compare_hf_tensors(hf_BWD_lm_head_in, hf_BWD_norm_out)\n", + 
"compare_tensors(hf_BWD_norm_out, ff_BWD_norm_out)\n", + "ff_BWD_norm_weight = f\"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_norm_shard_0_weight_0\"\n", + "hf_FWD_norm_weight = f\"{hf_path}/norm.weight\"\n", + "compare_tensors(hf_FWD_norm_weight, ff_BWD_norm_weight, tolerance=1e-5)\n", + "hf_BWD_norm_in = f\"{hf_path}/bwd_step_0_norm.gi_0\"\n", + "ff_BWD_norm_in = f\"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_norm_shard_0_input_1\"\n", + "compare_tensors(hf_BWD_norm_in, ff_BWD_norm_in, tolerance=1e-5)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from torch import nn\n", + "class LlamaRotaryEmbedding(nn.Module):\n", + " def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):\n", + " super().__init__()\n", + "\n", + " self.dim = dim\n", + " self.max_position_embeddings = max_position_embeddings\n", + " self.base = base\n", + " inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))\n", + " self.register_buffer(\"inv_freq\", inv_freq, persistent=False)\n", + "\n", + " # Build here to make `torch.jit.trace` work.\n", + " self._set_cos_sin_cache(\n", + " seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()\n", + " )\n", + "\n", + " def _set_cos_sin_cache(self, seq_len, device, dtype):\n", + " self.max_seq_len_cached = seq_len\n", + " t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)\n", + "\n", + " freqs = torch.einsum(\"i,j->ij\", t, self.inv_freq)\n", + " # Different from paper, but it uses a different permutation in order to obtain the same calculation\n", + " emb = torch.cat((freqs, freqs), dim=-1)\n", + " self.register_buffer(\"cos_cached\", emb.cos().to(dtype), persistent=False)\n", + " self.register_buffer(\"sin_cached\", emb.sin().to(dtype), persistent=False)\n", + "\n", + " def forward(self, x, seq_len=None):\n", + " # x: [bs, num_attention_heads, seq_len, head_size]\n", + " if seq_len > self.max_seq_len_cached:\n", + " self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)\n", + "\n", + " return (\n", + " self.cos_cached[:seq_len].to(dtype=x.dtype),\n", + " self.sin_cached[:seq_len].to(dtype=x.dtype),\n", + " )\n", + "def rotate_half(x):\n", + " \"\"\"Rotates half the hidden dims of the input.\"\"\"\n", + " x1 = x[..., : x.shape[-1] // 2] # first half\n", + " x2 = x[..., x.shape[-1] // 2 :] # second half\n", + " return torch.cat((x2, -x1), dim=-1)\n", + "def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):\n", + " \"\"\"Applies Rotary Position Embedding to the query and key tensors.\n", + "\n", + " Args:\n", + " q (`torch.Tensor`): The query tensor.\n", + " k (`torch.Tensor`): The key tensor.\n", + " cos (`torch.Tensor`): The cosine part of the rotary embedding.\n", + " sin (`torch.Tensor`): The sine part of the rotary embedding.\n", + " position_ids (`torch.Tensor`):\n", + " The position indices of the tokens corresponding to the query and key tensors. For example, this can be\n", + " used to pass offsetted position ids when working with a KV-cache.\n", + " unsqueeze_dim (`int`, *optional*, defaults to 1):\n", + " The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and\n", + " sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note\n", + " that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. 
Then, if q and\n", + " k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes\n", + " cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have\n", + " the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.\n", + " Returns:\n", + " `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.\n", + " \"\"\"\n", + " cos = cos[position_ids].unsqueeze(unsqueeze_dim)\n", + " sin = sin[position_ids].unsqueeze(unsqueeze_dim)\n", + " q_embed = (q * cos) + (rotate_half(q) * sin)\n", + " k_embed = (k * cos) + (rotate_half(k) * sin)\n", + " return q_embed, k_embed\n", + "head_dim = 64\n", + "max_position_embeddings = 2048\n", + "rope_theta=10_000\n", + "kv_seq_len = 24\n", + "rotary_emb = LlamaRotaryEmbedding(\n", + " head_dim,\n", + " max_position_embeddings=max_position_embeddings,\n", + " base=rope_theta,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Huggingface checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "Huggingface-FlexFlow checks:\n", + "-- W2 --\n", + "Ok!\n", + "Ok!\n", + "-- Lora --\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "-- W2/W1/W3 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_11_SigmoidSiluMulti_shard_0_output_0\n", + "HF: [ 6.4350547e+03 -6.4898600e+05 1.1761116e+05 ... 2.1410337e+01\n", + " 1.2096541e+01 3.6424692e+00]\n", + "FF:[ 6.43525000e+03 -6.48986062e+05 1.17611250e+05 ... 2.14103413e+01\n", + " 1.20965385e+01 3.64246368e+00]\n", + "[False True True ... True True True]\n", + "[ 0 162 185 308 339 745 747 820 830 909 933 968 1008 1156\n", + " 1160 1190 1212 1296 1304 1311 1323 1353 1395 1421 1523 1578 1689 1717\n", + " 1736 1748 1836 2074 2124 2192 2221 2313 2394 2515 2518 2693 2758 2825\n", + " 2888 2894 2937 3024]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_11_layers_11_feed_forward_w2_shard_0_input_0\n", + "HF: [ 6.4350547e+03 -6.4898600e+05 1.1761116e+05 ... 2.1410337e+01\n", + " 1.2096541e+01 3.6424692e+00]\n", + "FF:[ 6.43525000e+03 -6.48986062e+05 1.17611250e+05 ... 2.14103413e+01\n", + " 1.20965385e+01 3.64246368e+00]\n", + "[False True True ... True True True]\n", + "[ 0 162 185 308 339 745 747 820 830 909 933 968 1008 1156\n", + " 1160 1190 1212 1296 1304 1311 1323 1353 1395 1421 1523 1578 1689 1717\n", + " 1736 1748 1836 2074 2124 2192 2221 2313 2394 2515 2518 2693 2758 2825\n", + " 2888 2894 2937 3024]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "-- Attention --\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.self_attn.o_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_11_layers_11_attention_shard_0_o_proj_in_grad\n", + "HF: [ 1.2223595e+06 -2.6348565e+06 -5.0760525e+05 ... 
6.8275871e+01\n", + " -5.8116108e+01 9.5347488e+01]\n", + "FF:[ 1.22235925e+06 -2.63485625e+06 -5.07605000e+05 ... 6.82758865e+01\n", + " -5.81161423e+01 9.53475494e+01]\n", + "[ True True True ... True True True]\n", + "[ 51 77 95 168 175 232 725]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[ 1.22235950e+06 9.93645859e+01 -2.82157593e+01 ... -3.94578514e+01\n", + " -1.98409653e+01 -1.33438044e+01]\n", + " [-2.63485650e+06 -1.13461929e+02 1.14223976e+02 ... 7.52578735e+01\n", + " 1.33362747e+02 6.78501587e+01]\n", + " [-5.07605250e+05 4.34111862e+01 8.10619354e+01 ... 4.70537224e+01\n", + " 4.02149696e+01 6.98045502e+01]\n", + " ...\n", + " [ 3.02792250e+06 3.31295319e+02 9.98417091e+00 ... 4.90895653e+01\n", + " 9.71413574e+01 6.82758713e+01]\n", + " [-3.64456375e+06 -2.43692596e+02 -6.85474396e+00 ... -3.71503868e+01\n", + " -1.34136658e+01 -5.81161079e+01]\n", + " [ 3.31921500e+06 2.24193970e+02 -6.64005566e+00 ... 2.11662292e+00\n", + " 3.37400856e+01 9.53474884e+01]]\n", + "FF:[[ 1.22235925e+06 9.93645630e+01 -2.82157211e+01 ... -3.94577713e+01\n", + " -1.98408775e+01 -1.33438234e+01]\n", + " [-2.63485625e+06 -1.13461960e+02 1.14224037e+02 ... 7.52577744e+01\n", + " 1.33362701e+02 6.78501205e+01]\n", + " [-5.07605000e+05 4.34111404e+01 8.10619278e+01 ... 4.70536804e+01\n", + " 4.02149124e+01 6.98045578e+01]\n", + " ...\n", + " [ 3.02792250e+06 3.31295227e+02 9.98412323e+00 ... 4.90895386e+01\n", + " 9.71413727e+01 6.82758865e+01]\n", + " [-3.64456400e+06 -2.43692627e+02 -6.85472488e+00 ... -3.71504822e+01\n", + " -1.34137001e+01 -5.81161423e+01]\n", + " [ 3.31921500e+06 2.24193970e+02 -6.64004517e+00 ... 2.11670875e+00\n", + " 3.37400322e+01 9.53475494e+01]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[ 51 77 95 168 175 232 725]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[ 1.2223588e+06 -2.6348530e+06 -5.0760291e+05 ... 3.0279325e+06\n", + " -3.6445672e+06 3.3192180e+06]\n", + " [-4.2496326e+02 1.1576636e+03 9.8397858e+02 ... 1.6480791e+03\n", + " -5.9697235e+02 6.2627173e+02]\n", + " [-2.2012039e+01 6.6097900e+01 3.9933994e+01 ... 5.7103355e+01\n", + " -1.5968766e+01 3.6536639e+00]\n", + " ...\n", + " [-1.2302110e+00 5.3052688e+00 2.1982718e+00 ... 1.3990868e+00\n", + " -5.5132383e-01 4.8985812e-01]\n", + " [-1.0771493e+00 6.9571300e+00 2.7373023e+00 ... 4.9663010e+00\n", + " -9.9705428e-01 2.1829298e+00]\n", + " [-5.9534687e-01 3.0272012e+00 3.1143982e+00 ... 2.4072502e+00\n", + " -2.0490403e+00 3.3617332e+00]]\n", + "FF:[[ 1.22235850e+06 -2.63485275e+06 -5.07602656e+05 ... 3.02793250e+06\n", + " -3.64456750e+06 3.31921800e+06]\n", + " [-4.24962585e+02 1.15766296e+03 9.83978577e+02 ... 1.64807898e+03\n", + " -5.96972351e+02 6.26271790e+02]\n", + " [-2.20120354e+01 6.60979462e+01 3.99340210e+01 ... 5.71033745e+01\n", + " -1.59687757e+01 3.65366316e+00]\n", + " ...\n", + " [-1.23020661e+00 5.30526114e+00 2.19826817e+00 ... 1.39908671e+00\n", + " -5.51325083e-01 4.89858717e-01]\n", + " [-1.07714510e+00 6.95712519e+00 2.73729825e+00 ... 4.96630049e+00\n", + " -9.97055829e-01 2.18292713e+00]\n", + " [-5.95347941e-01 3.02720070e+00 3.11439991e+00 ... 2.40725493e+00\n", + " -2.04904509e+00 3.36174107e+00]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... 
True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[0 0 0 0 0 0 0]\n", + "Ok!\n", + "7.4363425925925934% mismatch in QK prods softmax out grad\n", + "Ok!\n", + "hf_attn_in: (768, 24)\n", + "[[-7.52523500e+06 -1.27625415e+03 -4.39338150e+01 ... -3.34414902e+01\n", + " 2.38160934e+01 3.15938339e+01]\n", + " [-9.55138900e+06 6.71377197e+02 2.06871887e+02 ... -3.86393509e+01\n", + " 2.14816055e+01 -6.58599396e+01]\n", + " [ 1.14522670e+07 2.19898975e+03 -6.89673233e+00 ... 9.51593590e+00\n", + " -1.68612709e+01 6.02474251e+01]\n", + " ...\n", + " [ 2.10891925e+06 3.78648706e+03 1.02701221e+03 ... 3.59794388e+01\n", + " 5.03902206e+01 4.19777756e+01]\n", + " [ 2.11695300e+06 -2.36283508e+02 -1.08002625e+02 ... 9.36443710e+00\n", + " 3.84094887e+01 -7.51948738e+00]\n", + " [ 7.39155050e+06 1.11731885e+03 3.38369843e+02 ... 3.70399475e+01\n", + " 1.77629051e+01 9.76780853e+01]]\n", + "ff_attn_in: (768, 24)\n", + "[[-7.52523600e+06 -1.27625293e+03 -4.39336700e+01 ... -3.34414597e+01\n", + " 2.38162422e+01 3.15938187e+01]\n", + " [-9.55138900e+06 6.71377319e+02 2.06871674e+02 ... -3.86393127e+01\n", + " 2.14817867e+01 -6.58600464e+01]\n", + " [ 1.14522660e+07 2.19898950e+03 -6.89660644e+00 ... 9.51594448e+00\n", + " -1.68611774e+01 6.02474518e+01]\n", + " ...\n", + " [ 2.10891850e+06 3.78648633e+03 1.02701196e+03 ... 3.59794846e+01\n", + " 5.03901253e+01 4.19777679e+01]\n", + " [ 2.11695400e+06 -2.36282440e+02 -1.08002762e+02 ... 9.36448860e+00\n", + " 3.84096107e+01 -7.51954842e+00]\n", + " [ 7.39155000e+06 1.11731921e+03 3.38370087e+02 ... 3.70398293e+01\n", + " 1.77627277e+01 9.76782227e+01]]\n", + "6.011284722222222% mismatch in attention input grads\n", + "\n", + "Huggingface checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "Huggingface-FlexFlow checks:\n", + "-- W2 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.mlp.down_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_layers_10_feed_forward_w2_shard_0_output_0\n", + "HF: [-9.4779546e+09 -1.2174155e+10 1.4899113e+10 ... 4.9057606e+01\n", + " 4.7770348e+01 5.8564331e+01]\n", + "FF:[-9.47795558e+09 -1.21741548e+10 1.48991119e+10 ... 4.90575981e+01\n", + " 4.77703362e+01 5.85643845e+01]\n", + "[ True True True ... 
True True True]\n", + "[ 88 138 187 203 232 242 493 657 750 900 1198 1249\n", + " 1287 1305 1414 1428 1490 1588 1600 1612 1625 1657 1676 1677\n", + " 1692 1694 1724 1730 1772 1822 1825 1838 1853 1910 2035 2043\n", + " 2053 2059 2073 2078 2123 2145 2214 2238 2241 2285 2292 2389\n", + " 2542 2582 2589 2599 2674 2688 2711 2840 2856 2961 2963 2980\n", + " 3064 3176 3192 3255 3262 3278 3338 3341 3412 3419 3492 3590\n", + " 3624 3646 3657 3807 3840 3842 3846 3883 3887 4005 4049 4071\n", + " 4076 4077 4079 4137 4142 4192 4193 4202 4218 4224 4273 4355\n", + " 4358 4381 4401 4435 4469 4499 4514 4546 4598 4619 4747 4846\n", + " 4872 4916 4952 4966 5016 5067 5107 5112 5116 5194 5225 5350\n", + " 5364 5403 5515 5537 5550 5578 5650 5653 5654 5736 5751 5837\n", + " 5870 5881 5972 5998 6006 6051 6061 6107 6129 6204 6236 6292\n", + " 6296 6327 6382 6393 6403 6420 6424 6436 6468 6542 6599 6675\n", + " 6681 6711 6723 6767 6823 6914 6983 7047 7064 7133 7167 7197\n", + " 7198 7209 7528 7537 7538 7686 7850 7855 7889 7910 7919 7927\n", + " 7937 7939 8089 8101 8157 8169 8175 8223 8292 8304 8306 8342\n", + " 8351 8414 8475 8500 8543 8558 8609 8656 8687 8704 8724 8726\n", + " 8777 8816 8826 8871 8904 8934 8983 9012 9033 9043 9068 9093\n", + " 9125 9133 9144 9151 9154 9217 9222 9320 9335 9367 9398 9421\n", + " 9434 9521 9547 9633 9702 9726 9763 9949 10018 10053 10062 10079\n", + " 10137 10149 10203 10261 10269 10292 10312 10332 10471 10478 10514 10596\n", + " 10645 10676 10678 10781 10795 10810 10833 10891 10904 10935 10957 10977\n", + " 10982 11028 11095 11172 11223 11251 11283 11303 11319 11374 11392 11437\n", + " 11486 11627 11678 11750 11759 11979 11996 12019 12126 12237 12262 12288\n", + " 12303 12309 12315 12387 12543 12569 12613 12648 12786 12852 12866 12879\n", + " 12947 12963 13037 13058 13261 13284 13312 13394 13399 13427 13526 13527\n", + " 13592 13695 13741 13752 13775 13803 13812 13866 13902 14049 14170 14241\n", + " 14354 14382 14426 14451 14455 14486 14502 14582 14820 14934 14961 14976\n", + " 15000 15003 15014 15077 15096 15108 15135 15148 15165 15219 15232 15290\n", + " 15339 15345 15819 15945 15994 16077 16135 16218 16231 16233 16239 16243\n", + " 16295 16311 16339 16356 16366 16417 16456 16498 16502 16503 16506 16547\n", + " 16585 16603 16611 16633 16661 16683 16704 16710 16723 16724 16745 16754\n", + " 16773 16787 16789 16818 16829 16833 16913 16933 17025 17033 17037 17055\n", + " 17084 17098 17109 17176 17225 17240 17292 17294 17339 17390 17427 17437\n", + " 17579 17626 17630 17654 17719 17902 17912 18023 18025 18124 18203 18339\n", + " 18344]\n", + "Ok!\n", + "Ok!\n", + "-- Lora --\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.mlp.down_proj.lora_B.default.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_layers_10_feed_forward_w2_lora_shard_0_output_0\n", + "HF: [-9.4779546e+09 -1.2174155e+10 1.4899113e+10 ... 4.9057606e+01\n", + " 4.7770348e+01 5.8564331e+01]\n", + "FF:[-9.47795558e+09 -1.21741548e+10 1.48991119e+10 ... 4.90575981e+01\n", + " 4.77703362e+01 5.85643845e+01]\n", + "[ True True True ... 
True True True]\n", + "[ 88 138 187 203 232 242 493 657 750]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.mlp.down_proj.lora_A.default.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_layers_10_feed_forward_w2_lora_shard_0_input_0\n", + "HF: [ 4.7819588e+07 3.8833264e+07 4.7789860e+07 ... 1.0804405e+00\n", + " 2.7186510e-01 -2.9918199e+00]\n", + "FF:[ 4.78195960e+07 3.88332640e+07 4.77898600e+07 ... 1.08044124e+00\n", + " 2.71864563e-01 -2.99182224e+00]\n", + "[ True True True ... True True True]\n", + "[ 109 211 312 422 590 832 835 1016 1053 1076 1268 1353 1374 1693\n", + " 1701 1710 1722 1832 1954 1965 1997 2076 2124 2146 2378 2520 2605 2624\n", + " 2967 3007 3015]\n", + "Ok!\n", + "-- W2/W1/W3 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_SigmoidSiluMulti_shard_0_output_0\n", + "HF: [ 3.3558659e+09 1.3409817e+10 -1.4671958e+10 ... 7.2100967e+01\n", + " 6.5979071e+00 -2.1230124e+01]\n", + "FF:[ 3.35586406e+09 1.34098166e+10 -1.46719611e+10 ... 7.21009750e+01\n", + " 6.59790993e+00 -2.12301121e+01]\n", + "[ True True True ... True True True]\n", + "[ 4 95 111 163 179 191 279 305 363 406 447 487 489 494\n", + " 517 617 703 713 735 796 805 819 826 858 882 959 964 967\n", + " 986 1020 1035 1054 1067 1070 1077 1081 1095 1097 1123 1139 1181 1238\n", + " 1296 1342 1369 1489 1550 1557 1623 1669 1752 1757 1783 1819 1876 1949\n", + " 1963 1993 2034 2047 2091 2115 2153 2170 2306 2381 2419 2431 2456 2501\n", + " 2503 2591 2653 2768 2778 2791 2970 2980 3053 3067]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_layers_10_feed_forward_w2_shard_0_input_0\n", + "HF: [ 3.3558659e+09 1.3409817e+10 -1.4671958e+10 ... 7.2100967e+01\n", + " 6.5979071e+00 -2.1230124e+01]\n", + "FF:[ 3.35586406e+09 1.34098166e+10 -1.46719611e+10 ... 7.21009750e+01\n", + " 6.59790993e+00 -2.12301121e+01]\n", + "[ True True True ... True True True]\n", + "[ 4 95 111 163 179 191 279 305 363 406 447 487 489 494\n", + " 517 617 703 713 735 796 805 819 826 858 882 959 964 967\n", + " 986 1020 1035 1054 1067 1070 1077 1081 1095 1097 1123 1139 1181 1238\n", + " 1296 1342 1369 1489 1550 1557 1623 1669 1752 1757 1783 1819 1876 1949\n", + " 1963 1993 2034 2047 2091 2115 2153 2170 2306 2381 2419 2431 2456 2501\n", + " 2503 2591 2653 2768 2778 2791 2970 2980 3053 3067]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "-- Attention --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.self_attn.o_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_layers_10_attention_shard_0_output_0\n", + "HF: [-9.4779546e+09 -1.2174155e+10 1.4899113e+10 ... 9.3464905e+01\n", + " 7.5613129e+01 7.6598846e+01]\n", + "FF:[-9.47795558e+09 -1.21741548e+10 1.48991119e+10 ... 9.34649200e+01\n", + " 7.56131058e+01 7.65989227e+01]\n", + "[ True True True ... 
True True True]\n", + "[ 88 138 187 203 232 242 493 657 750]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.self_attn.o_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_layers_10_attention_shard_0_o_proj_in_grad\n", + "HF: [-9.4470595e+09 -7.3870331e+09 1.2659395e+10 ... -2.8149616e+01\n", + " 1.7019112e+02 -7.7236428e+00]\n", + "FF:[-9.44706150e+09 -7.38703309e+09 1.26593966e+10 ... -2.81496239e+01\n", + " 1.70191177e+02 -7.72364044e+00]\n", + "[ True True True ... True True True]\n", + "[ 11 98 109 134 262 266 274 309 310 327 328 364 398 409 429 605 645]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[-9.44705946e+09 2.28078384e+01 3.18554016e+02 ... 1.17267204e+02\n", + " 2.06791725e+01 1.13138672e+02]\n", + " [-7.38703309e+09 -7.36898804e+00 7.93705673e+01 ... 2.04039650e+01\n", + " 3.18331490e+01 5.44241562e+01]\n", + " [ 1.26593946e+10 1.77534424e+02 -2.97175941e+01 ... 1.16716766e+01\n", + " 7.70214081e+01 2.81902496e+02]\n", + " ...\n", + " [ 4.51210445e+10 3.63867615e+02 -8.04915466e+01 ... -1.34332123e+02\n", + " -1.22151840e+02 -2.81496162e+01]\n", + " [-1.39591885e+10 1.59216873e+02 6.11343079e+01 ... 1.56675262e+02\n", + " 9.68551483e+01 1.70191116e+02]\n", + " [-1.29442345e+10 -2.39441833e+02 2.73647644e+02 ... -4.41197014e+01\n", + " -9.48526230e+01 -7.72364283e+00]]\n", + "FF:[[-9.44706150e+09 2.28079376e+01 3.18553864e+02 ... 1.17267227e+02\n", + " 2.06791859e+01 1.13138741e+02]\n", + " [-7.38703309e+09 -7.36921692e+00 7.93703690e+01 ... 2.04038925e+01\n", + " 3.18332825e+01 5.44241333e+01]\n", + " [ 1.26593966e+10 1.77534454e+02 -2.97174206e+01 ... 1.16717224e+01\n", + " 7.70213699e+01 2.81902618e+02]\n", + " ...\n", + " [ 4.51210527e+10 3.63867554e+02 -8.04915695e+01 ... -1.34332092e+02\n", + " -1.22151901e+02 -2.81496239e+01]\n", + " [-1.39591834e+10 1.59216995e+02 6.11343040e+01 ... 1.56675293e+02\n", + " 9.68551559e+01 1.70191177e+02]\n", + " [-1.29442304e+10 -2.39441772e+02 2.73647644e+02 ... -4.41196594e+01\n", + " -9.48526916e+01 -7.72364044e+00]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[ 11 98 109 134 262 266 274 309 310 327 328 364 398 409 429 605 645]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[-9.44705946e+09 -7.38703309e+09 1.26593946e+10 ... 4.51210445e+10\n", + " -1.39591885e+10 -1.29442345e+10]\n", + " [ 1.14852783e+03 4.39543152e+02 1.07877356e+03 ... -2.42416113e+03\n", + " 2.64504834e+03 4.68633453e+02]\n", + " [ 5.72417107e+01 4.12602806e+01 -2.27319489e+01 ... -3.40788422e+01\n", + " 4.86237946e+01 1.25752163e+01]\n", + " ...\n", + " [ 6.76848269e+00 8.23165894e+00 2.10253639e+01 ... -3.19590777e-01\n", + " 3.68098617e-01 -1.95310101e-01]\n", + " [ 4.08574820e+00 5.33035660e+00 1.41003275e+01 ... -1.35607815e+00\n", + " 4.06074905e+00 -7.67630756e-01]\n", + " [ 2.03186665e+01 9.77407932e+00 5.06271019e+01 ... -6.80029154e-01\n", + " 4.11142111e+00 -1.86585218e-01]]\n", + "FF:[[-9.44706150e+09 -7.38703309e+09 1.26593966e+10 ... 4.51210527e+10\n", + " -1.39591834e+10 -1.29442304e+10]\n", + " [ 1.14852808e+03 4.39542755e+02 1.07877344e+03 ... 
-2.42416138e+03\n", + " 2.64504932e+03 4.68633698e+02]\n", + " [ 5.72415771e+01 4.12602005e+01 -2.27318707e+01 ... -3.40787392e+01\n", + " 4.86236725e+01 1.25752039e+01]\n", + " ...\n", + " [ 6.76847696e+00 8.23167515e+00 2.10253181e+01 ... -3.19590837e-01\n", + " 3.68098557e-01 -1.95310280e-01]\n", + " [ 4.08574867e+00 5.33037567e+00 1.41003180e+01 ... -1.35607564e+00\n", + " 4.06074095e+00 -7.67629445e-01]\n", + " [ 2.03186874e+01 9.77407932e+00 5.06271439e+01 ... -6.80029511e-01\n", + " 4.11142349e+00 -1.86585203e-01]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", + "Ok!\n", + "6.640625% mismatch in QK prods softmax out grad\n", + "Ok!\n", + "hf_attn_in: (768, 24)\n", + "[[-5.1505955e+10 -4.7166772e+03 -1.3288132e+02 ... -3.0123844e+00\n", + " -5.5234032e+01 6.0299168e+00]\n", + " [-3.5960029e+10 -5.3263096e+03 -1.9434322e+02 ... -5.6601189e+01\n", + " -1.0787462e+02 -6.0718418e+01]\n", + " [ 4.8131662e+10 1.1578307e+04 1.7744476e+02 ... -5.6970375e+01\n", + " -1.7497168e+01 -7.2297249e+00]\n", + " ...\n", + " [-9.0346426e+08 6.4752144e+03 3.2408417e+02 ... 6.1075470e+01\n", + " 8.5356834e+01 8.3221588e+01]\n", + " [-5.0754217e+09 -2.2929268e+03 -1.4913528e+02 ... 8.6639397e+01\n", + " 1.1156468e+02 1.0695674e+02]\n", + " [ 5.5844772e+09 3.0225920e+03 -6.3137859e+01 ... -6.5270996e+01\n", + " 8.2730171e+01 -1.0107367e+02]]\n", + "ff_attn_in: (768, 24)\n", + "[[-5.15059548e+10 -4.71667773e+03 -1.32881012e+02 ... -3.01225996e+00\n", + " -5.52339973e+01 6.02991867e+00]\n", + " [-3.59600292e+10 -5.32630957e+03 -1.94343079e+02 ... -5.66010437e+01\n", + " -1.07874649e+02 -6.07182846e+01]\n", + " [ 4.81316659e+10 1.15783076e+04 1.77444519e+02 ... -5.69703102e+01\n", + " -1.74972763e+01 -7.22990799e+00]\n", + " ...\n", + " [-9.03455232e+08 6.47521484e+03 3.24083832e+02 ... 6.10753632e+01\n", + " 8.53567886e+01 8.32217255e+01]\n", + " [-5.07543654e+09 -2.29292749e+03 -1.49135025e+02 ... 8.66392517e+01\n", + " 1.11564789e+02 1.06956917e+02]\n", + " [ 5.58446592e+09 3.02259229e+03 -6.31376152e+01 ... -6.52709351e+01\n", + " 8.27302551e+01 -1.01073837e+02]]\n", + "7.025824652777778% mismatch in attention input grads\n", + "\n", + "Huggingface checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "Huggingface-FlexFlow checks:\n", + "-- W2 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.mlp.down_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_layers_9_feed_forward_w2_shard_0_output_0\n", + "HF: [-6.33203254e+13 -4.43651289e+13 6.35509366e+13 ... 1.08435585e+02\n", + " 9.42303467e+01 5.89958420e+01]\n", + "FF:[-6.33203296e+13 -4.43651289e+13 6.35509408e+13 ... 1.08435623e+02\n", + " 9.42303467e+01 5.89958954e+01]\n", + "[ True True True ... 
True True True]\n", + "[ 26 51 66 85 259 262 272 296 298 329 392 415\n", + " 428 482 492 514 526 531 671 731 763 777 893 927\n", + " 984 1105 1184 1206 1418 1541 1548 1572 1577 1613 1619 1643\n", + " 1658 1661 1691 1701 1706 1726 1757 1784 1815 1833 1849 1856\n", + " 1880 1891 1921 1956 1969 2012 2021 2028 2030 2059 2065 2144\n", + " 2149 2183 2210 2238 2292 2342 2357 2384 2414 2495 2531 2565\n", + " 2597 2662 2713 2781 2821 2829 2877 2904 2921 2927 2962 2973\n", + " 3044 3066 3094 3100 3106 3159 3193 3251 3377 3389 3397 3427\n", + " 3436 3570 3594 3703 3729 3770 3772 3780 3811 3840 3842 3860\n", + " 3907 3920 3929 3946 3955 3969 4005 4009 4034 4048 4077 4089\n", + " 4104 4129 4134 4178 4202 4212 4219 4239 4245 4256 4273 4373\n", + " 4407 4463 4464 4465 4481 4511 4537 4541 4543 4549 4597 4599\n", + " 4633 4759 4760 4789 4846 4884 4901 4930 4954 4971 4993 5024\n", + " 5030 5041 5050 5116 5130 5163 5207 5224 5282 5313 5322 5349\n", + " 5363 5403 5410 5412 5454 5543 5581 5590 5654 5673 5784 5821\n", + " 5849 5880 5911 5917 5982 6000 6062 6165 6178 6193 6200 6272\n", + " 6322 6351 6366 6376 6380 6382 6393 6412 6420 6430 6433 6446\n", + " 6476 6482 6488 6490 6519 6527 6540 6556 6563 6567 6577 6600\n", + " 6619 6680 6709 6735 6768 6777 6780 6823 6825 6826 6830 6863\n", + " 6880 6912 6988 7006 7030 7071 7077 7102 7123 7244 7264 7367\n", + " 7389 7390 7434 7451 7452 7455 7505 7532 7539 7589 7598 7620\n", + " 7651 7653 7659 7709 7714 7740 7751 7759 7803 7808 7820 7917\n", + " 7923 7926 7949 7962 7966 7978 8002 8004 8040 8050 8052 8068\n", + " 8180 8223 8250 8253 8265 8341 8344 8375 8376 8386 8449 8468\n", + " 8501 8509 8522 8535 8585 8590 8593 8642 8657 8674 8687 8707\n", + " 8714 8726 8729 8737 8756 8769 8801 8846 8850 8865 8907 8998\n", + " 9018 9043 9059 9066 9083 9093 9098 9130 9131 9165 9189 9216\n", + " 9285 9337 9368 9526 9539 9563 9620 9659 9723 9793 9804 9817\n", + " 9820 9827 9908 9995 10053 10128 10135 10143 10205 10253 10274 10292\n", + " 10300 10311 10327 10356 10406 10441 10491 10494 10551 10562 10563 10634\n", + " 10649 10674 10710 10734 10821 10831 10833 10838 10845 10911 10966 10981\n", + " 10988 10990 10998 11008 11044 11049 11100 11127 11141 11197 11250 11269\n", + " 11285 11308 11361 11383 11437 11460 11494 11502 11511 11522 11546 11557\n", + " 11564 11588 11649 11658 11671 11674 11703 11729 11749 11759 11832 11892\n", + " 11979 11988 12000 12038 12063 12078 12107 12119 12165 12259 12269 12270\n", + " 12347 12369 12386 12415 12475 12518 12566 12569 12574 12652 12693 12792\n", + " 12833 12834 12852 12872 12900 12946 13117 13121 13124 13321 13345 13357\n", + " 13427 13431 13446 13473 13526 13635 13638 13662 13706 13733 13803 13807\n", + " 13852 13882 13912 13924 13962 13969 13986 14023 14036 14046 14085 14110\n", + " 14130 14141 14175 14183 14191 14220 14222 14223 14285 14310 14331 14336\n", + " 14354 14375 14425 14427 14451 14482 14493 14516 14560 14563 14581 14623\n", + " 14671 14677 14679 14680 14685 14688 14742 14799 14860 14868 14870 14872\n", + " 14900 14909 14916 14940 14964 14991 15003 15023 15027 15033 15038 15051\n", + " 15086 15100 15184 15214 15232 15290 15352 15363 15365 15407 15433 15451\n", + " 15522 15577 15707 15720 15725 15739 15830 15837 15875 15937 15965 15985\n", + " 16017 16054 16113 16136 16142 16169 16191 16232 16238 16250 16268 16282\n", + " 16285 16290 16295 16304 16327 16334 16353 16356 16363 16382 16403 16407\n", + " 16408 16409 16458 16459 16495 16497 16499 16500 16516 16532 16595 16603\n", + " 16611 16657 16678 16680 16695 16701 16704 16754 16768 
16807 16818 16856\n", + " 16870 16951 16971 16986 16989 16992 17048 17134 17181 17208 17217 17236\n", + " 17243 17319 17363 17398 17448 17471 17497 17557 17646 17654 17659 17692\n", + " 17754 17947 17957 17969 17975 18029 18128 18146 18196 18206 18207 18250\n", + " 18265 18313 18406]\n", + "Ok!\n", + "Ok!\n", + "-- Lora --\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.mlp.down_proj.lora_B.default.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_layers_9_feed_forward_w2_lora_shard_0_output_0\n", + "HF: [-6.33203254e+13 -4.43651289e+13 6.35509366e+13 ... 1.08435585e+02\n", + " 9.42303467e+01 5.89958420e+01]\n", + "FF:[-6.33203296e+13 -4.43651289e+13 6.35509408e+13 ... 1.08435623e+02\n", + " 9.42303467e+01 5.89958954e+01]\n", + "[ True True True ... True True True]\n", + "[ 26 51 66 85 259 262 272 296 298 329 392 415 428 482 492 514 526 531\n", + " 671 731 763]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.mlp.down_proj.lora_A.default.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_layers_9_feed_forward_w2_lora_shard_0_input_0\n", + "HF: [ 5.0590863e+10 3.7823513e+11 -5.0394451e+11 ... -5.5814421e-01\n", + " 2.2970559e-01 -1.2293311e+00]\n", + "FF:[ 5.05906831e+10 3.78235290e+11 -5.03944544e+11 ... -5.58144033e-01\n", + " 2.29705781e-01 -1.22933090e+00]\n", + "[ True True True ... True True True]\n", + "[ 189 254 317 418 515 546 577 634 636 675 712 808 1011 1030\n", + " 1080 1091 1132 1168 1254 1265 1285 1287 1354 1381 1427 1459 1506 1620\n", + " 1654 1752 1887 1897 1900 1937 1981 1985 1986 2003 2029 2152 2181 2295\n", + " 2395 2426 2445 2673 2687 2859 2947 2977 3037]\n", + "Ok!\n", + "-- W2/W1/W3 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_SigmoidSiluMulti_shard_0_output_0\n", + "HF: [ 2.5211001e+13 -5.6630301e+13 -2.3639437e+13 ... -4.6000423e+01\n", + " 1.2655228e+01 7.1020460e+00]\n", + "FF:[ 2.52109673e+13 -5.66302930e+13 -2.36394182e+13 ... -4.60003510e+01\n", + " 1.26551876e+01 7.10206795e+00]\n", + "[ True True True ... True True True]\n", + "[ 9 49 113 174 243 267 271 288 323 335 397 399 438 439\n", + " 457 475 506 568 569 652 680 689 715 735 739 758 766 777\n", + " 785 837 842 852 865 884 893 919 930 932 936 939 957 1018\n", + " 1095 1105 1112 1114 1129 1168 1217 1220 1229 1230 1233 1237 1283 1304\n", + " 1354 1453 1532 1542 1547 1550 1592 1597 1603 1615 1647 1679 1698 1699\n", + " 1712 1770 1819 1835 1875 1977 2007 2016 2039 2066 2078 2102 2153 2245\n", + " 2403 2447 2621 2698 2704 2728 2736 2743 2774 2792 2836 2858 2870 2881\n", + " 2932 2948 3018 3034 3066]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_layers_9_feed_forward_w2_shard_0_input_0\n", + "HF: [ 2.5211001e+13 -5.6630301e+13 -2.3639437e+13 ... -4.6000423e+01\n", + " 1.2655228e+01 7.1020460e+00]\n", + "FF:[ 2.52109673e+13 -5.66302930e+13 -2.36394182e+13 ... -4.60003510e+01\n", + " 1.26551876e+01 7.10206795e+00]\n", + "[ True True True ... 
True True True]\n", + "[ 9 49 113 174 243 267 271 288 323 335 397 399 438 439\n", + " 457 475 506 568 569 652 680 689 715 735 739 758 766 777\n", + " 785 837 842 852 865 884 893 919 930 932 936 939 957 1018\n", + " 1095 1105 1112 1114 1129 1168 1217 1220 1229 1230 1233 1237 1283 1304\n", + " 1354 1453 1532 1542 1547 1550 1592 1597 1603 1615 1647 1679 1698 1699\n", + " 1712 1770 1819 1835 1875 1977 2007 2016 2039 2066 2078 2102 2153 2245\n", + " 2403 2447 2621 2698 2704 2728 2736 2743 2774 2792 2836 2858 2870 2881\n", + " 2932 2948 3018 3034 3066]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "-- Attention --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.self_attn.o_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_layers_9_attention_shard_0_output_0\n", + "HF: [-6.3320325e+13 -4.4365129e+13 6.3550937e+13 ... 7.2449814e+01\n", + " 8.6617142e+01 8.3981407e+01]\n", + "FF:[-6.33203296e+13 -4.43651289e+13 6.35509408e+13 ... 7.24498901e+01\n", + " 8.66170959e+01 8.39814606e+01]\n", + "[ True True True ... True True True]\n", + "[ 26 51 66 85 259 262 272 296 298 329 392 415 428 482 492 514 526 531\n", + " 671 731 763]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.self_attn.o_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_layers_9_attention_shard_0_o_proj_in_grad\n", + "HF: [ 7.2885461e+13 -6.0835821e+13 -7.9732612e+13 ... 2.5297220e+02\n", + " -8.1722275e+01 -7.0014725e+01]\n", + "FF:[ 7.28854608e+13 -6.08357832e+13 -7.97326201e+13 ... 2.52972260e+02\n", + " -8.17222137e+01 -7.00146637e+01]\n", + "[ True True True ... True True True]\n", + "[ 6 36 43 55 60 82 101 110 117 217 221 229 236 256 289 392 421 429\n", + " 433 454 486 518 523 565 568 629 639 648 707 725 744]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[ 7.28854608e+13 6.37500977e+02 2.96775421e+02 ... 8.35403061e+01\n", + " 1.72460327e+02 2.90482426e+01]\n", + " [-6.08358210e+13 -5.23222847e+01 -2.34542664e+02 ... -1.87500763e+01\n", + " -8.99429398e+01 8.64021378e+01]\n", + " [-7.97326117e+13 -4.24736328e+02 -1.82208099e+02 ... 3.21808720e+00\n", + " -5.87415466e+01 -2.08511108e+02]\n", + " ...\n", + " [-1.13411917e+14 -3.48418640e+02 1.52205795e+02 ... 1.51519928e+02\n", + " 2.45651031e+02 2.52972198e+02]\n", + " [-3.75985275e+12 2.39696625e+02 1.51989685e+02 ... -2.85605354e+01\n", + " -1.79121232e+00 -8.17222748e+01]\n", + " [ 1.11016038e+14 -1.96372967e+01 -1.27668396e+02 ... 3.35008011e+01\n", + " -7.46116943e+01 -7.00147247e+01]]\n", + "FF:[[ 7.28854608e+13 6.37500977e+02 2.96775513e+02 ... 8.35403976e+01\n", + " 1.72460068e+02 2.90483646e+01]\n", + " [-6.08357832e+13 -5.23225098e+01 -2.34542755e+02 ... -1.87501526e+01\n", + " -8.99431992e+01 8.64022217e+01]\n", + " [-7.97326201e+13 -4.24736572e+02 -1.82207733e+02 ... 3.21793270e+00\n", + " -5.87416573e+01 -2.08511139e+02]\n", + " ...\n", + " [-1.13411925e+14 -3.48418640e+02 1.52205902e+02 ... 1.51519714e+02\n", + " 2.45650864e+02 2.52972260e+02]\n", + " [-3.75988630e+12 2.39696686e+02 1.51989319e+02 ... -2.85606136e+01\n", + " -1.79138493e+00 -8.17222137e+01]\n", + " [ 1.11016046e+14 -1.96372318e+01 -1.27668480e+02 ... 3.35009079e+01\n", + " -7.46116791e+01 -7.00146637e+01]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... 
True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[ 6 36 43 55 60 82 101 110 117 217 221 229 236 256 289 392 421 429\n", + " 433 454 486 518 523 565 568 629 639 648 707 725 744]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[ 7.2885461e+13 -6.0835821e+13 -7.9732612e+13 ... -1.1341192e+14\n", + " -3.7598527e+12 1.1101604e+14]\n", + " [ 3.3241980e+03 -6.3044128e+02 -3.0447307e+03 ... 3.0137921e+02\n", + " 3.8262988e+02 -4.2889914e+02]\n", + " [ 3.5639046e+01 -1.6155790e+01 -2.4461178e+01 ... 2.7450909e+02\n", + " 1.6181946e+02 -2.5407137e+02]\n", + " ...\n", + " [ 4.6487908e+00 -9.6633381e-01 -2.7078497e-01 ... 3.6374569e+01\n", + " -1.7563061e+00 -7.1206141e+00]\n", + " [ 1.8901447e+00 8.9006472e-01 -4.3125896e+00 ... 2.6014965e+01\n", + " -3.7720141e-01 -7.8855257e+00]\n", + " [ 1.9513500e+00 5.8041654e+00 -1.4006979e+01 ... 7.2743622e+01\n", + " -2.3499712e+01 -2.0133139e+01]]\n", + "FF:[[ 7.28854608e+13 -6.08357832e+13 -7.97326201e+13 ... -1.13411925e+14\n", + " -3.75988630e+12 1.11016046e+14]\n", + " [ 3.32419922e+03 -6.30442505e+02 -3.04472998e+03 ... 3.01379364e+02\n", + " 3.82629669e+02 -4.28898712e+02]\n", + " [ 3.56390572e+01 -1.61558037e+01 -2.44611683e+01 ... 2.74509308e+02\n", + " 1.61819229e+02 -2.54071594e+02]\n", + " ...\n", + " [ 4.64879847e+00 -9.66338813e-01 -2.70792574e-01 ... 3.63745117e+01\n", + " -1.75632846e+00 -7.12060070e+00]\n", + " [ 1.89013767e+00 8.90062451e-01 -4.31257772e+00 ... 2.60149212e+01\n", + " -3.77217919e-01 -7.88551569e+00]\n", + " [ 1.95135939e+00 5.80417490e+00 -1.40069904e+01 ... 7.27435226e+01\n", + " -2.34996586e+01 -2.01330910e+01]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", + "Ok!\n", + "7.609953703703703% mismatch in QK prods softmax out grad\n", + "Ok!\n", + "hf_attn_in: (768, 24)\n", + "[[-1.17282076e+14 -2.12461621e+03 8.80099030e+01 ... 4.34470520e+01\n", + " 7.55885468e+01 -2.88791332e+01]\n", + " [-2.07757936e+14 -3.81796265e+02 -2.33774780e+02 ... 8.11984329e+01\n", + " -4.41825638e+01 7.35064125e+00]\n", + " [ 4.11484165e+13 2.50572113e+02 1.91601822e+02 ... 1.00269365e+01\n", + " -3.41638985e+01 1.20433075e+02]\n", + " ...\n", + " [ 7.95562329e+13 1.55007373e+03 1.70351212e+02 ... -1.80320053e+01\n", + " 8.77533417e+01 2.14678173e+01]\n", + " [-1.86546485e+14 -5.18847070e+03 -3.34331085e+02 ... 2.51586838e+01\n", + " -4.06135368e+01 -6.27860641e+00]\n", + " [ 1.89751705e+14 -3.09853809e+03 -1.18278351e+01 ... -1.24640663e+02\n", + " 1.59719009e+01 -6.47173615e+01]]\n", + "ff_attn_in: (768, 24)\n", + "[[-1.17282034e+14 -2.12461694e+03 8.80101547e+01 ... 4.34468918e+01\n", + " 7.55886002e+01 -2.88791542e+01]\n", + " [-2.07757920e+14 -3.81795776e+02 -2.33774765e+02 ... 8.11985397e+01\n", + " -4.41825829e+01 7.35066986e+00]\n", + " [ 4.11484543e+13 2.50570099e+02 1.91601196e+02 ... 1.00270777e+01\n", + " -3.41638451e+01 1.20433121e+02]\n", + " ...\n", + " [ 7.95562413e+13 1.55007288e+03 1.70350784e+02 ... -1.80321960e+01\n", + " 8.77533112e+01 2.14678249e+01]\n", + " [-1.86546469e+14 -5.18847070e+03 -3.34331268e+02 ... 
2.51588135e+01\n", + " -4.06132622e+01 -6.27861023e+00]\n", + " [ 1.89751521e+14 -3.09853711e+03 -1.18275299e+01 ... -1.24640862e+02\n", + " 1.59719791e+01 -6.47173767e+01]]\n", + "7.530381944444445% mismatch in attention input grads\n", + "\n", + "Huggingface checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "Huggingface-FlexFlow checks:\n", + "-- W2 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.mlp.down_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_layers_8_feed_forward_w2_shard_0_output_0\n", + "HF: [-1.3223293e+17 -2.3794983e+17 4.7027590e+16 ... 7.7873253e+01\n", + " 8.6085976e+01 6.8200005e+01]\n", + "FF:[-1.32232886e+17 -2.37949812e+17 4.70276284e+16 ... 7.78733292e+01\n", + " 8.60859299e+01 6.82000580e+01]\n", + "[ True True True ... True True True]\n", + "[ 3 24 66 71 94 95 124 134 141 150 163 181\n", + " 226 261 284 318 320 378 382 385 391 395 403 422\n", + " 434 495 515 523 524 549 579 610 644 710 764 772\n", + " 870 984 987 1045 1249 1330 1362 1489 1517 1550 1556 1588\n", + " 1595 1659 1672 1684 1689 1768 1792 1799 1808 1818 1842 1871\n", + " 1889 1899 1910 1915 1925 1936 1993 1997 2033 2041 2059 2062\n", + " 2066 2098 2111 2124 2129 2130 2146 2153 2159 2166 2197 2206\n", + " 2210 2212 2222 2234 2237 2320 2321 2357 2359 2362 2385 2428\n", + " 2518 2539 2553 2568 2598 2683 2689 2694 2711 2714 2733 2787\n", + " 2788 2795 2811 2815 2853 2881 2890 2917 2981 2997 3021 3037\n", + " 3089 3149 3163 3191 3196 3217 3225 3248 3277 3287 3292 3305\n", + " 3327 3361 3385 3402 3417 3425 3456 3479 3516 3521 3528 3555\n", + " 3587 3599 3608 3684 3702 3733 3770 3779 3819 3822 3823 3898\n", + " 3921 3942 3950 4012 4053 4077 4086 4091 4139 4185 4198 4225\n", + " 4241 4296 4347 4349 4368 4403 4407 4418 4453 4471 4472 4473\n", + " 4494 4537 4549 4555 4558 4598 4623 4648 4666 4698 4729 4782\n", + " 4848 4866 4886 4943 4959 5008 5010 5012 5057 5079 5177 5178\n", + " 5186 5211 5271 5281 5296 5313 5328 5356 5364 5409 5429 5440\n", + " 5453 5455 5457 5476 5529 5563 5591 5621 5625 5631 5654 5661\n", + " 5692 5705 5720 5740 5751 5758 5787 5799 5813 5835 5836 5867\n", + " 5872 5893 5953 5974 5980 5982 6000 6055 6082 6086 6102 6107\n", + " 6123 6159 6172 6193 6220 6230 6231 6263 6286 6297 6362 6396\n", + " 6401 6430 6436 6485 6497 6499 6502 6510 6537 6554 6555 6563\n", + " 6564 6579 6586 6598 6615 6625 6626 6649 6651 6661 6754 6764\n", + " 6776 6852 6863 6874 6883 6892 6913 6945 6969 7036 7057 7066\n", + " 7082 7138 7147 7150 7157 7197 7202 7231 7234 7235 7240 7270\n", + " 7278 7287 7322 7327 7345 7348 7361 7390 7402 7490 7539 7573\n", + " 7610 7714 7721 7758 7794 7812 7827 7829 7837 7839 7882 7894\n", + " 7943 7948 7952 7969 7975 7996 8024 8027 8037 8043 8055 8078\n", + " 8079 8088 8090 8095 8154 8258 8264 8283 8297 8313 8329 8336\n", + " 8359 8361 8376 8383 8416 8421 8428 8454 8475 8502 8521 8613\n", + " 8642 8653 8696 8756 8764 8777 8791 8837 8849 8859 8878 8955\n", + " 8991 8997 9006 9012 9040 9066 9093 9097 9098 9131 9158 9162\n", + " 9165 9214 9216 9280 9297 9301 9316 9355 9371 9412 9421 9475\n", + " 9510 9580 9620 9645 9696 9713 9732 9768 9802 9817 9819 9826\n", + " 9839 9846 9947 10004 10062 10065 10072 10103 10107 10108 10138 10167\n", + " 10173 10228 10262 10292 10326 10356 10360 10372 10421 10446 10466 10468\n", + " 10499 10505 10513 10517 10589 10606 
10612 10645 10664 10669 10726 10777\n", + " 10835 10838 10839 10848 10855 10877 10897 10941 10963 10971 10977 10997\n", + " 11030 11060 11065 11076 11088 11140 11167 11174 11231 11252 11257 11259\n", + " 11275 11297 11302 11319 11331 11333 11357 11358 11380 11382 11402 11423\n", + " 11446 11447 11500 11501 11522 11585 11623 11670 11728 11736 11759 11761\n", + " 11772 11785 11839 11894 11916 11924 11936 11962 11968 11969 11977 11984\n", + " 12008 12030 12054 12074 12123 12175 12182 12194 12237 12262 12282 12285\n", + " 12341 12348 12351 12370 12376 12386 12399 12449 12507 12513 12518 12522\n", + " 12549 12572 12643 12648 12663 12689 12696 12710 12769 12780 12788 12792\n", + " 12793 12852 12864 12879 12884 12985 13018 13041 13057 13176 13264 13272\n", + " 13274 13275 13292 13303 13333 13379 13427 13428 13442 13451 13454 13500\n", + " 13510 13533 13564 13588 13607 13640 13655 13686 13687 13688 13732 13747\n", + " 13786 13801 13803 13826 13841 13846 13850 13892 13909 13946 14036 14040\n", + " 14046 14060 14080 14152 14161 14183 14195 14210 14240 14278 14331 14354\n", + " 14370 14372 14386 14395 14409 14432 14434 14497 14506 14531 14559 14589\n", + " 14648 14663 14686 14698 14715 14743 14757 14799 14808 14810 14849 14893\n", + " 14902 14929 14937 14947 14953 14958 15005 15012 15018 15036 15066 15069\n", + " 15083 15152 15154 15196 15197 15212 15292 15309 15323 15340 15343 15375\n", + " 15389 15396 15408 15410 15454 15499 15532 15557 15605 15647 15677 15736\n", + " 15745 15756 15769 15809 15824 15876 15882 15900 15906 15941 16027 16030\n", + " 16040 16116 16190 16192 16205 16207 16239 16279 16285 16295 16348 16358\n", + " 16367 16384 16386 16394 16399 16455 16457 16458 16471 16495 16500 16502\n", + " 16520 16541 16542 16598 16623 16643 16651 16665 16673 16679 16713 16725\n", + " 16734 16736 16739 16751 16756 16768 16861 16870 16939 16976 17007 17028\n", + " 17040 17069 17087 17108 17125 17139 17151 17158 17174 17175 17178 17182\n", + " 17189 17221 17258 17341 17360 17370 17381 17395 17396 17415 17432 17450\n", + " 17463 17470 17472 17473 17496 17507 17536 17608 17626 17627 17649 17653\n", + " 17664 17771 17815 17822 17831 17864 17883 17931 17994 17999 18035 18174\n", + " 18209 18250 18274 18307 18327 18403 18423]\n", + "Ok!\n", + "Ok!\n", + "-- Lora --\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.mlp.down_proj.lora_B.default.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_layers_8_feed_forward_w2_lora_shard_0_output_0\n", + "HF: [-1.3223293e+17 -2.3794983e+17 4.7027590e+16 ... 7.7873253e+01\n", + " 8.6085976e+01 6.8200005e+01]\n", + "FF:[-1.32232886e+17 -2.37949812e+17 4.70276284e+16 ... 7.78733292e+01\n", + " 8.60859299e+01 6.82000580e+01]\n", + "[ True True True ... True True True]\n", + "[ 3 24 66 71 94 95 124 134 141 150 163 181 226 261 284 318 320 378\n", + " 382 385 391 395 403 422 434 495 515 523 524 549 579 610 644 710 764]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.mlp.down_proj.lora_A.default.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_layers_8_feed_forward_w2_lora_shard_0_input_0\n", + "HF: [ 6.5550952e+14 4.9376585e+14 3.8510841e+14 ... 1.6802770e+00\n", + " -1.1248941e+00 -1.1701980e+00]\n", + "FF:[ 6.55509317e+14 4.93765882e+14 3.85108377e+14 ... 1.68027747e+00\n", + " -1.12489426e+00 -1.17019880e+00]\n", + "[ True True True ... 
True True True]\n", + "[ 6 79 111 149 155 168 187 195 220 223 252 261 329 343\n", + " 347 369 386 392 403 438 439 450 461 524 535 643 656 659\n", + " 661 668 722 727 732 742 754 801 816 820 835 837 849 850\n", + " 978 993 997 1012 1019 1034 1044 1071 1088 1094 1114 1135 1151 1170\n", + " 1190 1212 1273 1275 1277 1289 1290 1308 1311 1337 1364 1379 1394 1430\n", + " 1454 1460 1469 1474 1703 1725 1728 1732 1733 1741 1754 1757 1804 1806\n", + " 1856 1862 1932 1945 1996 2030 2044 2045 2065 2071 2075 2094 2149 2152\n", + " 2163 2180 2182 2215 2254 2357 2362 2370 2392 2398 2428 2484 2519 2521\n", + " 2524 2582 2618 2641 2645 2664 2674 2681 2691 2735 2747 2779 2872 2899\n", + " 2909 2935 2957 3000 3033]\n", + "Ok!\n", + "-- W2/W1/W3 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_SigmoidSiluMulti_shard_0_output_0\n", + "HF: [-1.3871785e+17 -8.3164397e+16 4.9509505e+16 ... 4.3806694e+01\n", + " 9.4386072e+00 -2.4460859e+01]\n", + "FF:[-1.38717840e+17 -8.31644654e+16 4.95094495e+16 ... 4.38065948e+01\n", + " 9.43864822e+00 -2.44608364e+01]\n", + "[ True True True ... True True True]\n", + "[ 80 83 172 173 176 184 215 285 329 338 341 395 403 465\n", + " 468 565 572 601 614 636 639 651 660 749 750 806 828 844\n", + " 873 952 971 988 992 1014 1082 1083 1085 1123 1152 1195 1200 1227\n", + " 1391 1397 1462 1546 1548 1563 1584 1629 1704 1706 1759 1764 1820 1833\n", + " 1851 1857 1864 1899 1929 1943 1958 1967 1980 1985 2002 2030 2069 2076\n", + " 2120 2127 2130 2157 2180 2187 2195 2212 2243 2249 2256 2299 2393 2505\n", + " 2516 2525 2546 2562 2604 2702 2712 2731 2745 2764 2789 2821 2873 2915\n", + " 2936 2945 2951 3013 3016]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_layers_8_feed_forward_w2_shard_0_input_0\n", + "HF: [-1.3871785e+17 -8.3164397e+16 4.9509505e+16 ... 4.3806694e+01\n", + " 9.4386072e+00 -2.4460859e+01]\n", + "FF:[-1.38717840e+17 -8.31644654e+16 4.95094495e+16 ... 4.38065948e+01\n", + " 9.43864822e+00 -2.44608364e+01]\n", + "[ True True True ... True True True]\n", + "[ 80 83 172 173 176 184 215 285 329 338 341 395 403 465\n", + " 468 565 572 601 614 636 639 651 660 749 750 806 828 844\n", + " 873 952 971 988 992 1014 1082 1083 1085 1123 1152 1195 1200 1227\n", + " 1391 1397 1462 1546 1548 1563 1584 1629 1704 1706 1759 1764 1820 1833\n", + " 1851 1857 1864 1899 1929 1943 1958 1967 1980 1985 2002 2030 2069 2076\n", + " 2120 2127 2130 2157 2180 2187 2195 2212 2243 2249 2256 2299 2393 2505\n", + " 2516 2525 2546 2562 2604 2702 2712 2731 2745 2764 2789 2821 2873 2915\n", + " 2936 2945 2951 3013 3016]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "-- Attention --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.self_attn.o_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_layers_8_attention_shard_0_output_0\n", + "HF: [-1.3223293e+17 -2.3794983e+17 4.7027590e+16 ... 3.5121140e+01\n", + " -3.5587997e+00 9.5641022e+01]\n", + "FF:[-1.32232886e+17 -2.37949812e+17 4.70276284e+16 ... 3.51211472e+01\n", + " -3.55898285e+00 9.56410980e+01]\n", + "[ True True True ... 
True True True]\n", + "[ 3 24 66 71 94 95 124 134 141 150 163 181 226 261 284 318 320 378\n", + " 382 385 391 395 403 422 434 495 515 523 524 549 579 610 644 710 764]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.self_attn.o_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_layers_8_attention_shard_0_o_proj_in_grad\n", + "HF: [-1.6186993e+17 -3.5698813e+17 3.4442975e+16 ... -2.5844165e+02\n", + " 2.0677340e+01 -2.4573349e+01]\n", + "FF:[-1.61869621e+17 -3.56988336e+17 3.44430865e+16 ... -2.58441467e+02\n", + " 2.06775093e+01 -2.45735531e+01]\n", + "[ True True True ... True True True]\n", + "[ 93 99 114 137 141 142 160 193 235 259 269 299 307 316 350 364 400 523\n", + " 608 702 720 731 759]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[-1.6186993e+17 -2.1968115e+02 8.5754425e+01 ... -6.9909119e+01\n", + " -2.6478451e+01 -7.4195160e+01]\n", + " [-3.5698813e+17 3.9582391e+02 5.5431940e+02 ... 1.9529277e+02\n", + " 1.2558211e+02 6.7965935e+01]\n", + " [ 3.4442975e+16 2.8310864e+02 -8.1522171e+01 ... -2.3606525e+01\n", + " -2.0410315e+01 -1.5228156e+02]\n", + " ...\n", + " [ 4.0923264e+16 -2.4507169e+02 -8.2614380e+02 ... -2.6583340e+02\n", + " -1.9878247e+02 -2.5844165e+02]\n", + " [ 6.9156258e+17 1.3969666e+02 -7.5639044e+02 ... -1.5231053e+02\n", + " -3.3650037e+02 2.0677340e+01]\n", + " [ 9.9511712e+16 -3.2348724e+01 3.0624988e+02 ... 1.0391423e+02\n", + " 6.0626881e+01 -2.4573349e+01]]\n", + "FF:[[-1.61869621e+17 -2.19681122e+02 8.57541504e+01 ... -6.99092026e+01\n", + " -2.64783611e+01 -7.41952515e+01]\n", + " [-3.56988336e+17 3.95823853e+02 5.54319275e+02 ... 1.95292725e+02\n", + " 1.25582062e+02 6.79659348e+01]\n", + " [ 3.44430865e+16 2.83108551e+02 -8.15224686e+01 ... -2.36064014e+01\n", + " -2.04101429e+01 -1.52281570e+02]\n", + " ...\n", + " [ 4.09233933e+16 -2.45071564e+02 -8.26143555e+02 ... -2.65833405e+02\n", + " -1.98782272e+02 -2.58441467e+02]\n", + " [ 6.91562577e+17 1.39696579e+02 -7.56390808e+02 ... -1.52310455e+02\n", + " -3.36500092e+02 2.06775093e+01]\n", + " [ 9.95114373e+16 -3.23486938e+01 3.06250122e+02 ... 1.03914482e+02\n", + " 6.06264191e+01 -2.45735531e+01]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[ 93 99 114 137 141 142 160 193 235 259 269 299 307 316 350 364 400 523\n", + " 608 702 720 731 759]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[-1.6186993e+17 -3.5698813e+17 3.4442975e+16 ... 4.0923264e+16\n", + " 6.9156258e+17 9.9511712e+16]\n", + " [-5.3483575e+02 2.6249797e+03 -6.7268573e+02 ... -6.1204077e+03\n", + " -4.3047915e+03 -9.5139771e+01]\n", + " [-1.2200641e+01 1.0347147e+02 -2.6777636e+01 ... -1.4766699e+02\n", + " -9.8514114e+01 1.2616925e+01]\n", + " ...\n", + " [-3.2097631e+00 9.1431990e+00 -1.6333975e+00 ... -6.9996667e+00\n", + " -6.4008064e+00 1.9126304e+00]\n", + " [-3.0982289e+00 1.2355285e+01 -3.1715555e+00 ... -4.6754313e+00\n", + " -6.2553053e+00 1.0515085e+00]\n", + " [-2.9516125e+00 2.7038031e+00 -6.0580249e+00 ... -1.6555168e+01\n", + " 1.3245420e+00 -1.5741113e+00]]\n", + "FF:[[-1.61869621e+17 -3.56988336e+17 3.44430865e+16 ... 
4.09233933e+16\n", + " 6.91562577e+17 9.95114373e+16]\n", + " [-5.34834961e+02 2.62497900e+03 -6.72686401e+02 ... -6.12040576e+03\n", + " -4.30479297e+03 -9.51402283e+01]\n", + " [-1.22006664e+01 1.03471611e+02 -2.67777309e+01 ... -1.47666946e+02\n", + " -9.85141525e+01 1.26169167e+01]\n", + " ...\n", + " [-3.20977211e+00 9.14321709e+00 -1.63339353e+00 ... -6.99966621e+00\n", + " -6.40081263e+00 1.91262615e+00]\n", + " [-3.09821057e+00 1.23552399e+01 -3.17152786e+00 ... -4.67541933e+00\n", + " -6.25528765e+00 1.05149710e+00]\n", + " [-2.95161533e+00 2.70380235e+00 -6.05802393e+00 ... -1.65551491e+01\n", + " 1.32455230e+00 -1.57412362e+00]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", + "Ok!\n", + "8.101851851851851% mismatch in QK prods softmax out grad\n", + "Ok!\n", + "hf_attn_in: (768, 24)\n", + "[[-7.3778828e+16 1.0956941e+03 1.1773144e+02 ... -4.0466427e+01\n", + " -3.1198654e+01 -1.7603550e+01]\n", + " [-1.2087128e+18 6.9384756e+03 6.1327003e+01 ... 1.5329468e+01\n", + " 7.6757736e+00 -4.5589094e+00]\n", + " [-6.7892266e+17 5.4895034e+03 7.6927376e+01 ... 9.1396770e+00\n", + " 2.3195824e+01 -6.1995559e+00]\n", + " ...\n", + " [ 2.6452032e+17 9.9761787e+03 2.2349066e+02 ... 5.7504387e+01\n", + " -8.6791611e-01 4.6890911e+01]\n", + " [-6.7528534e+16 3.3856902e+03 2.5189743e+02 ... 2.2824722e+01\n", + " 8.7917282e+01 -2.1569672e+01]\n", + " [-2.1779064e+17 5.2511855e+03 6.6282043e+01 ... 9.9689598e+00\n", + " -5.5022659e+00 -3.2573143e+01]]\n", + "ff_attn_in: (768, 24)\n", + "[[-7.37791458e+16 1.09569678e+03 1.17731285e+02 ... -4.04664154e+01\n", + " -3.11988506e+01 -1.76035423e+01]\n", + " [-1.20871251e+18 6.93847900e+03 6.13275528e+01 ... 1.53295393e+01\n", + " 7.67594433e+00 -4.55900288e+00]\n", + " [-6.78922523e+17 5.48950342e+03 7.69272308e+01 ... 9.13961220e+00\n", + " 2.31957569e+01 -6.19959354e+00]\n", + " ...\n", + " [ 2.64520284e+17 9.97617871e+03 2.23490509e+02 ... 5.75044785e+01\n", + " -8.67943764e-01 4.68908234e+01]\n", + " [-6.75287400e+16 3.38569165e+03 2.51897339e+02 ... 2.28247147e+01\n", + " 8.79171448e+01 -2.15696106e+01]\n", + " [-2.17790679e+17 5.25118652e+03 6.62821960e+01 ... 9.96885872e+00\n", + " -5.50213098e+00 -3.25731125e+01]]\n", + "9.809027777777777% mismatch in attention input grads\n", + "\n", + "Huggingface checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "Huggingface-FlexFlow checks:\n", + "-- W2 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.7.mlp.down_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_7_layers_7_feed_forward_w2_shard_0_output_0\n", + "HF: [-7.5522525e+19 -1.3283726e+21 -7.2549753e+20 ... 4.9017162e+01\n", + " -9.7436657e+00 8.5870697e+01]\n", + "FF:[-7.55228501e+19 -1.32837218e+21 -7.25497390e+20 ... 4.90171394e+01\n", + " -9.74382782e+00 8.58707886e+01]\n", + "[ True True True ... True False True]\n", + "[ 19 64 75 ... 
18418 18428 18430]\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[23], line 95\u001b[0m\n\u001b[1;32m 93\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mHuggingface-FlexFlow checks:\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 94\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m-- W2 --\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 95\u001b[0m \u001b[43mcompare_tensors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhf_BWD_w2_out\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mff_BWD_w2_out\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtolerance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1e-5\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 96\u001b[0m compare_tensors(hf_w2_weight, ff_w2_weight, tolerance\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1e-5\u001b[39m)\n\u001b[1;32m 98\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m-- Lora --\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/Desktop/FlexFlow/tests/peft/align_test_utils.py:47\u001b[0m, in \u001b[0;36mcompare_tensors\u001b[0;34m(hf_tensor_filepath, ff_tensor_filepath, tolerance)\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[38;5;28mprint\u001b[39m(mismatches)\n\u001b[1;32m 43\u001b[0m \u001b[38;5;66;03m#print(np.nonzero(hf_tensor)[0])\u001b[39;00m\n\u001b[1;32m 44\u001b[0m \u001b[38;5;66;03m# print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0])\u001b[39;00m\n\u001b[1;32m 45\u001b[0m \u001b[38;5;66;03m# print(ff_tensor[36], hf_tensor[36])\u001b[39;00m\n\u001b[1;32m 46\u001b[0m \u001b[38;5;66;03m#assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\u001b[39;00m\n\u001b[0;32m---> 47\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(\u001b[38;5;28mlen\u001b[39m(mismatches) \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m.05\u001b[39m\u001b[38;5;241m*\u001b[39mlen_hf_tensor)\n\u001b[1;32m 48\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOk!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "attention_tests=True\n", + "for i in range(tot_num_layers-1, -1, -1):\n", + " # HuggingFace filepaths\n", + " hf_BWD_norm_in = f\"{hf_path}/bwd_step_0_norm.gi_0\"\n", + " hf_BWD_loraB_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_B.default.go_0\"\n", + " hf_BWD_loraB_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_B.default.gi_0\"\n", + " hf_BWD_loraA_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_A.default.go_0\"\n", + " hf_BWD_loraA_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_A.default.gi_0\"\n", + " hf_loraA_weight = f\"{hf_path}/layers.{i}.mlp.down_proj.lora_A.default.weight\"\n", + " hf_loraB_weight = f\"{hf_path}/layers.{i}.mlp.down_proj.lora_B.default.weight\"\n", + " hf_BWD_lora_dropout_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_dropout.default.go_0\"\n", + " hf_BWD_lora_dropout_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_dropout.default.gi_0\"\n", + " hf_BWD_w2_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.go_0\"\n", + " 
hf_BWD_w2_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.gi_0\"\n", + " hf_w2_weight = f\"{hf_path}/layers.{i}.mlp.down_proj.weight\"\n", + " hf_BWD_w3_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.up_proj.go_0\"\n", + " hf_BWD_w3_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.up_proj.gi_0\"\n", + " hf_BWD_w1_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.gate_proj.go_0\"\n", + " hf_BWD_w1_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.gate_proj.gi_0\"\n", + " hf_BWD_act_fn_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.act_fn.gi_0\"\n", + " hf_BWD_act_fn_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.act_fn.go_0\"\n", + " hf_BWD_ffn_norm_out = f\"{hf_path}/bwd_step_0_layers.{i}.post_attention_layernorm.go_0\"\n", + " hf_BWD_ffn_norm_in = f\"{hf_path}/bwd_step_0_layers.{i}.post_attention_layernorm.gi_0\"\n", + " hf_BWD_attn_out_out = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.o_proj.go_0\"\n", + " hf_BWD_attn_q_in = f\"{hf_path}/bwd_step_0_layers.11.self_attn.q_proj.gi_0\"\n", + " hf_FWD_w1_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.gate_proj.output_0\"\n", + " hf_FWD_w3_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.up_proj.output_0\"\n", + " hf_FWD_act_fn_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.act_fn.output_0\"\n", + " hf_BWD_attn_oproj_in = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.o_proj.gi_0\"\n", + " hf_attn_qproj_weight = f\"{hf_path}/layers.{i}.self_attn.q_proj.weight\"\n", + " hf_attn_kproj_weight = f\"{hf_path}/layers.{i}.self_attn.k_proj.weight\"\n", + " hf_attn_vproj_weight = f\"{hf_path}/layers.{i}.self_attn.v_proj.weight\"\n", + " hf_attn_oproj_weight = f\"{hf_path}/layers.{i}.self_attn.o_proj.weight\"\n", + " \n", + " # FlexFlow filepaths\n", + " ff_BWD_w2_out = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_shard_0_output_0\"\n", + " ff_BWD_w2_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_shard_0_input_0\"\n", + " ff_BWD_w2_in_pre = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_shard_0_pre_input_0\"\n", + " ff_w2_weight = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_shard_0_weight_0\"\n", + " ff_BWD_ssm_out = f\"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_output_0\"\n", + " ff_BWD_ssm_in1 = f\"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_input_0\"\n", + " ff_BWD_ssm_in2 = f\"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_input_1\"\n", + " ff_BWD_w3_out = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w3_shard_0_output_0\"\n", + " ff_BWD_w3_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w3_shard_0_input_0\"\n", + " ff_BWD_lora_A_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_input_0\"\n", + " ff_BWD_lora_B_out = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_output_0\"\n", + " ff_lora_A_weight = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_weight_A\"\n", + " ff_lora_B_weight = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_weight_B\"\n", + " ff_BWD_w1_out = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w1_shard_0_output_0\"\n", + " ff_BWD_w1_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w1_shard_0_input_0\"\n", + " ff_BWD_w1_in_pre = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w1_shard_0_pre_input_0\"\n", + " ff_w1_weight = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w1_shard_0_weight_0\"\n", + " ff_BWD_ffn_norm_in1 = 
f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_ffn_norm_shard_0_input_0\"\n", + " ff_BWD_ffn_norm_in2 = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_ffn_norm_shard_0_input_1\"\n", + " ff_BWD_ffn_norm_out = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_ffn_norm_shard_0_output_0\"\n", + " ff_BWD_attn_out = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_output_0\"\n", + " ff_BWD_attn_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_input_0\"\n", + " ff_BWD_ssm_cached_w1_input = f\"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_cached_w1_output\"\n", + " ff_BWD_ssm_cached_w3_input = f\"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_cached_w3_output\"\n", + " ff_FWD_w1_out = f\"{ff_path}/fwd_step_0_layers_0_layers_0_feed_forward_w1_shard_0_output_0\"\n", + " ff_FWD_w3_out = f\"{ff_path}/fwd_step_0_layers_0_layers_0_feed_forward_w3_shard_0_output_0\"\n", + " ff_FWD_act_fnc_out = f\"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_act_fn_output\"\n", + " ff_BWD_attn_o_proj_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_o_proj_in_grad\"\n", + " ff_attn_oproj_weight = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_attention_shard_0_weight_0\"\n", + " \n", + " \n", + " # HuggingFace checks\n", + " print(\"\\nHuggingface checks:\")\n", + " if i == tot_num_layers-1:\n", + " compare_hf_tensors(hf_BWD_norm_in, hf_BWD_loraB_out)\n", + " compare_hf_tensors(hf_BWD_norm_in, hf_BWD_w2_out)\n", + " compare_hf_tensors(hf_BWD_loraB_out, hf_BWD_w2_out)\n", + " compare_hf_tensors(hf_BWD_loraB_in, hf_BWD_loraA_out)\n", + "\n", + " compare_hf_tensors(hf_BWD_act_fn_in, hf_BWD_w1_out)\n", + " check_hf_sum_tensors(hf_BWD_ffn_norm_out, hf_BWD_w1_in, hf_BWD_w3_in)\n", + " if i == tot_num_layers-1:\n", + " check_hf_sum_tensors(hf_BWD_attn_out_out, hf_BWD_ffn_norm_in, hf_BWD_norm_in)\n", + "\n", + " # FlexFlow checks\n", + " print(\"\\nFlexFlow checks:\")\n", + " compare_flexflow_tensors(ff_BWD_w2_out, ff_BWD_lora_B_out)\n", + " compare_flexflow_tensors(ff_BWD_w2_in_pre, ff_BWD_lora_A_in)\n", + " compare_flexflow_tensors(ff_BWD_w2_in, ff_BWD_ssm_out)\n", + " compare_flexflow_tensors(ff_BWD_ssm_in2, ff_BWD_w3_out)\n", + " compare_flexflow_tensors(ff_BWD_ssm_in1, ff_BWD_w1_out)\n", + " compare_flexflow_tensors(ff_BWD_w1_in, ff_BWD_ffn_norm_out)\n", + " compare_flexflow_tensors(ff_BWD_w1_in_pre, ff_BWD_w3_in)\n", + " compare_flexflow_tensors(ff_BWD_ffn_norm_in1, ff_BWD_ffn_norm_in2, max_len=24*768)\n", + " \n", + " # HF-FlexFlow checks\n", + " print(\"\\nHuggingface-FlexFlow checks:\")\n", + " print(\"-- W2 --\")\n", + " compare_tensors(hf_BWD_w2_out, ff_BWD_w2_out, tolerance=1e-5)\n", + " compare_tensors(hf_w2_weight, ff_w2_weight, tolerance=1e-5)\n", + " \n", + " print(\"-- Lora --\")\n", + " compare_tensors(hf_loraA_weight, ff_lora_A_weight, tolerance=1e-5)\n", + " compare_tensors(hf_loraB_weight, ff_lora_B_weight, tolerance=1e-5)\n", + "\n", + " compare_tensors(hf_BWD_loraB_out, ff_BWD_lora_B_out)\n", + " compare_tensors(hf_BWD_loraA_in, ff_BWD_lora_A_in)\n", + " \n", + " print(\"-- W2/W1/W3 --\")\n", + " compare_tensors(hf_BWD_w2_in, ff_BWD_ssm_out)\n", + " compare_tensors(hf_BWD_w2_in, ff_BWD_w2_in)\n", + " compare_tensors(hf_BWD_w1_out, ff_BWD_w1_out)\n", + " compare_tensors_difference(hf_BWD_w1_in, ff_BWD_w1_in, ff_BWD_w1_in_pre)\n", + " compare_tensors(hf_BWD_w3_out, ff_BWD_w3_out)\n", + " compare_tensors(hf_BWD_w3_in, ff_BWD_w3_in)\n", + " compare_tensors(hf_BWD_w1_out, ff_BWD_w1_out)\n", + " \n", + " print(\"-- Attention 
--\")\n", + " compare_tensors(hf_BWD_attn_out_out, ff_BWD_attn_out)\n", + " hidden_size = 768\n", + " qProjSize = 64\n", + " num_heads = 12\n", + " num_new_tokens = num_tokens = 24\n", + " if attention_tests:\n", + " # compare attn weight tensors\n", + " ff_attn_weight_tensor = np.loadtxt(ff_attn_oproj_weight, delimiter=',')\n", + " ff_attn_qproj_weight_tensor = ff_attn_weight_tensor[:hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F')\n", + " ff_attn_kproj_weight_tensor = ff_attn_weight_tensor[hidden_size*qProjSize*num_heads:2*hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F')\n", + " ff_attn_vproj_weight_tensor = ff_attn_weight_tensor[2*hidden_size*qProjSize*num_heads:3*hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F')\n", + " ff_attn_oproj_weight_tensor = ff_attn_weight_tensor[3*hidden_size*qProjSize*num_heads:].reshape((qProjSize*num_heads,hidden_size), order='F')\n", + " \n", + " hf_attn_qproj_weight_tensor = torch.load(hf_attn_qproj_weight).T.detach().cpu().numpy()\n", + " hf_attn_kproj_weight_tensor = torch.load(hf_attn_kproj_weight).T.detach().cpu().numpy()\n", + " hf_attn_vproj_weight_tensor = torch.load(hf_attn_vproj_weight).T.detach().cpu().numpy()\n", + " hf_attn_oproj_weight_tensor = torch.load(hf_attn_oproj_weight).T.detach().cpu().numpy()\n", + " \n", + " assert(np.allclose(ff_attn_qproj_weight_tensor, hf_attn_qproj_weight_tensor, atol=1e-5))\n", + " assert(np.allclose(ff_attn_kproj_weight_tensor, hf_attn_kproj_weight_tensor, atol=1e-5))\n", + " assert(np.allclose(ff_attn_vproj_weight_tensor, hf_attn_vproj_weight_tensor, atol=1e-5))\n", + " assert(np.allclose(ff_attn_oproj_weight_tensor, hf_attn_oproj_weight_tensor, atol=1e-5))\n", + " \n", + " # Compare attn outproj grad in tensors\n", + " compare_tensors(hf_BWD_attn_oproj_in, ff_BWD_attn_o_proj_in)\n", + " \n", + " ########### Compare value projs grads ######################\n", + " # 1. compare qk prods softmax\n", + " hf_qk_prods_softmax = f\"{hf_path}/fwd_step_0_layers.{i}.self_attn.qk_prods_softmax.output_0\"\n", + " ff_attn_qk_prods_softmax = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_qk_prods_softmax\"\n", + " \n", + " hf_qk_prods_softmax = torch.load(hf_qk_prods_softmax)\n", + " ff_qk_prods_softmax = np.loadtxt(ff_attn_qk_prods_softmax, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + "\n", + " for head_idx in range(num_heads):\n", + " hf_qkps = hf_qk_prods_softmax.squeeze()[head_idx, :, :].detach().cpu().numpy()\n", + " ff_qkps = ff_qk_prods_softmax[:,:,head_idx]\n", + " assert(np.allclose(ff_qkps, hf_qkps, atol=1e-5))\n", + " \n", + " # 2. compare attn heads grads\n", + " hf_attn_heads_grads = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.o_proj.gi_0\"\n", + " ff_attn_heads_grads = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_o_proj_in_grad\"\n", + "\n", + " hf_attn_heads_grads = torch.load(hf_attn_heads_grads).T.squeeze().detach().cpu().numpy()\n", + " ff_attn_heads_grads = np.loadtxt(ff_attn_heads_grads, delimiter=',').reshape((qProjSize*num_heads, num_new_tokens), order = 'F')\n", + " # NEED TO VISUALLY INSPECT\n", + " compare_loaded_tensors(hf_attn_heads_grads, ff_attn_heads_grads)\n", + "\n", + " # 3. 
vproj grads\n", + " hf_vproj_grads = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.v_proj.go_0\"\n", + " ff_vproj_grads = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_v_proj_in_grad\"\n", + "\n", + " hf_vproj_grads = torch.load(hf_vproj_grads).squeeze().detach().cpu().numpy()\n", + " ff_vproj_grads = np.loadtxt(ff_vproj_grads, delimiter=',').reshape((num_tokens, qProjSize*num_heads), order='F')\n", + " compare_loaded_tensors(hf_vproj_grads, ff_vproj_grads)\n", + " \n", + " \n", + " ##############################\n", + " hf_value_states = f\"{hf_path}/fwd_step_0_layers.{i}.self_attn.value_states.output_0\"\n", + " hf_value_states = torch.load(hf_value_states).squeeze().permute(2,0,1).detach().cpu().numpy()\n", + " # print(hf_value_states.shape)\n", + " ff_value_states = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_vcache\"\n", + " ff_value_states = np.loadtxt(ff_value_states, delimiter=',').reshape((qProjSize, num_heads, num_tokens), order='F')\n", + " # print(ff_value_states.shape)\n", + " assert(np.allclose(hf_value_states, ff_value_states, atol=1e-2))\n", + " \n", + " \n", + " \n", + " ########## Compare key and query projs grads ##################\n", + " ff_devQKVPRojArray = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_devQKVPRojArray\"\n", + " ff_devQKVPRojArray = np.loadtxt(ff_devQKVPRojArray, delimiter=',').reshape((num_tokens, qProjSize*num_heads, 3), order = 'F')\n", + " ff_qProjGrads = ff_devQKVPRojArray[:,:,0]\n", + " ff_kProjGrads = ff_devQKVPRojArray[:,:,1]\n", + " ff_vProjGrads = ff_devQKVPRojArray[:,:,2]\n", + " assert(np.allclose(ff_vProjGrads, ff_vproj_grads, atol=1e-5))\n", + "\n", + " # simulate qk_prods_softmax\n", + " ff_attn_heads_grads = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_o_proj_in_grad\"\n", + " ff_attn_heads_grads = np.loadtxt(ff_attn_heads_grads, delimiter=',').reshape((qProjSize,num_heads, num_new_tokens), order = 'F')\n", + " ff_attn_heads_grads = torch.from_numpy(ff_attn_heads_grads)\n", + " ff_attn_heads_grads = ff_attn_heads_grads.permute(1,2,0)\n", + " ff_value_states = torch.from_numpy(ff_value_states)\n", + " ff_value_states = ff_value_states.permute(1,0,2)\n", + " # print(ff_attn_heads_grads.shape)\n", + " # print(ff_value_states.shape)\n", + " simulated_qk_prods_softmax_grads = torch.matmul(ff_attn_heads_grads, ff_value_states)\n", + " #simulated_qk_prods_softmax_grads = simulated_qk_prods_softmax_grads\n", + " #print(\"Simulated QK prods grads:\")\n", + " #print(simulated_qk_prods_softmax_grads[0,:,:])\n", + "\n", + " # qk prods softmax right before softmax\n", + " hf_qk_prods_softmax2 = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.qk_prods_softmax.go_0\"\n", + " hf_qk_prods_softmax2 = torch.load(hf_qk_prods_softmax2)\n", + " ff_qk_prods_softmax2 = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_qk_prods_softmax_grad\"\n", + " ff_qk_prods_softmax2 = np.loadtxt(ff_qk_prods_softmax2, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.squeeze().permute(1,2,0)\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.detach().cpu().numpy()\n", + " \n", + " mismatches = np.where(~np.isclose(ff_qk_prods_softmax2, hf_qk_prods_softmax2))\n", + " mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (hf_qk_prods_softmax2.shape[0] * hf_qk_prods_softmax2.shape[1] * hf_qk_prods_softmax2.shape[2])\n", 
+ " print(f\"{pct_mismatch*100}% mismatch in QK prods softmax out grad\")\n", + " # print(hf_qk_prods_softmax2[:2,:,0])\n", + " # print(ff_qk_prods_softmax2[:2,:,0])\n", + " assert(pct_mismatch <= 0.1)\n", + "\n", + " # qk prods softmax right after softmax\n", + " hf_qk_prods_softmax2 = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.pre_softmax.gi_0\"\n", + " hf_qk_prods_softmax2 = torch.load(hf_qk_prods_softmax2)\n", + " ff_qk_prods_softmax2 = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_qk_prods_softmax_grad_in\"\n", + " ff_qk_prods_softmax2 = np.loadtxt(ff_qk_prods_softmax2, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.squeeze().permute(1,2,0)\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.detach().cpu().numpy()\n", + " compare_loaded_tensors(hf_qk_prods_softmax2, ff_qk_prods_softmax2)\n", + " \n", + " # qk prods softmax after mask\n", + " hf_qk_prods_softmax2 = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.matmul_op.go_0\"\n", + " hf_qk_prods_softmax2 = torch.load(hf_qk_prods_softmax2)\n", + " ff_qk_prods_softmax2 = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_qk_prods_softmax_grad_in_masked\"\n", + " ff_qk_prods_softmax2 = np.loadtxt(ff_qk_prods_softmax2, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.squeeze().permute(1,2,0)\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.detach().cpu().numpy()\n", + " assert(np.allclose(ff_qk_prods_softmax2, hf_qk_prods_softmax2, atol=1e-2))\n", + "\n", + " # Compare query activation\n", + " hf_query_activation = hf_path + f\"/fwd_step_0_layers.11.self_attn.query_activation.output_0\"\n", + " hf_query_activation = torch.load(hf_query_activation)\n", + " ff_query_activation = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_query_activation\"\n", + " ff_query_activation = np.loadtxt(ff_query_activation, delimiter=',').reshape((qProjSize, num_heads, num_new_tokens), order = 'F')\n", + " hf_query_activation = hf_query_activation.squeeze().permute(2,0,1).detach().cpu().numpy()\n", + " # assert(np.allclose(ff_query_activation, hf_query_activation, atol=1e-2))\n", + " # print(hf_query_activation[:,0,:])\n", + " # print()\n", + " # print(ff_query_activation[:,0,:])\n", + " # assert False\n", + " # compare_loaded_tensors(hf_query_activation, ff_query_activation)\n", + " check_rope = False\n", + " if check_rope:\n", + " ########################################## ROPE and Kproj ##########################################\n", + "\n", + " # Compare FF kproj with intermediate kproj data from HF\n", + " hf_kproj_grads_post_rotary = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.identity_kv_post_rotary.go_0\"\n", + " hf_kproj_grads_post_rotary = torch.load(hf_kproj_grads_post_rotary)\n", + " hf_kproj_grads_post_rotary_copy = hf_kproj_grads_post_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", + " # print(\"hf_kproj_grads_post_rotary: \", hf_kproj_grads_post_rotary_copy.shape)\n", + " # print(hf_kproj_grads_post_rotary_copy[:,:,0])\n", + " # Check hf ROPE \n", + " cos, sin = rotary_emb(hf_kproj_grads_post_rotary, seq_len=24)\n", + " cos = cos.cuda()\n", + " sin = sin.cuda()\n", + " # query_states: torch.Size([1, 12, 24, 64])\n", + " # key_states: torch.Size([1, 12, 24, 64])\n", + " # position_ids: torch.Size([1, 24])\n", + " # tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", + " # 18, 19, 20, 21, 22, 23]], 
device='cuda:0')\n", + " query_states = torch.zeros([1, 12, 24, 64]).cuda()\n", + " position_ids = torch.arange(24).unsqueeze(0).cuda()\n", + " query_states, hf_kproj_grads_post_rotary = apply_rotary_pos_emb(query_states, hf_kproj_grads_post_rotary, cos, sin, position_ids)\n", + " hf_kproj_grads_post_rotary = hf_kproj_grads_post_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", + " # print(\"hf_kproj_grads_post_rotary: \", hf_kproj_grads_post_rotary.shape)\n", + " # print(hf_kproj_grads_post_rotary[:,:,0])\n", + " \n", + " hf_kproj_grads_before_rotary = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.identity_kv_before_rotary.go_0\"\n", + " hf_kproj_grads_before_rotary = torch.load(hf_kproj_grads_before_rotary)\n", + " hf_kproj_grads_before_rotary = hf_kproj_grads_before_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", + " # print(\"hf_kproj_grads_before_rotary: \", hf_kproj_grads_before_rotary.shape)\n", + " # print(hf_kproj_grads_before_rotary[:,:,0])\n", + " # Compare HF rope with manual ROPE\n", + " assert(np.allclose(hf_kproj_grads_post_rotary, hf_kproj_grads_before_rotary, atol=1e-5))\n", + " # Compare HF Kproj with FF Kproj (before ROPE) \n", + " ff_kproj_pre = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_devkproj_pre\"\n", + " ff_kproj_pre = np.loadtxt(ff_kproj_pre, delimiter=',').reshape((num_tokens, qProjSize, num_heads), order = 'F')\n", + " # print(\"ff_kproj_pre: \", ff_kproj_pre.shape)\n", + " #print(ff_kproj_pre[:,:,0])\n", + " mismatches = np.where(~np.isclose(ff_kproj_pre, hf_kproj_grads_post_rotary_copy, atol=1e-5))\n", + " mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (ff_kproj_pre.shape[0] * ff_kproj_pre.shape[1] * ff_kproj_pre.shape[2])\n", + " print(f\"{pct_mismatch*100}% mismatch between HF and FF for kproj (before applying ROPE)\")\n", + " assert(pct_mismatch <= 0.05)\n", + " #assert(np.allclose(ff_kproj_pre, hf_kproj_grads_post_rotary_copy, atol=1e-5))\n", + " \n", + " ff_kproj = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_devkproj\"\n", + " ff_kproj = np.loadtxt(ff_kproj, delimiter=',').reshape((num_tokens, qProjSize, num_heads), order = 'F')\n", + " # print(\"ff_kproj: \", ff_kproj.shape)\n", + " #print(ff_kproj[:,:,0])\n", + " mismatches = np.where(~np.isclose(ff_kproj, hf_kproj_grads_before_rotary, atol=1e-5))\n", + " mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (ff_kproj.shape[0] * ff_kproj.shape[1] * ff_kproj.shape[2])\n", + " print(f\"{pct_mismatch*100}% mismatch between HF and FF for kproj (after applying ROPE)\")\n", + " assert(pct_mismatch <= 0.05)\n", + " #assert(np.allclose(ff_kproj, hf_kproj_grads_before_rotary, atol=1e-5))\n", + " \n", + " \n", + " #assert(np.allclose(hf_kproj_grads_post_rotary, hf_kproj_grads_before_rotary, atol=1e-2))\n", + " hf_kproj_grads = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.k_proj.go_0\"\n", + " hf_kproj_grads = torch.load(hf_kproj_grads).squeeze()\n", + " #print(\"hf_kproj_grads: \", hf_kproj_grads.shape)\n", + " #print(hf_kproj_grads[:,:64])\n", + " reshaped_tensor = hf_kproj_grads.view(24, 12, 64).transpose(1, 2).contiguous().detach().cpu().numpy()\n", + " #print(reshaped_tensor.shape)\n", + " assert(np.allclose(ff_kproj, reshaped_tensor, atol=1e-2))\n", + "\n", + " ########################################## Qproj (with ROPE) 
##########################################\n", + "\n", + " # Compare QProj\n", + " hf_qproj_grads = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.q_proj.go_0\"\n", + " hf_qproj_grads = torch.load(hf_qproj_grads).squeeze()\n", + " # print(\"HF Qproj:\")\n", + " # print(hf_qproj_grads.shape)\n", + " reshaped_tensor = hf_qproj_grads.view(24, 12, 64).transpose(1, 2).contiguous().detach().cpu().numpy()\n", + " # print(\"\\t reshaped: \", reshaped_tensor.shape)\n", + " # print(reshaped_tensor[:,:,0])\n", + " ff_qproj = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_devQKVPRojArray\"\n", + " ff_qproj = np.loadtxt(ff_qproj, delimiter=',').reshape((num_tokens, qProjSize, num_heads, 3), order = 'F')[:,:,:,0]\n", + " # print(\"FF Qproj:\")\n", + " # print(ff_qproj.shape)\n", + " # print(ff_qproj[:,:,0])\n", + " assert(np.allclose(ff_qproj, reshaped_tensor, atol=1e-2))\n", + "\n", + " hf_attn_in = f\"{hf_path}/bwd_step_0_layers.{i}.input_layernorm.go_0\"\n", + " hf_attn_in = torch.load(hf_attn_in)\n", + " hf_attn_in = hf_attn_in.squeeze().T\n", + " hf_attn_in = hf_attn_in.detach().cpu().numpy()\n", + " print(\"hf_attn_in: \", hf_attn_in.shape)\n", + " print(hf_attn_in)\n", + "\n", + " ff_attn_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_attn_final_grad_in\"\n", + " ff_attn_in = np.loadtxt(ff_attn_in, delimiter=',').reshape((768,num_tokens), order = 'F')\n", + " print(\"ff_attn_in: \", ff_attn_in.shape)\n", + " print(ff_attn_in)\n", + " #assert(np.allclose(ff_attn_in, hf_attn_in, atol=1e-2))\n", + "\n", + " mismatches = np.where(~np.isclose(ff_attn_in, hf_attn_in))\n", + " mismatches = [(mismatches[0][i], mismatches[1][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (hf_attn_in.shape[0] * hf_attn_in.shape[1])\n", + " print(f\"{pct_mismatch*100}% mismatch in attention input grads\")\n", + " assert(pct_mismatch <= 0.1)\n", + " \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[-0.01614726 0.01363804 0.01768043 ... 
0.00724926 -0.00149747\n", + " -0.01781223]\n" + ] + } + ], + "source": [ + "a = np.fromfile(\"/usr0/home/goliaro/.cache/flexflow/weights/goliaro/llama-160m-lora-full/full-precision/layers_11_feed_forward_w2_lora_A_weight\", dtype=np.float32)\n", + "print(a)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# value states: torch.Size([1, 12, 24, 64])\n", + "value_states=torch.from_numpy(hf_kproj_grads_post_rotary).permute(2,0,1).unsqueeze(0)\n", + "key_states = value_states\n", + "cos, sin = rotary_emb(value_states, seq_len=kv_seq_len)\n", + "# query_states: torch.Size([1, 12, 24, 64])\n", + "# key_states: torch.Size([1, 12, 24, 64])\n", + "# position_ids: torch.Size([1, 24])\n", + "# tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", + "# 18, 19, 20, 21, 22, 23]], device='cuda:0')\n", + "query_states = torch.zeros([1, 12, 24, 64])\n", + "position_ids = torch.arange(24).unsqueeze(0)\n", + "query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)\n", + "key_states = key_states.squeeze()\n", + "print(key_states.shape)\n", + "print(key_states[0,:,:])\n", + "print(hf_kproj_grads_before_rotary.shape)\n", + "print(hf_kproj_grads_before_rotary[:,:,0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", + " 18, 19, 20, 21, 22, 23]], device='cuda:0')" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "torch.arange(24).unsqueeze(0).cuda()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([1, 12, 24, 24])\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/alignment_tests.ipynb Cell 6\u001b[0m line \u001b[0;36m1\n\u001b[1;32m 17\u001b[0m ff_qkps \u001b[39m=\u001b[39m ff_qk_prods_softmax[:,:,head_idx]\n\u001b[1;32m 18\u001b[0m \u001b[39massert\u001b[39;00m(np\u001b[39m.\u001b[39mallclose(ff_qkps, hf_qkps, atol\u001b[39m=\u001b[39m\u001b[39m1e-5\u001b[39m))\n\u001b[0;32m---> 19\u001b[0m \u001b[39massert\u001b[39;00m(\u001b[39mFalse\u001b[39;00m)\n\u001b[1;32m 21\u001b[0m hf_value_states \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39mload(hf_value_states)\u001b[39m#.squeeze().T.detach().cpu().numpy()\u001b[39;00m\n\u001b[1;32m 22\u001b[0m \u001b[39mprint\u001b[39m(hf_value_states\u001b[39m.\u001b[39mshape)\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "layer_num = 11\n", + "hf_qk_prods_softmax = f\"{hf_path}/fwd_step_0_layers.11.self_attn.qk_prods_softmax\"\n", + "ff_qk_prods_softmax = f\"{ff_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax\"\n", + "\n", + "hf_value_states = f\"{hf_path}/fwd_step_0_layers.11.self_attn.value_states\"\n", + "\n", + "hf_qk_prods_softmax = torch.load(hf_qk_prods_softmax)#.squeeze().T.detach().cpu().numpy()\n", + "ff_qk_prods_softmax = np.loadtxt(ff_qk_prods_softmax, delimiter=',').reshape((24, 24, 12), order = 'F')\n", + 
"print(hf_qk_prods_softmax.shape)\n", + "#print(ff_qk_prods_softmax.shape)\n", + "#print(hf_qk_prods_softmax[:,:,0])\n", + "#print()\n", + "#print(ff_qk_prods_softmax[:,:,0])\n", + "\n", + "for head_idx in range(12):\n", + " hf_qkps = hf_qk_prods_softmax.squeeze()[head_idx, :, :].detach().cpu().numpy()\n", + " ff_qkps = ff_qk_prods_softmax[:,:,head_idx]\n", + " assert(np.allclose(ff_qkps, hf_qkps, atol=1e-5))\n", + "\n", + "\n", + "hf_value_states = torch.load(hf_value_states)#.squeeze().T.detach().cpu().numpy()\n", + "print(hf_value_states.shape)\n", + "attn_output = torch.matmul(hf_qk_prods_softmax, hf_value_states)\n", + "print()\n", + "print(attn_output.shape)\n", + "print(attn_output.transpose(1, 2).contiguous().shape)\n", + "print(\"Hf attn heads\")\n", + "print(torch.load(\"/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_layers.11.self_attn.o_proj.input_0\").shape)\n", + "\n", + "print(\"Attn heads grads:\")\n", + "hf_attn_heads_grads = f\"{hf_path}/bwd_step_0_layers.{layer_num}.self_attn.o_proj.gi_0\"\n", + "print(torch.load(hf_attn_heads_grads).shape)\n", + "print(\"HF value grads:\")\n", + "vproj_grads = f\"{hf_path}/bwd_step_0_layers.{layer_num}.self_attn.v_proj.gi_0\"\n", + "print(torch.load(vproj_grads).shape)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([2, 3, 4])\n", + "torch.Size([4, 3, 2])\n" + ] + } + ], + "source": [ + "a = torch.randn(2,3,4)\n", + "print(a.shape)\n", + "print(a.T.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([[[ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000,\n", + " 0.0000],\n", + " [ 27.8890, -21.5089, 45.8214, ..., 5.4010, -10.8787,\n", + " 39.7619],\n", + " [ 19.2197, 27.4681, -68.7141, ..., 102.3280, 66.7925,\n", + " -160.8711],\n", + " ...,\n", + " [ 63.9532, 17.4273, -29.4416, ..., 101.6105, 67.5937,\n", + " -198.4432],\n", + " [ 31.2799, 13.0724, -44.7179, ..., 132.4898, 42.3135,\n", + " -194.4037],\n", + " [ 42.3453, -16.2693, -55.7386, ..., 90.5921, 52.2032,\n", + " -124.1802]]], device='cuda:0')\n", + "tensor([[[-1.1845e+06, -6.7460e+05, 7.4494e+05, ..., -9.1441e+05,\n", + " -1.4912e+05, 3.5769e+06],\n", + " [-7.3920e+01, -7.9389e+01, 1.1027e+02, ..., -7.3020e+01,\n", + " -2.3540e+01, 3.4587e+02],\n", + " [-5.3885e+01, -1.7373e+01, -1.9780e+01, ..., 4.1291e+01,\n", + " 5.5099e+01, 5.5910e+01],\n", + " ...,\n", + " [-2.1948e+01, -3.2109e+01, 2.8364e+01, ..., 3.4321e+01,\n", + " 5.0713e+01, 5.6592e+01],\n", + " [-4.4339e+01, -2.8339e+01, 1.4070e+01, ..., 6.2797e+01,\n", + " 3.0760e+01, 6.1743e+01],\n", + " [-1.6287e+01, -5.0413e+01, -1.9940e+01, ..., 4.3766e+01,\n", + " 4.7833e+01, 4.7295e+01]]], device='cuda:0')\n" + ] + } + ], + "source": [ + "a = \"./hf_peft_tensors/bwd_step_0_layers.11.post_attention_layernorm.gi_0\"\n", + "b = \"./hf_peft_tensors/bwd_step_0_layers.11.self_attn.o_proj.go_0\"\n", + "a = torch.load(a)\n", + "b = torch.load(b)\n", + "print(a)\n", + "print(b)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + 
"Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "for layer_num in range(12):\n", + " hf_lora_A_weight_fp = f\"{hf_path}/layers.{layer_num}.mlp.down_proj.lora_A.default.weight\"\n", + " ff_lora_A_weight_fp = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_A\"\n", + " compare_tensors(hf_lora_A_weight_fp, ff_lora_A_weight_fp, tolerance=1e-5)\n", + " hf_lora_B_weight_fp = f\"{hf_path}/layers.{layer_num}.mlp.down_proj.lora_B.default.weight\"\n", + " ff_lora_B_weight_fp = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_B\"\n", + " compare_tensors(hf_lora_B_weight_fp, ff_lora_B_weight_fp, tolerance=1e-5)\n", + " hf_w1_weight = f\"{hf_path}/layers.{layer_num}.mlp.gate_proj.weight\"\n", + " ff_w1_weight = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_weight_0\"\n", + " compare_tensors(hf_w1_weight, ff_w1_weight, tolerance=1e-5)\n", + " hf_w3_weight = f\"{hf_path}/layers.{layer_num}.mlp.up_proj.weight\"\n", + " ff_w3_weight = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w3_shard-id_0_weight_0\"\n", + " compare_tensors(hf_w3_weight, ff_w3_weight, tolerance=1e-5)\n", + " hf_w2_weight = f\"{hf_path}/layers.{layer_num}.mlp.down_proj.weight\"\n", + " ff_w2_weight = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_weight_0\"\n", + " compare_tensors(hf_w2_weight, ff_w2_weight, tolerance=1e-5)\n", + " " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tests/peft/alignment/opt_alignment_tests.ipynb b/tests/peft/alignment/opt_alignment_tests.ipynb new file mode 100644 index 0000000000..ca679b1857 --- /dev/null +++ b/tests/peft/alignment/opt_alignment_tests.ipynb @@ -0,0 +1,450 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import os, torch\n", + "from align_test_utils import *" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + 
"Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "\n", + "--- LM head ---\n", + "Ok!\n", + "Ok!\n", + "\n", + "--- Final Norm ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "qProjSize = 64\n", + "num_heads = 12\n", + "num_tokens = 25\n", + "for i in range(tot_num_layers):\n", + " hf_base = os.path.join(hf_path, f\"fwd_step_0_decoder.layers.{i}.\")\n", + " ff_base = os.path.join(ff_path, f\"fwd_step_0_layers_{i}_layers_{i}_\")\n", + " \n", + " # LayerNorm\n", + " hf_tensor = hf_base + \"self_attn_layer_norm.input_0\"\n", + " ff_tensor = ff_base + \"attention_layer_norm_shard_0_output_0\"\n", + " compare_tensors(hf_tensor, ff_tensor)\n", + " hf_tensor = hf_base + \"self_attn_layer_norm.output_0\"\n", + " ff_tensor = ff_base + \"attention_layer_norm_shard_0_output_1\"\n", + " compare_tensors(hf_tensor, ff_tensor)\n", + "\n", + " # # Attention QKV proj\n", + " # print(\"---Attn---\")\n", + " # ff_tensor = ff_base + \"attention_shard_0_qkv_proj_output\"\n", + " # ff_tensor = load_ff_tensor(ff_tensor, [qProjSize, num_heads, 3, num_tokens])\n", + " # ff_q_proj = ff_tensor[:,:,0,:]\n", + " # ff_k_proj = ff_tensor[:,:,1,:]\n", + " # ff_v_proj = ff_tensor[:,:,2,:]\n", + " # hf_q_proj = hf_base + \"self_attn.q_proj.output_0\"\n", + " # hf_q_proj = load_hf_tensor(hf_q_proj).squeeze().T\n", + " # hf_q_proj = hf_q_proj.reshape(12,64,25)\n", + " # hf_q_proj = np.transpose(hf_q_proj, (1,0,2))\n", + " # hf_k_proj = hf_base + \"self_attn.k_proj.output_0\"\n", + " # hf_k_proj = load_hf_tensor(hf_k_proj).squeeze().T\n", + " # hf_k_proj = hf_k_proj.reshape(12,64,25)\n", + " # hf_k_proj = np.transpose(hf_k_proj, (1,0,2))\n", + " # hf_v_proj = hf_base + \"self_attn.v_proj.output_0\"\n", + " # hf_v_proj = load_hf_tensor(hf_v_proj).squeeze().T\n", + " # hf_v_proj = hf_v_proj.reshape(12,64,25)\n", + " # hf_v_proj = np.transpose(hf_v_proj, (1,0,2))\n", + " # compare_loaded_tensors(hf_q_proj/np.sqrt(qProjSize), ff_q_proj)\n", + " # compare_loaded_tensors(hf_k_proj, ff_k_proj)\n", + " # compare_loaded_tensors(hf_v_proj, ff_v_proj)\n", + "\n", + " # Compare attn bias, residuals\n", + " print(\"--- Attn bias + residual ---\")\n", + " ff_residual1 = ff_path + f\"/fwd_step_0_layers_{i}_AddBiasResidualLayerNorm_shard_0_input_1\"\n", + " ff_residual2 = ff_base + \"attention_layer_norm_shard_0_output_0\"\n", + " compare_flexflow_tensors(ff_residual1, ff_residual2)\n", + " hf_tensor = hf_base + 
\"self_attn_layer_norm.input_0\"\n", + " compare_tensors(hf_tensor, ff_residual2)\n", + " ff_tensor = ff_path + f\"/fwd_step_0_layers_{i}_AddBiasResidualLayerNorm_shard_0_output_0\"\n", + " hf_tensor = hf_base + \"final_layer_norm.input_0\"\n", + " compare_tensors(hf_tensor, ff_tensor)\n", + " \n", + " print(\"--- MLP ---\")\n", + " hf_tensor = hf_base + \"fc1.input_0\"\n", + " ff_tensor = ff_base + \"fc1_shard_0_input_0\"\n", + " compare_tensors(hf_tensor, ff_tensor)\n", + " hf_tensor = hf_base + \"fc2.input_0\"\n", + " ff_tensor = ff_base + \"fc2_shard_0_input_0\"\n", + " compare_tensors(hf_tensor, ff_tensor)\n", + "# LM head\n", + "print(\"\\n--- LM head ---\")\n", + "hf_tensor = hf_path + \"/fwd_step_0_base_model.model.lm_head.input_0\"\n", + "ff_tensor = ff_path + \"/fwd_step_0_layers_11_embed_tokens_weight_lm_head_shard_0_input_0\"\n", + "compare_tensors(hf_tensor, ff_tensor)\n", + "hf_tensor = hf_path + \"/fwd_step_0_base_model.model.lm_head.output_0\"\n", + "ff_tensor = ff_path + \"/fwd_step_0_layers_11_embed_tokens_weight_lm_head_shard_0_output_0\"\n", + "compare_tensors(hf_tensor, ff_tensor)\n", + "# Final layer norm\n", + "print(\"\\n--- Final Norm ---\")\n", + "hf_tensor = hf_path + \"/fwd_step_0_decoder.final_layer_norm.input_0\"\n", + "ff_tensor = ff_path + \"/fwd_step_0_layers_11_final_layer_norm_shard_0_output_0\"\n", + "compare_tensors(hf_tensor, ff_tensor)\n", + "ff_tensor1 = ff_path + \"/fwd_step_0_layers_11_final_layer_norm_shard_0_input_activation\"\n", + "# compare_flexflow_tensors_shortest(ff_tensor, ff_tensor1)\n", + "hf_tensor = hf_path + \"/fwd_step_0_decoder.final_layer_norm.output_0\"\n", + "ff_tensor = ff_path + \"/fwd_step_0_layers_11_final_layer_norm_shard_0_output_1\"\n", + "compare_tensors(hf_tensor, ff_tensor)\n", + "hf_tensor = hf_path + \"/fwd_step_0_decoder.final_layer_norm.saved_result_1\"\n", + "ff_tensor = ff_path + \"/fwd_step_0_layers_11_final_layer_norm_shard_0_mean\"\n", + "compare_tensors(hf_tensor, ff_tensor)\n", + "hf_tensor = hf_path + \"/fwd_step_0_decoder.final_layer_norm.saved_result_2\"\n", + "ff_tensor = ff_path + \"/fwd_step_0_layers_11_final_layer_norm_shard_0_rstd\"\n", + "compare_tensors(hf_tensor, ff_tensor)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[17], line 22\u001b[0m\n\u001b[1;32m 19\u001b[0m compare_flexflow_tensors(ff_tensor, ff_tensor1)\n\u001b[1;32m 20\u001b[0m compare_tensors(hf_tensor, ff_tensor) \u001b[38;5;66;03m# fails\u001b[39;00m\n\u001b[0;32m---> 22\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;66;03m# Compare fwd input/output of layernorm\u001b[39;00m\n\u001b[1;32m 25\u001b[0m hf_FWD_norm_in \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/fwd_step_0_decoder.final_layer_norm.input_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "# Compare backward pass\n", + "hf_tensor = hf_path 
+ \"/bwd_step_0_base_model.model.lm_head.go_0\"\n", + "ff_tensor = ff_path + \"/bwd_step_0_layers_11_embed_tokens_weight_lm_head_shard_0_output_0\"\n", + "compare_tensors(hf_tensor, ff_tensor, tolerance=1e-5)\n", + "hf_tensor = hf_path + \"/bwd_step_0_base_model.model.lm_head.gi_0\"\n", + "ff_tensor = ff_path + \"/bwd_step_0_layers_11_embed_tokens_weight_lm_head_shard_0_input_0\"\n", + "compare_tensors(hf_tensor, ff_tensor, tolerance=1e-5)\n", + "\n", + "hf_tensor1 = hf_path + \"/bwd_step_0_decoder.final_layer_norm.go_0\"\n", + "compare_hf_tensors(hf_tensor, hf_tensor1)\n", + "ff_tensor = ff_path + \"/bwd_step_0_layers_11_final_layer_norm_shard_0_output_0\"\n", + "compare_tensors(hf_tensor1, ff_tensor)\n", + "\n", + "hf_tensor = hf_path + \"/bwd_step_0_decoder.final_layer_norm.gi_0\"\n", + "ff_tensor = ff_path + \"/bwd_step_0_layers_11_final_layer_norm_shard_0_input_0\"\n", + "ff_tensor1 = ff_path + \"/bwd_step_0_layers_11_final_layer_norm_shard_0_input_1\"\n", + "compare_flexflow_tensors(ff_tensor, ff_tensor1)\n", + "compare_tensors(hf_tensor, ff_tensor) # fails" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_decoder.layers.0.fc1.input_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_decoding-step_0_layer-num_0_layer-name_layers_0_fc1_shard-id_0_input_0\n", + "HF: [ 0.0193019 -1.0467215 0.21579844 ... 0.04534929 -0.25642633\n", + " 0.10879952]\n", + "FF:[ 0.01458706 -1.02212262 0.20589906 ... 0.04446212 -0.25625792\n", + " 0.108039 ]\n", + "[ True False True ... True True True]\n", + "[ 1 3 7 ... 19170 19174 19188]\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[11], line 16\u001b[0m\n\u001b[1;32m 14\u001b[0m hf_fc1_in \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_decoder.layers.0.fc1.input_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 15\u001b[0m ff_fc1_in \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_decoding-step_0_layer-num_0_layer-name_layers_0_fc1_shard-id_0_input_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 16\u001b[0m \u001b[43mcompare_tensors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhf_fc1_in\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mff_fc1_in\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;66;03m# LORA input\u001b[39;00m\n\u001b[1;32m 20\u001b[0m hf_lora_A_in \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/fwd_step_0_layers.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlayer_num\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.mlp.down_proj.lora_A.default.input_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "File \u001b[0;32m~/Desktop/FlexFlow/tests/peft/align_test_utils.py:32\u001b[0m, in \u001b[0;36mcompare_tensors\u001b[0;34m(hf_tensor_filepath, ff_tensor_filepath, tolerance)\u001b[0m\n\u001b[1;32m 
27\u001b[0m \u001b[38;5;28mprint\u001b[39m(mismatches)\n\u001b[1;32m 28\u001b[0m \u001b[38;5;66;03m#print(np.nonzero(hf_tensor)[0])\u001b[39;00m\n\u001b[1;32m 29\u001b[0m \u001b[38;5;66;03m# print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0])\u001b[39;00m\n\u001b[1;32m 30\u001b[0m \u001b[38;5;66;03m# print(ff_tensor[36], hf_tensor[36])\u001b[39;00m\n\u001b[1;32m 31\u001b[0m \u001b[38;5;66;03m#assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\u001b[39;00m\n\u001b[0;32m---> 32\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(\u001b[38;5;28mlen\u001b[39m(mismatches) \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m.05\u001b[39m\u001b[38;5;241m*\u001b[39mlen_hf_tensor)\n\u001b[1;32m 33\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOk!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "for layer_num in range(tot_num_layers):\n", + " hf_input_ln_out = f\"{hf_path}/fwd_step_0_decoder.layers.{layer_num}.self_attn_layer_norm.output_0\"\n", + " ff_input_ln_out = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_layer_norm_shard-id_0_output_1\"\n", + " compare_tensors(hf_input_ln_out, ff_input_ln_out)\n", + " \n", + " hf_ffn_norm_in = f\"{hf_path}/fwd_step_0_decoder.layers.{layer_num}.final_layer_norm.input_0\"\n", + " ff_ffn_norm_in = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_AddBiasResidualLayerNorm_shard-id_0_output_0\"\n", + " # compare_tensors(hf_ffn_norm_in, ff_ffn_norm_in)\n", + " \n", + " hf_ffn_norm_out = f\"{hf_path}/fwd_step_0_decoder.layers.{layer_num}.final_layer_norm.output_0\"\n", + " ff_ffn_norm_out = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_AddBiasResidualLayerNorm_shard-id_0_output_1\"\n", + " # compare_tensors(hf_ffn_norm_out, ff_ffn_norm_out)\n", + " hf_fc1_in = \"/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_decoder.layers.0.fc1.input_0\"\n", + " ff_fc1_in = \"/usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_decoding-step_0_layer-num_0_layer-name_layers_0_fc1_shard-id_0_input_0\"\n", + " compare_tensors(hf_fc1_in, ff_fc1_in)\n", + "\n", + "\n", + " # LORA input\n", + " hf_lora_A_in = f\"{hf_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_A.default.input_0\"\n", + " ff_lora_A_in = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_input_0\"\n", + " compare_hf_tensors(hf_down_proj_in, hf_lora_A_in)\n", + " compare_tensors(hf_lora_A_in, ff_lora_A_in)\n", + " # LORA weights\n", + " hf_lora_A_weight_fp = f\"{hf_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_A.default.weight\"\n", + " ff_lora_A_weight_fp = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_A\"\n", + " compare_tensors(hf_lora_A_weight_fp, ff_lora_A_weight_fp)\n", + " hf_lora_B_weight_fp = f\"{hf_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_B.default.weight\"\n", + " ff_lora_B_weight_fp = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_B\"\n", + " compare_tensors(hf_lora_B_weight_fp, ff_lora_B_weight_fp)\n", + " # LORA intermediate hf\n", + " hf_lora_A_out = 
f\"{hf_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_A.default.output_0\"\n", + " hf_lora_B_in = f\"{hf_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_B.default.input_0\"\n", + " compare_hf_tensors(hf_lora_A_out, hf_lora_B_in)\n", + " # LORA output\n", + " hf_lora_out = f\"{hf_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_B.default.output_0\"\n", + " ff_lora_out = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_output_0\"\n", + " # compare_tensors(hf_lora_out, ff_lora_out)\n", + " # compare_flexflow_tensors(ff_down_proj_out, ff_lora_out)\n", + " # compare_tensors(hf_down_proj_out, ff_lora_out)\n", + " compare_tensors_difference(hf_lora_out, ff_lora_out, ff_down_proj_out)\n", + " \n", + "\n", + "# After last layer only\n", + "hf_norm_out = f\"{hf_path}/fwd_step_0_norm.output_0\"\n", + "ff_norm_out = f\"{ff_path}/model_0_decoding-step_0_layer-num_{tot_num_layers-1}_layer-name_norm_shard-id_0_output_1\"\n", + "compare_tensors(hf_norm_out, ff_norm_out)\n", + "hf_lm_head_out = f\"{hf_path}/fwd_step_0_base_model.model.lm_head.output_0\"\n", + "ff_lm_head_out = f\"{ff_path}/model_0_decoding-step_0_layer-num_{tot_num_layers-1}_layer-name_output_shard-id_0_output_0\"\n", + "compare_tensors(hf_lm_head_out, ff_lm_head_out)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_decoder.final_layer_norm.input_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_decoding-step_0_layer-num_11_layer-name_final_layer_norm_shard-id_0_output_0\n", + "HF: [-0.00542103 -1.781267 0.16552497 ... -0.77217525 -0.5760026\n", + " 0.04363118]\n", + "FF:[ 0.03817766 -1.5644939 0.22477378 ... -0.94569921 -0.43960798\n", + " -0.06447437]\n", + "[False False False ... False False False]\n", + "[ 0 1 2 ... 
19197 19198 19199]\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[10], line 22\u001b[0m\n\u001b[1;32m 20\u001b[0m ff_FWD_norm_in \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mff_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/model_0_decoding-step_0_layer-num_11_layer-name_final_layer_norm_shard-id_0_output_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 21\u001b[0m ff_FWD_norm_out \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mff_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/model_0_decoding-step_0_layer-num_11_layer-name_final_layer_norm_shard-id_0_output_1\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 22\u001b[0m \u001b[43mcompare_tensors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhf_FWD_norm_in\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mff_FWD_norm_in\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 23\u001b[0m compare_tensors(hf_FWD_norm_out, ff_FWD_norm_out)\n\u001b[1;32m 25\u001b[0m hf_BWD_norm_in \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/bwd_step_0_decoder.final_layer_norm.gi_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "File \u001b[0;32m~/Desktop/FlexFlow/tests/peft/align_test_utils.py:29\u001b[0m, in \u001b[0;36mcompare_tensors\u001b[0;34m(hf_tensor_filepath, ff_tensor_filepath, tolerance)\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28mprint\u001b[39m(mismatches)\n\u001b[1;32m 25\u001b[0m \u001b[38;5;66;03m#print(np.nonzero(hf_tensor)[0])\u001b[39;00m\n\u001b[1;32m 26\u001b[0m \u001b[38;5;66;03m# print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0])\u001b[39;00m\n\u001b[1;32m 27\u001b[0m \u001b[38;5;66;03m# print(ff_tensor[36], hf_tensor[36])\u001b[39;00m\n\u001b[1;32m 28\u001b[0m \u001b[38;5;66;03m#assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\u001b[39;00m\n\u001b[0;32m---> 29\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(\u001b[38;5;28mlen\u001b[39m(mismatches) \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m.05\u001b[39m\u001b[38;5;241m*\u001b[39mlen_hf_tensor)\n\u001b[1;32m 30\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOk!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "\n", + "ff_BWD_softmax_in = f\"{ff_path}/model_0_bwd-step_0_layer-num_100_layer-name_Softmax_shard-id_0_input_0\"\n", + "\n", + "hf_BWD_lm_head_out = f\"{hf_path}/bwd_step_0_base_model.model.lm_head.go_0\"\n", + "ff_BWD_lm_head_out = f\"{ff_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_embed_tokens_weight_lm_head_shard-id_0_output_0\"\n", + "compare_tensors(hf_BWD_lm_head_out, ff_BWD_lm_head_out, tolerance=1e-5)\n", + "hf_BWD_lm_head_in = f\"{hf_path}/bwd_step_0_base_model.model.lm_head.gi_0\"\n", + "ff_BWD_lm_head_in = f\"{ff_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_embed_tokens_weight_lm_head_shard-id_0_input_0\"\n", + "compare_tensors(hf_BWD_lm_head_in, ff_BWD_lm_head_in, 
tolerance=1e-5)\n", + "\n", + "hf_BWD_norm_out = f\"{hf_path}/bwd_step_0_decoder.final_layer_norm.go_0\"\n", + "ff_BWD_norm_out = f\"{ff_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_final_layer_norm_shard-id_0_output_0\"\n", + "compare_hf_tensors(hf_BWD_lm_head_in, hf_BWD_norm_out)\n", + "compare_tensors(hf_BWD_norm_out, ff_BWD_norm_out)\n", + "\n", + "# Compare fwd input/output of layernorm\n", + "hf_FWD_norm_in = f\"{hf_path}/fwd_step_0_decoder.final_layer_norm.input_0\"\n", + "hf_FWD_norm_out = f\"{hf_path}/fwd_step_0_decoder.final_layer_norm.output_0\"\n", + "ff_FWD_norm_in = f\"{ff_path}/model_0_decoding-step_0_layer-num_11_layer-name_final_layer_norm_shard-id_0_output_0\"\n", + "ff_FWD_norm_out = f\"{ff_path}/model_0_decoding-step_0_layer-num_11_layer-name_final_layer_norm_shard-id_0_output_1\"\n", + "compare_tensors(hf_FWD_norm_in, ff_FWD_norm_in)\n", + "compare_tensors(hf_FWD_norm_out, ff_FWD_norm_out)\n", + "\n", + "hf_BWD_norm_in = f\"{hf_path}/bwd_step_0_decoder.final_layer_norm.gi_0\"\n", + "ff_BWD_norm_in = f\"{ff_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_final_layer_norm_shard-id_0_input_1\"\n", + "compare_tensors(hf_BWD_norm_in, ff_BWD_norm_in, tolerance=1e-5)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py new file mode 100644 index 0000000000..16b46cfa81 --- /dev/null +++ b/tests/peft/hf_finetune.py @@ -0,0 +1,129 @@ +import os, sys, shutil +import torch + +# Reproducibility +import random +import numpy as np + +torch.manual_seed(0) +random.seed(0) +np.random.seed(0) +# torch.use_deterministic_algorithms(True) + +# import bitsandbytes as bnb +import argparse +import transformers + +if transformers.__version__ < "4.31.0": + raise RuntimeError( + "Please update the transformers library version to 4.31.0 or above" + ) +from datasets import load_dataset + + +from hf_utils import * + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--peft-model-id", type=str, default="goliaro/llama-160m-lora") + parser.add_argument( + "--lora-alpha", + type=int, + default=-1, + help="The scaling coefficient for LoRA. Leave it set to -1 to use the original value from the HF config", + ) + parser.add_argument( + "--lora-dropout", + type=float, + default=0.0, + help="The dropout rate for LoRA. 
Set it to -1 to use the original value from the HF config",
+    )
+    parser.add_argument("-lr", "--learning-rate", type=float, default=0.001)
+    parser.add_argument("-n", "--max-steps", type=int, default=2)
+    parser.add_argument(
+        "--optimizer", type=str, choices=["sgd", "adam", "adamw"], default="sgd"
+    )
+    parser.add_argument(
+        "--use-full-precision", action="store_true", help="Use full precision"
+    )
+    parser.add_argument("--output-dir", type=str, default="")
+    parser.add_argument("--publish-peft-with-id", type=str, default="")
+    parser.add_argument(
+        "--save-peft-tensors",
+        action="store_true",
+        help="Save PEFT hidden states and weights to file",
+    )
+    args = parser.parse_args()
+
+    # Change working dir to folder storing this script
+    abspath = os.path.abspath(__file__)
+    dname = os.path.dirname(abspath)
+    os.chdir(dname)
+
+    # Get PEFT config, model, tokenizer, and optimizer type
+    peft_config = build_peft_config(args, finetuning=True)
+    tokenizer = get_peft_tokenizer(args, peft_config)
+    model = build_peft_model(args, peft_config)
+    optim_type = get_optim_type(args)
+
+    # Print model with PEFT
+    print(model)
+    for name, params in model.named_parameters():
+        print(name)
+    print_trainable_parameters(model)
+
+    # Add hooks to save PEFT tensors, save any weights of interest before finetuning
+    if args.save_peft_tensors:
+        make_debug_dirs()
+        register_peft_hooks(model)
+        save_peft_weights(model, target_modules=["lora", "lm_head", "down_proj"])
+
+    # Load fine-tuning dataset
+    data = load_dataset("Abirate/english_quotes")
+    # TODO: remove the filtering down to a single row (debugging only)
+    key_to_filter = "quote"
+    desired_value = "“Two things are infinite: the universe and human stupidity; and I'm not sure about the universe.”"
+    data = filter_dataset_for_debugging(data, key_to_filter, desired_value)
+    data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)
+
+    # Training loop
+    trainer = transformers.Trainer(
+        model=model,
+        train_dataset=data["train"],
+        args=transformers.TrainingArguments(
+            per_device_train_batch_size=1,
+            gradient_accumulation_steps=1,
+            max_grad_norm=None,  # Disable gradient clipping
+            warmup_steps=0,
+            max_steps=args.max_steps,
+            learning_rate=args.learning_rate,
+            fp16=True if not args.use_full_precision else False,
+            logging_steps=1,
+            output_dir=os.path.join(
+                args.output_dir if len(args.output_dir) > 0 else "./",
+                "lora_training_logs",
+            ),
+            optim=optim_type,
+            lr_scheduler_type=transformers.training_args.SchedulerType.CONSTANT,
+        ),
+        data_collator=transformers.DataCollatorForLanguageModeling(
+            tokenizer, mlm=False
+        ),
+        callbacks=[HFTrainingCallBack] if args.save_peft_tensors else None,
+    )
+    # silence the warnings. Please re-enable for inference!
+ model.config.use_cache = False + + # for batch in trainer.get_train_dataloader(): + # print("First batch: ") + # print(batch) + # break + + trainer.train() + + save_finetuned_model(model, args) + + +if __name__ == "__main__": + main() diff --git a/tests/peft/hf_serve.py b/tests/peft/hf_serve.py new file mode 100644 index 0000000000..7bfc560cc2 --- /dev/null +++ b/tests/peft/hf_serve.py @@ -0,0 +1,140 @@ +import argparse +import torch +import os, sys, shutil, json +from peft import PeftModel, PeftConfig +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + AutoConfig, + LlamaTokenizer, + GenerationConfig, +) + + +def peft_pre_forward_hook(module, input): + assert module.name is not None and module.decoding_step is not None + name = module.name.replace("base_model.model.model.", "") + print( + f"Pre-forward hook activated on module: {name}, decoding step: {module.decoding_step}" + ) + print("Pre-Input: ", input[0].shape) + torch.save( + input, f"./hf_peft_tensors/decoding_step_{module.decoding_step}_{name}.input" + ) + # print("===") + + +def peft_post_forward_hook(module, input, output): + assert module.name is not None and module.decoding_step is not None + name = module.name.replace("base_model.model.model.", "") + print( + f"Post-forward Hook activated for module: {name}, decoding step: {module.decoding_step}" + ) + print("Post-Input/Output: ", input[0].shape, output[0].shape) + torch.save( + output, f"./hf_peft_tensors/decoding_step_{module.decoding_step}_{name}.output" + ) + print("===") + module.decoding_step += 1 + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--peft-model-id", type=str, required=True) + parser.add_argument( + "--use-full-precision", action="store_true", help="Use full precision" + ) + parser.add_argument("--max-length", type=int, default=50) + parser.add_argument("--prompt-file", type=str, required=True) + parser.add_argument("--do-sample", action="store_true", help="Use sampling") + parser.add_argument( + "--save-peft-tensors", + action="store_true", + help="Save PEFT hidden states and weights to file", + ) + args = parser.parse_args() + + # Check if prompt-file exists + if not os.path.isfile(args.prompt_file): + print(f"Error: {args.prompt_file} does not exist.") + return + + # Get peft model config + config = PeftConfig.from_pretrained(args.peft_model_id) + + # Load the base model + model = AutoModelForCausalLM.from_pretrained( + config.base_model_name_or_path, + return_dict=True, + # load_in_8bit=True, + torch_dtype=torch.float32 if args.use_full_precision else torch.float16, + device_map="auto", + ) + # Load the Lora model + model = PeftModel.from_pretrained(model, args.peft_model_id) + print(model) + + # Get tokenizer + hf_config = AutoConfig.from_pretrained( + config.base_model_name_or_path, trust_remote_code=True + ) + hf_arch = getattr(hf_config, "architectures")[0] + if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM": + tokenizer = LlamaTokenizer.from_pretrained( + config.base_model_name_or_path, + use_fast=True, + torch_dtype=torch.float32 if args.use_full_precision else torch.float16, + ) + else: + tokenizer = AutoTokenizer.from_pretrained( + config.base_model_name_or_path, + torch_dtype=torch.float32 if args.use_full_precision else torch.float16, + ) + + # Generation config + generation_config = GenerationConfig.from_pretrained(config.base_model_name_or_path) + generation_config.do_sample = args.do_sample + + # Register hooks to save tensors, if needed + if args.save_peft_tensors: + # Change 
working dir to folder storing this script + abspath = os.path.abspath(__file__) + dname = os.path.dirname(abspath) + os.chdir(dname) + # Create output dir + shutil.rmtree("./hf_peft_tensors") + os.makedirs("./hf_peft_tensors", exist_ok=True) + # Save weights + for name, params in model.named_parameters(): + if "lora" in name: + torch.save(params, f"./hf_peft_tensors/{name}") + # params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") + # Save hidden states + for name, layer in dict(model.named_modules()).items(): + if "lora_A.default" in name or "lora_B.default" in name: + layer.name = name + layer.decoding_step = 0 + print(f"Adding hooks to layer {layer.name}") + layer.register_forward_pre_hook(peft_pre_forward_hook) + layer.register_forward_hook(peft_post_forward_hook) + + # Run inference + # Read prompt-file into a list of strings + with open(args.prompt_file, "r") as f: + try: + prompt_list = json.load(f) + except json.JSONDecodeError: + print(f"Error: Unable to parse {args.prompt_file} as JSON.") + sys.exit(1) + + for i, prompt in enumerate(prompt_list): + batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True) + with torch.cuda.amp.autocast(): + output_tokens = model.generate( + **batch, max_new_tokens=args.max_length, generation_config=generation_config + ) + print("\n\n", tokenizer.decode(output_tokens[0], skip_special_tokens=False)) + + +if __name__ == "__main__": + main() diff --git a/tests/peft/hf_train.py b/tests/peft/hf_train.py new file mode 100644 index 0000000000..707fc9d0ae --- /dev/null +++ b/tests/peft/hf_train.py @@ -0,0 +1,161 @@ +import os, sys + +# os.environ["CUDA_VISIBLE_DEVICES"]="0" +import torch +import torch.nn as nn + +# import bitsandbytes as bnb +from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, LlamaTokenizer +import argparse +from peft import LoraConfig, get_peft_model +import transformers +from datasets import load_dataset + + +class CastOutputToFloat(nn.Sequential): + def forward(self, x): + return super().forward(x).to(torch.float32) + + +def print_trainable_parameters(model): + """ + Prints the number of trainable parameters in the model. 
+ """ + trainable_params = 0 + all_param = 0 + for _, param in model.named_parameters(): + all_param += param.numel() + if param.requires_grad: + trainable_params += param.numel() + print( + f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" + ) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--model-name", type=str, default="meta-llama/Llama-2-7b-hf") + parser.add_argument("--lora-rank", type=int, default=16) + parser.add_argument("--lora-alpha", type=int, default=32) + parser.add_argument( + "--lora-target-modules", + type=str, + default="down_proj", + help="Comma-separated list of layers from the base model to target", + ) + parser.add_argument("--lora-dropout", type=float, default=0.05) + parser.add_argument( + "--use-full-precision", action="store_true", help="Use full precision" + ) + parser.add_argument("--output-dir", type=str, default="") + parser.add_argument("--publish-peft-with-id", type=str, default="") + args = parser.parse_args() + model_name = args.model_name + use_full_precision = args.use_full_precision + lora_rank = args.lora_rank + lora_alpha = args.lora_alpha + lora_target_modules = args.lora_target_modules.split(",") + lora_dropout = args.lora_dropout + output_dir = args.output_dir + publish_peft_with_id = args.publish_peft_with_id + if len(output_dir) == 0 and len(publish_peft_with_id) == 0: + raise ValueError( + "Please pass either a --output-dir or a --publish-peft-with-id to specify where to store the trained model" + ) + + # Change working dir to folder storing this script + abspath = os.path.abspath(__file__) + dname = os.path.dirname(abspath) + os.chdir(dname) + + model = AutoModelForCausalLM.from_pretrained( + model_name, + # load_in_8bit=True, + torch_dtype=torch.float32 if use_full_precision else torch.float16, + device_map="auto", + ) + + # Get Tokenizer + hf_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) + hf_arch = getattr(hf_config, "architectures")[0] + if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM": + tokenizer = LlamaTokenizer.from_pretrained( + model_name, + use_fast=True, + torch_dtype=torch.float32 if use_full_precision else torch.float16, + ) + else: + tokenizer = AutoTokenizer.from_pretrained( + model_name, + torch_dtype=torch.float32 if use_full_precision else torch.float16, + ) + if tokenizer.pad_token is None: + tokenizer.pad_token = "[PAD]" + tokenizer.padding_side = "left" + + for param in model.parameters(): + param.requires_grad = False # freeze the model - train adapters later + if param.ndim == 1: + # cast the small parameters (e.g. 
layernorm) to fp32 for stability + param.data = param.data.to(torch.float32) + + model.gradient_checkpointing_enable() # reduce number of stored activations + model.enable_input_require_grads() + + model.lm_head = CastOutputToFloat(model.lm_head) + + config = LoraConfig( + r=lora_rank, + lora_alpha=lora_alpha, + # target_modules=["q_proj", "v_proj"], + # target_modules=["down_proj"], + target_modules=lora_target_modules, + lora_dropout=lora_dropout, + bias="none", + task_type="CAUSAL_LM", + ) + print(model) + print(model.named_parameters()) + model = get_peft_model(model, config) + print_trainable_parameters(model) + + data = load_dataset("Abirate/english_quotes") + data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True) + + trainer = transformers.Trainer( + model=model, + train_dataset=data["train"], + args=transformers.TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=100, + max_steps=200, + learning_rate=2e-4, + fp16=True if not use_full_precision else False, + logging_steps=1, + output_dir=os.path.join( + output_dir if len(output_dir) > 0 else "./", "lora_training_logs" + ), + ), + data_collator=transformers.DataCollatorForLanguageModeling( + tokenizer, mlm=False + ), + ) + model.config.use_cache = ( + False + ) # silence the warnings. Please re-enable for inference! + trainer.train() + + if len(output_dir) > 0: + print(f"Done training! Saving the model to {output_dir}...") + model.save_pretrained(output_dir) + + if len(publish_peft_with_id) > 0: + print( + f"Done training! Uploading the model to HF hub with id: {publish_peft_with_id}..." + ) + model.push_to_hub(publish_peft_with_id, use_auth_token=True) + + +if __name__ == "__main__": + main() diff --git a/tests/peft/hf_utils.py b/tests/peft/hf_utils.py new file mode 100644 index 0000000000..9332c803b2 --- /dev/null +++ b/tests/peft/hf_utils.py @@ -0,0 +1,352 @@ +import torch +import torch.nn as nn +import transformers +from transformers import ( + TrainerCallback, + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, + LlamaTokenizer, +) +import os, shutil +from peft import PeftConfig, PeftModel +from datasets import load_dataset, DatasetDict + +debug_dir = None +debug_subdirs = ["fwd", "bwd", "optim", "weights"] +verbose = False + + +def make_debug_dirs(): + global debug_dir + global debug_subdirs + debug_dir = os.environ.get("FF_CACHE_PATH", os.path.expanduser("~/.cache/flexflow")) + debug_dir = os.path.join(debug_dir, "debug", "huggingface") + shutil.rmtree(debug_dir, ignore_errors=True) + os.makedirs(debug_dir, exist_ok=True) + assert debug_dir is not None + assert os.path.isdir(debug_dir) + for subdir in debug_subdirs: + subdir_path = os.path.join(debug_dir, subdir) + os.makedirs(subdir_path, exist_ok=False) + + +def get_dst_folder(subdir, step_idx=0): + global debug_dir, debug_subdirs + assert subdir in debug_subdirs + dst_folder = os.path.join(debug_dir, subdir, f"step_{step_idx}") + os.makedirs(dst_folder, exist_ok=True) + return dst_folder + + +def simplify_name(name): + return name.replace("base_model.model.model.", "").replace("base_model.model.", "") + + +def get_optim_type(args): + if args.optimizer == "sgd": + return transformers.training_args.OptimizerNames.SGD + elif args.optimizer == "adam": + return transformers.training_args.OptimizerNames.ADAM + elif args.optimizer == "adamw": + return transformers.training_args.OptimizerNames.ADAMW + else: + raise ValueError(f"Optimizer {args.optimizer} not supported") + + +class 
CastOutputToFloat(nn.Sequential): + def forward(self, x): + return super().forward(x).to(torch.float32) + + +def print_trainable_parameters(model): + """ + Prints the number of trainable parameters in the model. + """ + trainable_params = 0 + all_param = 0 + for _, param in model.named_parameters(): + all_param += param.numel() + if param.requires_grad: + trainable_params += param.numel() + print( + f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" + ) + + +def peft_backward_hook(module, grad_input, grad_output): + assert type(grad_input) == tuple and type(grad_output) == tuple + if len(grad_input) == 0 or len(grad_output) == 0: + return + assert module.name is not None and module.bwd_step is not None + name = simplify_name(module.name) + if verbose: + print( + f"Backward Hook activated for module: {name}, bwd step: {module.bwd_step}" + ) + print("Backward GRAD Output:") + for i, out_grad in enumerate(grad_output): + if type(out_grad) == torch.Tensor: + dst_folder = get_dst_folder("bwd", module.bwd_step) + dst_filepath = os.path.join(dst_folder, f"{name}.output_gradient_{i}") + if verbose: + print("\t", out_grad.shape) + print(f"\t\tSaving to {dst_filepath}") + torch.save(out_grad, dst_filepath) + else: + if verbose: + print(out_grad) + if verbose: + print("Backward GRAD Input:") + for i, in_grad in enumerate(grad_input): + if type(in_grad) == torch.Tensor: + dst_folder = get_dst_folder("bwd", module.bwd_step) + dst_filepath = os.path.join(dst_folder, f"{name}.input_gradient_{i}") + if verbose: + print("\t", in_grad.shape) + print(f"\t\tSaving to {dst_filepath}") + torch.save(in_grad, dst_filepath) + else: + if verbose: + print(in_grad) + if verbose: + print("===") + module.bwd_step += 1 + + +def peft_forward_hook(module, input, output): + if len(input) == 0 or len(output) == 0: + return + assert module.name is not None and module.fwd_step is not None + name = simplify_name(module.name) + if verbose: + print(f"Forward Hook activated for module: {name}, fwd step: {module.fwd_step}") + print("Input:") + if type(input) == torch.Tensor: + if verbose: + print(input.shape) + dst_folder = get_dst_folder("fwd", module.fwd_step) + dst_filepath = os.path.join(dst_folder, f"{name}.input_0") + torch.save(input, dst_filepath) + elif type(input) == tuple: + for i, inp in enumerate(input): + if type(inp) == torch.Tensor: + if verbose: + print(inp.shape) + dst_folder = get_dst_folder("fwd", module.fwd_step) + dst_filepath = os.path.join(dst_folder, f"{name}.input_{i}") + torch.save(inp, dst_filepath) + else: + if verbose: + print(inp) + else: + assert False + if verbose: + print("Output:") + if type(output) == torch.Tensor: + if verbose: + print(output.shape) + dst_folder = get_dst_folder("fwd", module.fwd_step) + dst_filepath = os.path.join(dst_folder, f"{name}.output_0") + torch.save(output, dst_filepath) + elif type(output) == tuple: + for i, out in enumerate(output): + if type(out) == torch.Tensor: + if verbose: + print(out.shape) + dst_folder = get_dst_folder("fwd", module.fwd_step) + dst_filepath = os.path.join(dst_folder, f"{name}.output_{i}") + torch.save(out, dst_filepath) + else: + if verbose: + print(out) + else: + assert False + if verbose: + print("===") + module.fwd_step += 1 + + +def peft_optimizer_hook(model_, callback_func_handle): + def post_hook(optimizer, args, kwargs): + if verbose: + print("Optimizer Hook activated") + bwd_step = callback_func_handle.step_count + for name_, module in model_.named_modules(): + name = 
simplify_name(name_) + for param_name, param in module.named_parameters(recurse=False): + if param.requires_grad: + if verbose: + print( + f"Step #{bwd_step}: Saving weight gradient for {name} ({param.grad.shape})" + ) + dst_folder = get_dst_folder("weights", bwd_step) + dst_filepath = os.path.join(dst_folder, f"{name}.gradient") + torch.save(param.grad, dst_filepath) + + return post_hook + + +class HFTrainingCallBack(TrainerCallback): + def on_train_begin(self, args, state, control, **kwargs): + if verbose: + print("Starting finetuning") + model_ = kwargs.get("model", None) + optim = kwargs.get("optimizer", None) + assert model_ is not None + assert optim is not None + self.step_count = 0 + optim.optimizer.register_step_post_hook(peft_optimizer_hook(model_, self)) + + def save_lora_weights(self, model, pre_finetuning=False): + lora_weights_handles = [ + (simplify_name(name), params) + for name, params in model.named_parameters() + if "lora" in name + ] + for simplified_name, params in lora_weights_handles: + dst_folder = get_dst_folder("weights", self.step_count) + if pre_finetuning: + dst_filepath = os.path.join(dst_folder, f"{simplified_name}_original") + torch.save(params, dst_filepath) + if verbose: + print( + f"Step #{self.step_count}: Saving ORIGINAL weight {simplified_name} ({params.shape})" + ) + else: + dst_filepath = os.path.join(dst_folder, f"{simplified_name}_finetuned") + torch.save(params, dst_filepath) + if verbose: + print( + f"Step #{self.step_count}: Saving FINETUNED weight {simplified_name} ({params.shape})" + ) + if not pre_finetuning: + self.step_count += 1 + + def on_step_end( + self, args, state, control, model, tokenizer, optimizer, lr_scheduler, **kwargs + ): + self.save_lora_weights(model, pre_finetuning=False) + + def on_step_begin( + self, args, state, control, model, tokenizer, optimizer, lr_scheduler, **kwargs + ): + self.save_lora_weights(model, pre_finetuning=True) + + def on_train_end(self, args, state, control, **kwargs): + if verbose: + print(f"Finetuning ended after {self.step_count} steps") + + +def build_peft_config(args, finetuning=False): + peft_config = PeftConfig.from_pretrained(args.peft_model_id) + if peft_config.peft_type != "LORA": + raise ValueError(f"PEFT type {peft_config.peft_type} not supported yet") + if args.lora_alpha > 0.0: + peft_config.lora_alpha = args.lora_alpha + if peft_config.lora_dropout >= 0.0: + peft_config.lora_dropout = args.lora_dropout + # prevent HF from re-inizialing the weights randomly if finetuning + if finetuning: + peft_config.init_lora_weights = False + return peft_config + + +def prepare_model_for_lora_finetuning(model, save_peft_tensors=False): + # Freeze all layers except the LORA ones. Cast small layers to full precision for stability + for name, param in model.named_parameters(): + if "lora" not in name: + param.requires_grad = False # freeze the model - train adapters later + else: + param.requires_grad = True + if param.ndim == 1: + # cast the small parameters (e.g. 
layernorm) to fp32 for stability + param.data = param.data.to(torch.float32) + if not save_peft_tensors: + model.gradient_checkpointing_enable() # reduce number of stored activations + model.enable_input_require_grads() + model.lm_head = CastOutputToFloat(model.lm_head) + return model + + +def build_peft_model(args, peft_config): + # Load base model, and apply the PEFT layer + model = AutoModelForCausalLM.from_pretrained( + peft_config.base_model_name_or_path, + torch_dtype=torch.float32 if args.use_full_precision else torch.float16, + device_map="auto", + ) + model = PeftModel.from_pretrained(model, args.peft_model_id, config=peft_config) + model = prepare_model_for_lora_finetuning(model, args.save_peft_tensors) + return model + + +def get_peft_tokenizer(args, peft_config): + # Get Tokenizer + hf_config = AutoConfig.from_pretrained( + peft_config.base_model_name_or_path, trust_remote_code=True + ) + hf_arch = getattr(hf_config, "architectures")[0] + if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM": + tokenizer = LlamaTokenizer.from_pretrained( + peft_config.base_model_name_or_path, + use_fast=True, + torch_dtype=torch.float32 if args.use_full_precision else torch.float16, + ) + else: + tokenizer = AutoTokenizer.from_pretrained( + peft_config.base_model_name_or_path, + torch_dtype=torch.float32 if args.use_full_precision else torch.float16, + ) + if tokenizer.pad_token is None: + tokenizer.pad_token = "[PAD]" + tokenizer.padding_side = "left" + return tokenizer + + +def register_peft_hooks(model): + # Save hidden states and gradients + for name, layer in dict(model.named_modules()).items(): + layer.name = name + layer.fwd_step = 0 + layer.bwd_step = 0 + if verbose: + print(f"Adding hooks to layer {layer.name}") + layer.register_forward_hook(peft_forward_hook) + layer.register_full_backward_hook(peft_backward_hook) + + +def save_peft_weights(model, target_modules=[]): + # Save any weights of interest + for name, params in model.named_parameters(): + simplified_name = simplify_name(name) + for target_module in target_modules: + if target_module in name: + dst_folder = get_dst_folder("weights") + dst_filepath = os.path.join(dst_folder, f"{simplified_name}") + torch.save(params, dst_filepath) + + +def filter_dataset_for_debugging(data, key_to_filter, desired_value): + filtered_dataset_dict = DatasetDict() + for split, dataset in data.items(): + filtered_dataset = dataset.filter( + lambda example: example[key_to_filter] == desired_value + ) + filtered_dataset_dict[split] = filtered_dataset + data = filtered_dataset_dict + return data + + +def save_finetuned_model(model, args): + if len(args.output_dir) > 0: + if verbose: + print(f"Saving the model to {args.output_dir}...") + model.save_pretrained(args.output_dir) + + if len(args.publish_peft_with_id) > 0: + if verbose: + print( + f"Uploading the model to HF hub with id: {args.publish_peft_with_id}..." 
+ ) + model.push_to_hub(args.publish_peft_with_id, use_auth_token=True) diff --git a/tests/peft/peft_alignment_test.py b/tests/peft/peft_alignment_test.py new file mode 100644 index 0000000000..266bb64137 --- /dev/null +++ b/tests/peft/peft_alignment_test.py @@ -0,0 +1,730 @@ +import numpy as np +import os, torch, argparse +from alignment.align_test_utils import * +from transformers import AutoConfig +from peft import PeftConfig +from tqdm import tqdm + +class AlignmentTest: + def __init__(self, model_name, tp_degree=1): + raise NotImplementedError() + def check_weights_alignment(self): + raise NotImplementedError() + def check_fwd_pass(self): + raise NotImplementedError() + def check_bwd_pass(self): + raise NotImplementedError() + def check_step(self, step_idx, learning_rate=0.001): + raise NotImplementedError() + +class LllamaAlignmentTest(AlignmentTest): + def __init__(self, model_name, tp_degree=1): + self.model_name = model_name + self.peft_config = PeftConfig.from_pretrained(model_name) + self.hf_config = AutoConfig.from_pretrained(self.peft_config.base_model_name_or_path) + self.num_layers = self.hf_config.num_hidden_layers + self.hidden_size = self.hf_config.hidden_size + self.intermediate_size = self.hf_config.intermediate_size + self.num_attention_heads = self.hf_config.num_attention_heads + self.num_key_value_heads = self.num_attention_heads + self.projsize = self.hidden_size // self.num_attention_heads + self.tp_degree = tp_degree + self.lora_scaling_factor = self.peft_config.lora_alpha / self.peft_config.r + + self.num_tokens = None + self.ff_batch_size = None + + + def check_weights_alignment(self): + def convert_hf_filename_to_ff(hf_filename): + if hf_filename == "lm_head.weight": + f_version = f"layers.{self.num_layers-1}.lm_head.weight_0" + elif hf_filename == "norm.weight": + f_version = f"layers.{self.num_layers-1}.norm.weight_0" + else: + f_version = "" + if hf_filename.startswith("layers."): + layernum = hf_filename.split("layers.")[1].split(".")[0] + f_version += f"layers.{layernum}." 
+ f_version += hf_filename.replace(".base_layer", "").replace(".default", "") + # compute weight index, then rename lora if needed if needed + weight_index="0" + if "lora_A" in f_version: + weight_index="A" + elif "lora_B" in f_version: + weight_index="B" + f_version = f_version.replace("lora_A", "lora").replace("lora_B", "lora") + if f_version.endswith(".weight"): + if weight_index == "0": + f_version += f"_{weight_index}" + else: + f_version += f"_{weight_index}.original" + elif f_version.endswith(".gradient"): + prefix = f_version.split(".gradient")[0] + f_version = prefix + f".weight_{weight_index}.gradient" + return f_version + def get_tp_partition_dim(ff_weight_name) -> int: + # MLP layers split the intermediate size dimension + # gate_proj, up_proj: [hidden_size, intermediate_size] + # down_proj: [intermediate_size, hidden_size] + if self.tp_degree == 1: + return -1 + if "lora.weight_B" in ff_weight_name: + return -1 + if "lm_head" in ff_weight_name or "norm" in ff_weight_name: + return 1 + if "gate_proj" in ff_weight_name or "up_proj" in ff_weight_name: + return 1 + elif "down_proj" in ff_weight_name: + return 0 + else: + return -1 + print("-- Weights alignment --") + hf_weights_folder = os.path.join(hf_path, "weights", "step_0") + ff_weights_folder = os.path.join(ff_path, "weights", "step_0", "shard_0") + files_list = os.listdir(hf_weights_folder) + for hf_weight_name in tqdm(sorted(files_list)): + if hf_weight_name.endswith(".weight"): + ff_weight_name = convert_hf_filename_to_ff(hf_weight_name) + # print(hf_weight_name, ff_weight_name) + hf_w_path = os.path.join(hf_weights_folder, hf_weight_name) + ff_w_path = os.path.join(ff_weights_folder, ff_weight_name) + if not os.path.isfile(hf_w_path): + print(f"File '{hf_w_path}' not found") + if not os.path.isfile(ff_w_path): + print(f"File '{ff_w_path}' not found") + assert(os.path.isfile(hf_w_path)) + assert(os.path.isfile(ff_w_path)) + + # 1. get shape of hf weight + hf_weight = torch.load(hf_w_path, map_location='cpu') + hf_weigth_shape = hf_weight.shape + ff_partition_dim = get_tp_partition_dim(ff_weight_name) + ff_weigth_shape = list(hf_weigth_shape)[::-1] + if ff_partition_dim >= 0: + ff_weigth_shape[ff_partition_dim] //= self.tp_degree + + # 2. handle flexflow shards in case of tensor parallelism + ff_weights = [load_ff_tensor(ff_w_path.replace("shard_0", f"shard_{tp_idx}"), ff_weigth_shape) for tp_idx in range(self.tp_degree)] + if self.tp_degree > 1: + if ff_partition_dim >= 0: + ff_weight = np.concatenate(ff_weights, axis=ff_partition_dim) + else: + assert(are_np_arrays_identical(ff_weights)) + ff_weight = ff_weights[0] + else: + ff_weight = ff_weights[0] + ff_weight = torch.from_numpy(ff_weight).to(hf_weight.dtype) + + # check equivalence + try: + torch.testing.assert_close(ff_weight, hf_weight.T) + except Exception as e: + print(f"Error comparing {ff_w_path} weight to {hf_w_path}:\n{e}\n") + raise e + + def check_fwd_pass(self, step_idx=0): + hf_fwd_folder = os.path.join(hf_path, "fwd", f"step_{step_idx}") + ff_fwd_folder = os.path.join(ff_path, "fwd", f"step_{step_idx}", "shard_0") + + def convert_hf_filename_to_ff(hf_filename): + if hf_filename == "embed_tokens": + f_version = f"layers.0.embed_tokens" + elif hf_filename == "lm_head" or hf_filename == "norm": + f_version = f"layers.{self.num_layers-1}.{hf_filename}" + else: + assert hf_filename.startswith("layers.") + layernum = hf_filename.split("layers.")[1].split(".")[0] + f_version = f"layers.{layernum}." 
+ f_version += hf_filename.replace(".base_layer", "").replace(".default", "") + # right now, attention in flexflow is done with a single operator, so there is a single output file without the projection suffix + f_version = f_version.replace(".q_proj", "").replace(".k_proj", "").replace(".v_proj", "").replace(".o_proj", "") + # lora in HuggingFace is split into A and B operators, in FF we use a single operator. + f_version = f_version.replace("lora_A", "lora").replace("lora_B", "lora") + return f_version + + def get_hf_tensor(hf_tensor_name, tensor_comparison_idx): + hf_tensor_filename = f"{hf_tensor_name}.{tensor_comparison_idx.hf_tensor_type}_{tensor_comparison_idx.hf_tensor_idx}" + hf_tensor_path = os.path.join(hf_fwd_folder, hf_tensor_filename) + + if not os.path.isfile(hf_tensor_path): + raise FileNotFoundError(f"File '{hf_tensor_path}' not found") + hf_tensor = torch.load(hf_tensor_path, map_location='cpu') + if hf_tensor_name == "embed_tokens": + self.num_tokens = hf_tensor.shape[1] + return hf_tensor + + def get_ff_tensor(ff_tensor_name, tensor_comparison_idx, hf_shape, tp_type=TPType.REPLICATE): + ff_tensor_suffix = f".{tensor_comparison_idx.ff_tensor_type}" if len(tensor_comparison_idx.ff_tensor_type) > 0 else "" + ff_tensor_idx_suffix = f"_{tensor_comparison_idx.ff_tensor_idx}" if tensor_comparison_idx.ff_tensor_idx is not None else "" + ff_tensor_filename = f"{ff_tensor_name}{ff_tensor_suffix}{ff_tensor_idx_suffix}" + ff_tensor_path = os.path.join(ff_fwd_folder, ff_tensor_filename) + if not os.path.isfile(ff_tensor_path): + raise FileNotFoundError(f"File '{ff_tensor_path}' not found") + + ff_shape = list(hf_shape)[::-1] + if tp_type == TPType.PARTITION: + ff_shape[0] //= self.tp_degree + + if "layers.0.embed_tokens.input_0" in ff_tensor_path: + # get number of tokens + ff_tensor = np.loadtxt(ff_tensor_path, delimiter=',') + self.ff_batch_size = ff_tensor.shape[0] + + ff_shape = replace_value(ff_shape, self.num_tokens, self.ff_batch_size) + ff_tensors = [load_ff_tensor(ff_tensor_path.replace("shard_0", f"shard_{tp_idx}"), ff_shape) for tp_idx in range(self.tp_degree)] + if self.tp_degree > 1: + # if replicate, check that they are identical + if tp_type == TPType.REPLICATE: + assert(are_np_arrays_identical(ff_tensors)) + ff_tensor = ff_tensors[0] + # if partition, concatenate along the partition dimension + elif tp_type == TPType.PARTITION: + ff_tensor = np.concatenate(ff_tensors, axis=0) + # if to_reduce, sum along the partition dimension + elif tp_type == TPType.TO_REDUCE: + ff_tensor = np.sum(ff_tensors, axis=0) + else: + ff_tensor = ff_tensors[0] + ff_tensor = torch.from_numpy(ff_tensor) + ff_tensor = truncate_dimension(ff_tensor, self.ff_batch_size, self.num_tokens) + return ff_tensor + + def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance=1e-2): + ff_tensor = ff_tensor.to(hf_tensor.dtype) + hf_tensor = hf_tensor.T + if additional_ff_tensor is not None: + additional_ff_tensor = additional_ff_tensor.to(hf_tensor.dtype) + ff_tensor = ff_tensor - additional_ff_tensor + try: + # torch.testing.assert_close(hf_tensor, ff_tensor, rtol=1.3e-6, atol=tolerance) + if not np.allclose(hf_tensor.detach().numpy(), ff_tensor.detach().numpy(), atol=tolerance): + mismatches = np.where(~np.isclose(hf_tensor.detach().numpy(), ff_tensor.detach().numpy(), atol=tolerance))[0] + print(f"Pct mismatch {label}: {100.0*(np.prod(mismatches.shape) / ff_tensor.numel()):.3f}%") + assert(np.prod(mismatches.shape) <= .05 * ff_tensor.numel()) + except Exception as e: + 
print(f"Error in comparison {label}:\n{e}\n") + print("HF tensor:") + print(hf_tensor.squeeze()) + print("FF tensor:") + print(ff_tensor.squeeze()) + raise e + + print(f"-- FWD pass {step_idx}--") + + # Embedding layer + hf_tensor_name = "embed_tokens" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label="Embedding input") + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label="Embedding output") + + # Transformers blocks + for i in range(self.num_layers): + # Input laye norm + hf_tensor_name = f"layers.{i}.input_layernorm" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + if i == 0: + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + else: + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=1) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label=f"Input layernorm {i} input") + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label=f"Input layernorm {i} output") + + # Attention + hf_tensor_name = f"layers.{i}.self_attn.o_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + compare(hf_tensor, ff_tensor, label=f"Attention {i} output") + + # Post-attention layernorm + hf_tensor_name = f"layers.{i}.post_attention_layernorm" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=1) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label=f"Post-attention layernorm {i} output") + + # W1 (gate_proj) + hf_tensor_name = f"layers.{i}.mlp.gate_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"W1 
{i} output") + + # W3 (up_proj) + hf_tensor_name = f"layers.{i}.mlp.up_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"W3 {i} output") + + # W2 (down_proj) + hf_tensor_name = f"layers.{i}.mlp.down_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_down_proj_out = get_hf_tensor(hf_tensor_name, output_comparison) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"W2 {i} input") + + hf_down_proj_in = hf_tensor.clone() + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_down_proj_out = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + + # LoRA_A + hf_tensor_name = f"layers.{i}.mlp.down_proj.lora_A.default" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"LoRA_A {i} input") + torch.testing.assert_close(hf_down_proj_in, hf_tensor, rtol=1.3e-6, atol=1e-5) + + # LoRA intermediate + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="low_rank_activation", hf_tensor_idx=0, ff_tensor_idx=None) + hf_lora_A_out = get_hf_tensor(hf_tensor_name, output_comparison) + hf_tensor_name = f"layers.{i}.mlp.down_proj.lora_B.default" + hf_lora_B_in = get_hf_tensor(hf_tensor_name, input_comparison) + torch.testing.assert_close(hf_lora_A_out, hf_lora_B_in, rtol=1.3e-6, atol=1e-5) + ff_tensor_name = f"layers.{i}.layers.{i}.mlp.down_proj.lora" + ff_lora_A_out = get_ff_tensor(ff_tensor_name, output_comparison, hf_lora_A_out.shape, tp_type=TPType.TO_REDUCE) + compare(hf_lora_A_out, ff_lora_A_out, label=f"LoRA_A {i} output") + + # LoRA_B + hf_tensor_name = f"layers.{i}.mlp.down_proj.lora_B.default" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) * self.lora_scaling_factor + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_down_proj_out.shape, tp_type=TPType.TO_REDUCE) + compare(hf_down_proj_out, ff_tensor, label=f"W2_out + scaling*LoRA_B_out {i}") + compare(hf_tensor, ff_tensor, additional_ff_tensor=ff_down_proj_out, label=f"LoRA_B {i} output") + + # Norm + hf_tensor_name = "norm" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = 
TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=1) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label="Norm output") + + # LM head + hf_tensor_name = "lm_head" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) + compare(hf_tensor, ff_tensor, label="LM head input") + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label="LM head output") + + def check_bwd_pass(self, step_idx=0): + if not self.num_tokens or not self.ff_batch_size: + raise ValueError("Number of tokens and batch size must be set before running backward pass check") + hf_bwd_folder = os.path.join(hf_path, "bwd", f"step_{step_idx}") + ff_bwd_folder = os.path.join(ff_path, "bwd", f"step_{step_idx}", "shard_0") + + def convert_hf_filename_to_ff(hf_filename): + if hf_filename == "embed_tokens": + f_version = f"layers.0.embed_tokens" + elif hf_filename == "lm_head" or hf_filename == "norm": + f_version = f"layers.{self.num_layers-1}.{hf_filename}" + else: + assert hf_filename.startswith("layers.") + layernum = hf_filename.split("layers.")[1].split(".")[0] + f_version = f"layers.{layernum}." + f_version += hf_filename.replace(".base_layer", "").replace(".default", "") + # right now, attention in flexflow is done with a single operator, so there is a single output file without the projection suffix + # f_version = f_version.replace(".q_proj", "").replace(".k_proj", "").replace(".v_proj", "").replace(".o_proj", "") + # lora in HuggingFace is split into A and B operators, in FF we use a single operator. 
+ f_version = f_version.replace("lora_A", "lora").replace("lora_B", "lora") + return f_version + + def get_hf_tensor(hf_tensor_name, tensor_comparison_idx): + hf_tensor_filename = f"{hf_tensor_name}.{tensor_comparison_idx.hf_tensor_type}_{tensor_comparison_idx.hf_tensor_idx}" + hf_tensor_path = os.path.join(hf_bwd_folder, hf_tensor_filename) + + if not os.path.isfile(hf_tensor_path): + raise FileNotFoundError(f"File '{hf_tensor_path}' not found") + hf_tensor = torch.load(hf_tensor_path, map_location='cpu') + return hf_tensor + + def get_ff_tensor(ff_tensor_name, tensor_comparison_idx, hf_shape, tp_type=TPType.REPLICATE, pre=False, shard_axis=0): + ff_tensor_suffix = f".{tensor_comparison_idx.ff_tensor_type}" if len(tensor_comparison_idx.ff_tensor_type) > 0 else "" + ff_tensor_idx_suffix = f"_{tensor_comparison_idx.ff_tensor_idx}" if tensor_comparison_idx.ff_tensor_idx is not None else "" + ff_tensor_filename = f"{ff_tensor_name}{ff_tensor_suffix}{ff_tensor_idx_suffix}" + + ff_tensor_path = os.path.join(ff_bwd_folder, ff_tensor_filename) + if pre: + ff_tensor_path = ff_tensor_path.replace(f"step_{step_idx}", f"step_{step_idx}_pre") + if not os.path.isfile(ff_tensor_path): + raise FileNotFoundError(f"File '{ff_tensor_path}' not found") + + ff_shape = list(hf_shape)[::-1] + if tp_type == TPType.PARTITION: + ff_shape[shard_axis] //= self.tp_degree + + # exception: intermediate attention tensors + intermediate_attention_tensor = ( + "self_attn" in ff_tensor_name and + not ( + ff_tensor_name.endswith(".self_attn") and + ( + tensor_comparison_idx.ff_tensor_type == "output_gradient" or + tensor_comparison_idx.ff_tensor_type == "input_gradient" + ) + ) + ) + if not intermediate_attention_tensor: + ff_shape = replace_value(ff_shape, self.num_tokens, self.ff_batch_size) + + ff_tensors = [load_ff_tensor(ff_tensor_path.replace("shard_0", f"shard_{tp_idx}"), ff_shape) for tp_idx in range(self.tp_degree)] + if self.tp_degree > 1: + # if replicate, check that they are identical + if tp_type == TPType.REPLICATE: + assert(are_np_arrays_identical(ff_tensors)) + ff_tensor = ff_tensors[0] + # if partition, concatenate along the partition dimension + elif tp_type == TPType.PARTITION: + ff_tensor = np.concatenate(ff_tensors, axis=shard_axis) + # if to_reduce, sum along the partition dimension + elif tp_type == TPType.TO_REDUCE: + ff_tensor = np.sum(ff_tensors, axis=shard_axis) + else: + ff_tensor = ff_tensors[0] + ff_tensor = torch.from_numpy(ff_tensor) + if not intermediate_attention_tensor: + ff_tensor = truncate_dimension(ff_tensor, self.ff_batch_size, self.num_tokens) + return ff_tensor + + def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance=1e-3): + ff_tensor = ff_tensor.to(hf_tensor.dtype) + hf_tensor = hf_tensor.T + if additional_ff_tensor is not None: + additional_ff_tensor = additional_ff_tensor.to(hf_tensor.dtype) + ff_tensor = ff_tensor - additional_ff_tensor + try: + # torch.testing.assert_close(hf_tensor, ff_tensor, rtol=rtol, atol=tolerance) + if not np.allclose(hf_tensor.numpy(), ff_tensor.numpy(), atol=tolerance): + mismatches = np.where(~np.isclose(hf_tensor, ff_tensor, atol=tolerance))[0] + print(f"Pct mismatch {label}: {100.0*(np.prod(mismatches.shape) / ff_tensor.numel()):.3f}%") + assert(np.prod(mismatches.shape) <= .06 * ff_tensor.numel()) + except Exception as e: + print(f"Error in comparison {label}:\n{e}\n") + print("HF tensor:") + print(hf_tensor.squeeze()) + print("FF tensor:") + print(ff_tensor.squeeze()) + raise e + + print(f"-- BWD pass {step_idx}--") + 
+ # LM head + hf_tensor_name = "lm_head" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label="LM head gradient output") + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, TPType.TO_REDUCE) + compare(hf_tensor, ff_tensor, label="LM head gradient input") + + # Norm + hf_tensor_name = "norm" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) + compare(hf_tensor, ff_tensor, label="Norm gradient output") + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label="Norm gradient input") + + # Transformers blocks + for i in range(self.num_layers-1, -1, -1): + # W2 (down_proj) output + hf_tensor_name = f"layers.{i}.mlp.down_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) + compare(hf_tensor, ff_tensor, label=f"W2 {i} gradient output") + + # LoRA_B + hf_tensor_name = f"layers.{i}.mlp.down_proj.lora_B.default" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) * self.lora_scaling_factor + compare(hf_tensor, ff_tensor, label=f"LoRA_B {i} gradient output") + + # LoRA_A + hf_tensor_name = f"layers.{i}.mlp.down_proj.lora_A.default" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"LoRA_A {i} gradient input") + + # W2 (down_proj) input + hf_tensor_name = f"layers.{i}.mlp.down_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", 
hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"W2 {i} gradient input") + + # W2 input (HF) and SigmoidSiluMulti output (FF) + hf_w2_input = hf_tensor.clone() + ff_tensor_name = f"layers.{i}.SigmoidSiluMulti" + output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_w2_input, ff_tensor, label=f"HF W2 {i} output and FF SSM output") + + # W1 (gate_proj) output + hf_tensor_name = f"layers.{i}.mlp.gate_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"W1 {i} gradient output") + # W1 (gate_proj) input + # HF W1 in = FF W1 in - HF W1 in (pre) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + ff_tensor_pre = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE, pre=True) + compare(hf_tensor, ff_tensor, additional_ff_tensor=ff_tensor_pre, label=f"W1 {i} gradient input") + + # W3 (up_proj) output + hf_tensor_name = f"layers.{i}.mlp.up_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"W3 {i} gradient output") + # W3 (up_proj) input + input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + compare(hf_tensor, ff_tensor, label=f"W3 {i} gradient input") + + # Attn O-proj + hf_tensor_name = f"layers.{i}.self_attn.o_proj" + ff_tensor_name = f"layers.{i}.layers.{i}.self_attn" + output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) + compare(hf_tensor, ff_tensor, label=f"Attn O-proj {i} gradient output") + ff_tensor_name = f"layers.{i}.layers.{i}.self_attn.o_proj" + input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = 
get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"Attn O-proj {i} gradient input") + + # V-proj grads + # FF shape: [num_tokens, qProjSize*num_heads] + hf_tensor_name = f"layers.{i}.self_attn.v_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + mixed_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, mixed_comparison) + hf_tensor = hf_tensor.squeeze().T + ff_tensor = get_ff_tensor(ff_tensor_name, mixed_comparison, hf_tensor.shape, tp_type=TPType.PARTITION, shard_axis=1) + compare(hf_tensor, ff_tensor, label=f"V-proj {i} gradient input") + + # K-proj grads + # FF shape: (num_tokens, qProjSize, num_heads) + hf_tensor_name = f"layers.{i}.self_attn.k_proj" + ff_tensor_name = f"layers.{i}.layers.{i}.self_attn" + k_proj_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="devkproj", hf_tensor_idx=0, ff_tensor_idx=None) + hf_tensor = get_hf_tensor(hf_tensor_name, k_proj_comparison) + hf_tensor = hf_tensor.squeeze().view(self.num_tokens, self.num_attention_heads, self.projsize).transpose(1, 2).contiguous() + hf_tensor = hf_tensor.T + ff_tensor = get_ff_tensor(ff_tensor_name, k_proj_comparison, hf_tensor.shape, tp_type=TPType.PARTITION, shard_axis=2) + compare(hf_tensor, ff_tensor, label=f"K-proj {i} gradient input") + + # Q-proj grads + # FF shape (devQKVPRojArray): (num_tokens, qProjSize, num_heads, 3) + # Q-proj out grad: devQKVPRojArray[:,:,:,0] + hf_tensor_name = f"layers.{i}.self_attn.q_proj" + ff_tensor_name = f"layers.{i}.layers.{i}.self_attn.devQKVPRojArray" + q_proj_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="", hf_tensor_idx=0, ff_tensor_idx=None) + hf_tensor = get_hf_tensor(hf_tensor_name, q_proj_comparison) + hf_tensor = hf_tensor.view(self.num_tokens, self.num_attention_heads, self.projsize).transpose(1, 2).contiguous().T + augmented_hf_tensor_shape = torch.Size([3]+list(hf_tensor.size())) + ff_tensor = get_ff_tensor(ff_tensor_name, q_proj_comparison, augmented_hf_tensor_shape, tp_type=TPType.PARTITION, shard_axis=2)[:,:,:,0] + compare(hf_tensor, ff_tensor, label=f"Q-proj {i} gradient input") + + # FF Attn input with HF layernorm out + hf_tensor_name = f"layers.{i}.input_layernorm" + ff_tensor_name = f"layers.{i}.layers.{i}.self_attn" + input_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + compare(hf_tensor, ff_tensor, label=f"Attn input {i} gradient input") + + if i > 0: + # FF attn input with FF layernorm out 1 + attn_input = ff_tensor.clone() + ff_tensor_name = f"layers.{i}.layers.{i}.input_layernorm" + _output_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=1) + input_layernorm_out1 = get_ff_tensor(ff_tensor_name, _output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) + torch.testing.assert_close(attn_input, input_layernorm_out1, rtol=1.3e-6, atol=1e-5) + + # Input layernorm + + hf_tensor_name = f"layers.{i}.input_layernorm" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = 
TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + ff_in1_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=1) + input_layernorm0 = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) + input_layernorm1 = get_ff_tensor(ff_tensor_name, ff_in1_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) + torch.testing.assert_close(input_layernorm0, input_layernorm1, rtol=1.3e-6, atol=1e-5) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + # if i > 1: + # compare(hf_tensor, input_layernorm1, label=f"Input layernorm {i} gradient input") + + def check_step(self, step_idx=0, learning_rate=0.001): + hf_weight_folder = os.path.join(hf_path, "weights", f"step_{step_idx}") + ff_weight_folder = os.path.join(ff_path, "weights", f"step_{step_idx}", "shard_0") + def convert_hf_filename_to_ff(hf_filename): + assert hf_filename.startswith("layers.") + layernum = hf_filename.split("layers.")[1].split(".")[0] + f_version = f"layers.{layernum}." + f_version += hf_filename.replace(".base_layer", "").replace(".default", "") + # lora in HuggingFace is split into A and B operators, in FF we use a single operator. + f_version = f_version.replace("lora_A", "lora.weight_A").replace("lora_B", "lora.weight_B") + return f_version + def get_hf_tensor(hf_tensor_name): + hf_tensor_path = os.path.join(hf_weight_folder, hf_tensor_name) + + if not os.path.isfile(hf_tensor_path): + raise FileNotFoundError(f"File '{hf_tensor_path}' not found") + hf_tensor = torch.load(hf_tensor_path, map_location='cpu') + return hf_tensor + def get_ff_tensor(ff_tensor_name, hf_shape, tp_type=TPType.REPLICATE, pre=False): + ff_tensor_path = os.path.join(ff_weight_folder, ff_tensor_name) + if pre: + ff_tensor_path = ff_tensor_path.replace(f"step_{step_idx}", f"step_{step_idx}_pre") + if not os.path.isfile(ff_tensor_path): + raise FileNotFoundError(f"File '{ff_tensor_path}' not found") + + ff_shape = list(hf_shape)[::-1] + if tp_type == TPType.PARTITION: + ff_shape[0] //= self.tp_degree + + ff_tensors = [load_ff_tensor(ff_tensor_path.replace("shard_0", f"shard_{tp_idx}"), ff_shape) for tp_idx in range(self.tp_degree)] + if self.tp_degree > 1: + # if replicate, check that they are identical + if tp_type == TPType.REPLICATE: + assert(are_np_arrays_identical(ff_tensors)) + ff_tensor = ff_tensors[0] + # if partition, concatenate along the partition dimension + elif tp_type == TPType.PARTITION: + ff_tensor = np.concatenate(ff_tensors, axis=0) + # if to_reduce, sum along the partition dimension + elif tp_type == TPType.TO_REDUCE: + ff_tensor = np.sum(ff_tensors, axis=0) + else: + ff_tensor = ff_tensors[0] + ff_tensor = torch.from_numpy(ff_tensor) + return ff_tensor + def compare(hf_tensor, ff_tensor, label="", tolerance=1e-4): + ff_tensor = ff_tensor.to(hf_tensor.dtype) + hf_tensor = hf_tensor.T + try: + # torch.testing.assert_close(hf_tensor, ff_tensor, rtol=rtol, atol=tolerance) + if not np.allclose(hf_tensor.numpy(), ff_tensor.numpy(), atol=tolerance): + mismatches = np.where(~np.isclose(hf_tensor, ff_tensor, atol=tolerance))[0] + print(f"Pct mismatch {label}: {100.0*(np.prod(mismatches.shape) / ff_tensor.numel()):.3f}%") + assert(np.prod(mismatches.shape) <= .05 * ff_tensor.numel()) + except Exception as e: + print(f"Error in comparison {label}:\n{e}\n") + print("HF tensor:") + print(hf_tensor.squeeze()) + print("FF tensor:") + 
print(ff_tensor.squeeze()) + raise e + print(f"-- optimizer pass {step_idx}--") + + for i in range(self.num_layers-1, -1, -1): + # LoRA_B gradient + hf_gradient_name = f"layers.{i}.mlp.down_proj.lora_B.default.gradient" + hf_gradient = get_hf_tensor(hf_gradient_name) + hf_original_weight_name = f"layers.{i}.mlp.down_proj.lora_B.default.weight_original" + hf_original_weight = get_hf_tensor(hf_original_weight_name) + hf_finetuned_weight_name = f"layers.{i}.mlp.down_proj.lora_B.default.weight_finetuned" + hf_finetuned_weight = get_hf_tensor(hf_finetuned_weight_name) + torch.testing.assert_close(hf_gradient, (hf_original_weight-hf_finetuned_weight)/learning_rate, rtol=1.3e-6, atol=1e-5) + ff_gradient_name = convert_hf_filename_to_ff(hf_gradient_name) + ff_gradient = get_ff_tensor(ff_gradient_name, hf_gradient.shape, tp_type=TPType.REPLICATE) + compare(hf_gradient, ff_gradient, label=f"LoRA_B {i} gradient") + # ff_out_gradient_name = f"layers.{i}.layers.{i}.mlp.down_proj.lora.output_gradient_0" + # ff_fwd_folder = os.path.join(ff_path, "fwd", f"step_{step_idx}", "shard_0") + # ff_bwd_folder = os.path.join(ff_path, "bwd", f"step_{step_idx}", "shard_0") + # ff_out_gradient = load_ff_tensor(os.path.join(ff_bwd_folder, ff_out_gradient_name), [self.hidden_size, 128])[:,:self.num_tokens] + # ff_out_gradient = torch.from_numpy(ff_out_gradient) + # print("Output gradient shape: ", ff_out_gradient.shape) + # ff_low_rank_activation = f"layers.{i}.layers.{i}.mlp.down_proj.lora.low_rank_activation" + # ff_low_rank_activation = load_ff_tensor(os.path.join(ff_fwd_folder, ff_low_rank_activation), [16, 128])[:,:self.num_tokens] + # ff_low_rank_activation = torch.from_numpy(ff_low_rank_activation) + # print("Low rank activation shape: ", ff_low_rank_activation.shape) + # simulated_weight_grad = ff_low_rank_activation @ ff_out_gradient.T + # print("Simulated weight grad shape: ", simulated_weight_grad.shape) + # print(simulated_weight_grad) + # print(ff_gradient) + # compare(hf_gradient, simulated_weight_grad, label=f"LoRA_B {i} simulated gradient") + + + # LoRA_A gradient + hf_gradient_name = f"layers.{i}.mlp.down_proj.lora_A.default.gradient" + hf_gradient = get_hf_tensor(hf_gradient_name) + ff_gradient_name = convert_hf_filename_to_ff(hf_gradient_name) + hf_original_weight_name = f"layers.{i}.mlp.down_proj.lora_A.default.weight_original" + hf_original_weight = get_hf_tensor(hf_original_weight_name) + hf_finetuned_weight_name = f"layers.{i}.mlp.down_proj.lora_A.default.weight_finetuned" + hf_finetuned_weight = get_hf_tensor(hf_finetuned_weight_name) + torch.testing.assert_close(hf_gradient, (hf_original_weight-hf_finetuned_weight)/learning_rate, rtol=1.3e-6, atol=1e-5) + ff_gradient_name = convert_hf_filename_to_ff(hf_gradient_name) + ff_gradient = get_ff_tensor(ff_gradient_name, hf_gradient.shape, tp_type=TPType.PARTITION) + compare(hf_gradient, ff_gradient, label=f"LoRA_A {i} gradient") + +parser = argparse.ArgumentParser(description='Argument Parser Example') +# Adding arguments +parser.add_argument('-m', '--model-name', type=str, default="goliaro/llama-160m-lora", help='Name of the model') +parser.add_argument('-n', '--num-steps', type=int, default=1, help='Number of finetuning steps') +parser.add_argument('-tp', '--tensor-parallelism-degree', type=int, default=1, help='The tensor parallelism degree used when running FlexFlow') +parser.add_argument('-lr', '--learning-rate', type=float, default=0.001, help='The learning rate used at finetuning time') + +# Parse the arguments from command line +args = 
parser.parse_args()
+
+if __name__ == "__main__":
+    llama_alignment = LllamaAlignmentTest(args.model_name, tp_degree=args.tensor_parallelism_degree)
+    # llama_alignment.check_weights_alignment()
+    for i in range(args.num_steps):
+        llama_alignment.check_fwd_pass(i)
+        llama_alignment.check_bwd_pass(i)
+        llama_alignment.check_step(i, args.learning_rate)
diff --git a/tests/peft_test.sh b/tests/peft_test.sh
new file mode 100755
index 0000000000..5600d57edf
--- /dev/null
+++ b/tests/peft_test.sh
@@ -0,0 +1,66 @@
+#! /usr/bin/env bash
+# set -x
+set -e
+
+cleanup() {
+    rm -rf ~/.cache/flexflow/debug
+}
+
+# Cd into directory holding this script
+cd "${BASH_SOURCE[0]%/*}/.."
+
+# Token to access private huggingface models (e.g. LLAMA-2)
+HUGGINGFACE_TOKEN=${HUGGINGFACE_TOKEN:-none}
+if [[ "$HUGGINGFACE_TOKEN" != "none" ]]; then
+    huggingface-cli login --token "$HUGGINGFACE_TOKEN"
+fi
+
+# Clean up before test (just in case)
+cleanup
+
+# Create test prompt file
+mkdir -p ./inference/prompt
+echo '["Two things are infinite: "]' > ./inference/prompt/peft.json
+echo '["“Two things are infinite: the universe and human stupidity; and I'\''m not sure about the universe.”"]' > ./inference/prompt/peft_dataset.json
+
+
+# Create output folder
+mkdir -p ./inference/output
+
+# Enable backtrace in case we run into a segfault or assertion failure
+export LEGION_BACKTRACE=1
+
+# Download test model
+python ./inference/utils/download_peft_model.py goliaro/llama-160m-lora --base_model_name JackFram/llama-160m
+
+# Run PEFT in Huggingface to get ground truth tensors
+python ./tests/peft/hf_finetune.py --peft-model-id goliaro/llama-160m-lora --save-peft-tensors --use-full-precision
+
+# Python test
+echo "Python test"
+python ./inference/python/ff_peft.py
+# Check alignment
+python ./tests/peft/peft_alignment_test.py -tp 2
+
+# C++ test
+echo "C++ test"
+./build/inference/peft/peft \
+    -ll:gpu 2 -ll:cpu 4 -ll:util 4 \
+    -tensor-parallelism-degree 2 \
+    -ll:fsize 8192 -ll:zsize 12000 \
+    -llm-model JackFram/llama-160m \
+    -finetuning-dataset ./inference/prompt/peft_dataset.json \
+    -peft-model goliaro/llama-160m-lora \
+    -enable-peft \
+    --use-full-precision \
+    --inference-debugging
+# Check alignment
+python ./tests/peft/peft_alignment_test.py -tp 2
+
+# Print success message
+echo ""
+echo "PEFT tests passed!"
+echo "" + +# Cleanup after the test +cleanup From 0ba7c9f1a90fa4ae2b800fd852194e7b7d15dca8 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 21 Sep 2024 12:41:04 -0700 Subject: [PATCH 21/44] Update nccl (#1507) * update nccl * fix * update --------- Co-authored-by: Ubuntu --- cmake/nccl.cmake | 200 +++++++++++++++------------------------ deps/nccl | 2 +- docker/run.sh | 12 +-- tests/inference_tests.sh | 3 - 4 files changed, 81 insertions(+), 136 deletions(-) diff --git a/cmake/nccl.cmake b/cmake/nccl.cmake index c140a44ec8..82cf3b4122 100644 --- a/cmake/nccl.cmake +++ b/cmake/nccl.cmake @@ -2,140 +2,88 @@ set(NCCL_NAME nccl) # set(NCCL_CUDA_ARCH "-gencode=arch=compute_${CUDA_ARCH},code=sm_${CUDA_ARCH}") # message("NCCL_CUDA_ARCH: ${NCCL_CUDA_ARCH}") -set(NCCL_URL "") -if((FF_USE_PREBUILT_NCCL OR FF_USE_ALL_PREBUILT_LIBRARIES) AND CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64") - if(LINUX_VERSION MATCHES "20.04") - if (CUDA_VERSION VERSION_EQUAL "11.0") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.0.3.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.1") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.1.1.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.2") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.2.2.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.3") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.3.1.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.4") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.4.3.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.5") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.5.2.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.6") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.6.2.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.7") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.7.0.tar.gz") - endif() - elseif(LINUX_VERSION MATCHES "18.04") - if (CUDA_VERSION VERSION_EQUAL "10.1") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_10.1.243.tar.gz") - elseif (CUDA_VERSION VERSION_EQUAL "10.2") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_10.2.89.tar.gz") - elseif (CUDA_VERSION VERSION_EQUAL "11.0") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.0.3.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.1") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.1.1.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.2") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.2.2.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.3") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.3.1.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.4") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.4.3.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.5") - 
set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.5.2.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.6") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.6.2.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.7") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.7.0.tar.gz") - endif() - endif() +if(NCCL_PATH) + set(NCCL_ROOT ${NCCL_PATH}) +else() + # if NCCL_PATH is not set, let's try to find it in the CUDA root + set(NCCL_ROOT ${CUDA_TOOLKIT_ROOT_DIR}) endif() -if(NCCL_URL) - # Download and import pre-compiled NCCL library - message(STATUS "Using pre-compiled NCCL library") - message(STATUS "NCCL_URL: ${NCCL_URL}") +find_library(NCCL_LIBRARY + NAMES libnccl${LIBEXT} + PATHS ${NCCL_ROOT} ${CUDA_ROOT} + PATH_SUFFIXES lib lib64 + DOC "NCCL library." ) - include(FetchContent) - FetchContent_Declare(${NCCL_NAME} - URL ${NCCL_URL} - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - ) - FetchContent_GetProperties(${NCCL_NAME}) - if(NOT ${NCCL_NAME}_POPULATED) - FetchContent_Populate(${NCCL_NAME}) - endif() - - set(NCCL_FOLDER_PATH ${${NCCL_NAME}_SOURCE_DIR}/deps/${NCCL_NAME}) - set(NCCL_INCLUDE_DIR ${NCCL_FOLDER_PATH}/include) - set(NCCL_LIB_DIR ${NCCL_FOLDER_PATH}/lib) - message(STATUS "NCCL library path: ${NCCL_FOLDER_PATH}") - add_library(nccl SHARED IMPORTED) - set_target_properties(nccl PROPERTIES IMPORTED_LOCATION ${NCCL_FOLDER_PATH}) +find_path(NCCL_INCLUDE_DIR + NAMES nccl.h + HINTS ${NCCL_ROOT} + PATH_SUFFIXES include + DOC "NCCL include directory.") - list(APPEND FLEXFLOW_INCLUDE_DIRS ${NCCL_INCLUDE_DIR}) - list(APPEND FLEXFLOW_EXT_LIBRARIES ${NCCL_LIB_DIR}/libnccl${LIBEXT}) - install(DIRECTORY ${NCCL_INCLUDE_DIR}/ DESTINATION include) - install(DIRECTORY ${NCCL_LIB_DIR}/ DESTINATION lib PATTERN "pkgconfig" EXCLUDE) - -else() - if(NCCL_PATH) - set(NCCL_ROOT ${NCCL_PATH}) +# find NCCL, set NCCL lib and include +if(NCCL_LIBRARY AND NCCL_INCLUDE_DIR) + set(NCCL_FOUND ON) + set(NCCL_LIBRARIES ${NCCL_LIBRARY}) + set(NCCL_INCLUDE_DIRS ${NCCL_INCLUDE_DIR}) + + # Check NCCL version + if(EXISTS "${NCCL_INCLUDE_DIR}/nccl.h") + file(STRINGS "${NCCL_INCLUDE_DIR}/nccl.h" NCCL_VERSION_DEFINES + REGEX "#define NCCL_MAJOR [0-9]+" ) + file(STRINGS "${NCCL_INCLUDE_DIR}/nccl.h" NCCL_VERSION_DEFINES2 + REGEX "#define NCCL_MINOR [0-9]+" ) + string(REGEX MATCH "([0-9]+)" NCCL_MAJOR ${NCCL_VERSION_DEFINES}) + string(REGEX MATCH "([0-9]+)" NCCL_MINOR ${NCCL_VERSION_DEFINES2}) + set(NCCL_VERSION "${NCCL_MAJOR}.${NCCL_MINOR}") + if(NCCL_VERSION VERSION_LESS 2.23) + set(NCCL_OLD TRUE) + else() + set(NCCL_OLD FALSE) + endif() + message(STATUS "Found NCCL version: ${NCCL_VERSION}") else() - # if NCCL_PATH is not set, let's try to find it in the CUDA root - set(NCCL_ROOT ${CUDA_TOOLKIT_ROOT_DIR}) + message(WARNING "NCCL header not found, unable to determine version") + set(NCCL_OLD TRUE) # Assume old version if we can't determine endif() - - find_library(NCCL_LIBRARY - NAMES libnccl${LIBEXT} - PATHS ${NCCL_ROOT} ${CUDA_ROOT} - PATH_SUFFIXES lib lib64 - DOC "NCCL library." 
) +endif() - find_path(NCCL_INCLUDE_DIR - NAMES nccl.h - HINTS ${NCCL_ROOT} - PATH_SUFFIXES include - DOC "NCCL include directory.") - - # find NCCL, set NCCL lib and include - if(NCCL_LIBRARY AND NCCL_INCLUDE_DIR) - set(NCCL_FOUND ON) - set(NCCL_LIBRARIES ${NCCL_LIBRARY}) - set(NCCL_INCLUDE_DIRS ${NCCL_INCLUDE_DIR}) - endif() - - # find NCCL - if(NCCL_FOUND) - list(APPEND FLEXFLOW_EXT_LIBRARIES ${NCCL_LIBRARIES}) - list(APPEND FLEXFLOW_INCLUDE_DIRS ${NCCL_INCLUDE_DIRS}) - message( STATUS "NCCL include : ${NCCL_INCLUDE_DIRS}" ) - message( STATUS "NCCL libraries : ${NCCL_LIBRARIES}" ) - add_library(nccl SHARED IMPORTED) - - # Build NCCL from source - else() - message(STATUS "Building NCCL from source") - list(TRANSFORM CUDA_GENCODE PREPEND "NVCC_GENCODE=" OUTPUT_VARIABLE NCCL_BUILD_NVCC_GENCODE) - - ExternalProject_Add(${NCCL_NAME} - SOURCE_DIR ${PROJECT_SOURCE_DIR}/deps/${NCCL_NAME} - PREFIX ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME} - INSTALL_DIR ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME} - BUILD_BYPRODUCTS ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/libnccl${LIBEXT} - INSTALL_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND make src.build "${NCCL_BUILD_NVCC_GENCODE}" "CUDA_HOME=${CUDA_TOOLKIT_ROOT_DIR}" "BUILDDIR=${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}" - BUILD_IN_SOURCE 1 - ) +# find NCCL +if(NCCL_FOUND AND (NOT NCCL_OLD OR CUDA_VERSION VERSION_LESS 12.0)) + list(APPEND FLEXFLOW_EXT_LIBRARIES ${NCCL_LIBRARIES}) + list(APPEND FLEXFLOW_INCLUDE_DIRS ${NCCL_INCLUDE_DIRS}) + message( STATUS "NCCL include : ${NCCL_INCLUDE_DIRS}" ) + message( STATUS "NCCL libraries : ${NCCL_LIBRARIES}" ) + add_library(nccl SHARED IMPORTED) + +# Build NCCL from source +else() + message(STATUS "Building NCCL from source") + list(TRANSFORM CUDA_GENCODE PREPEND "NVCC_GENCODE=" OUTPUT_VARIABLE NCCL_BUILD_NVCC_GENCODE) - ExternalProject_Get_Property(${NCCL_NAME} INSTALL_DIR) - message(STATUS "NCCL install dir: ${INSTALL_DIR}") - list(APPEND FLEXFLOW_INCLUDE_DIRS - ${INSTALL_DIR}/include) - list(APPEND FLEXFLOW_EXT_LIBRARIES - ${INSTALL_DIR}/lib/libnccl${LIBEXT}) - set_directory_properties(PROPERTIES ADDITIONAL_CLEAN_FILES "${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/") - - install(DIRECTORY ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/include/ DESTINATION include) - install(DIRECTORY ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/ DESTINATION lib PATTERN "pkgconfig" EXCLUDE) + set(NCCL_BUILD_CMD make src.build "${NCCL_BUILD_NVCC_GENCODE}" "CUDA_HOME=${CUDA_TOOLKIT_ROOT_DIR}" "BUILDDIR=${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}") + if(DEFINED ENV{MAKEFLAGS}) + set(NCCL_BUILD_CMD ${CMAKE_COMMAND} -E env MAKEFLAGS=$ENV{MAKEFLAGS} ${NCCL_BUILD_CMD}) endif() + ExternalProject_Add(${NCCL_NAME} + SOURCE_DIR ${PROJECT_SOURCE_DIR}/deps/${NCCL_NAME} + PREFIX ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME} + INSTALL_DIR ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME} + BUILD_BYPRODUCTS ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/libnccl${LIBEXT} + INSTALL_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND ${NCCL_BUILD_CMD} + BUILD_IN_SOURCE 1 + ) + ExternalProject_Get_Property(${NCCL_NAME} INSTALL_DIR) + message(STATUS "NCCL install dir: ${INSTALL_DIR}") + list(APPEND FLEXFLOW_INCLUDE_DIRS + ${INSTALL_DIR}/include) + list(APPEND FLEXFLOW_EXT_LIBRARIES + ${INSTALL_DIR}/lib/libnccl${LIBEXT}) + set_directory_properties(PROPERTIES ADDITIONAL_CLEAN_FILES "${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/") + + install(DIRECTORY ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/include/ DESTINATION include) + install(DIRECTORY ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/ DESTINATION lib PATTERN "pkgconfig" 
EXCLUDE) endif() diff --git a/deps/nccl b/deps/nccl index 6e24ef4e1f..2ea4ee94bf 160000 --- a/deps/nccl +++ b/deps/nccl @@ -1 +1 @@ -Subproject commit 6e24ef4e1f1eac9f104d115ef65429f179924ee7 +Subproject commit 2ea4ee94bfb04c886c79ccae60ac9961000fdee2 diff --git a/docker/run.sh b/docker/run.sh index cf105a10c8..cdf9383052 100755 --- a/docker/run.sh +++ b/docker/run.sh @@ -18,8 +18,6 @@ ATTACH_GPUS=${ATTACH_GPUS:-true} gpu_arg="" if $ATTACH_GPUS ; then gpu_arg="--gpus all" ; fi -# Whether to attach inference weights / files (make sure to download the weights first) -ATTACH_INFERENCE_FILES=${ATTACH_INFERENCE_FILES:-false} # Amount of shared memory to give the Docker container access to # If you get a Bus Error, increase this value. If you don't have enough memory @@ -115,9 +113,11 @@ if [[ "$(docker images -q "${image}-${FF_GPU_BACKEND}${gpu_backend_version}":lat exit 1 fi -inference_volumes="" -if $ATTACH_INFERENCE_FILES ; then - inference_volumes="-v ~/.cache/flexflow:/usr/FlexFlow/inference"; +hf_token_volume="" +hf_token_path="$HOME/.cache/huggingface/token" +if [ -f "$hf_token_path" ]; then + # If the token exists, add the volume mount to the Docker command + hf_token_volume+="-v $hf_token_path:/root/.cache/huggingface/token" fi -eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "${inference_volumes}" "${image}-${FF_GPU_BACKEND}${gpu_backend_version}:latest" +eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "${hf_token_volume}" "${image}-${FF_GPU_BACKEND}${gpu_backend_version}:latest" diff --git a/tests/inference_tests.sh b/tests/inference_tests.sh index 895b74c798..d173cce06d 100755 --- a/tests/inference_tests.sh +++ b/tests/inference_tests.sh @@ -25,9 +25,6 @@ fi # Clean up before test (just in case) cleanup -# Make sure supported version of protobuf is installed -pip3 install protobuf==3.20.3 - # Create test prompt file mkdir -p ../inference/prompt echo '["Three tips for staying healthy are: "]' > ../inference/prompt/test.json From 1f6350faaa06d9aa9c3a2ed963355fe4fe7876c7 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 22 Sep 2024 18:21:22 -0400 Subject: [PATCH 22/44] speedup docker builds --- docker/flexflow-environment/Dockerfile | 38 ++++++++++++++++++-------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index 3434916d6b..ee13a07375 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -42,17 +42,38 @@ RUN MINICONDA_SCRIPT_NAME=Miniconda3-py311_23.5.2-0-Linux-x86_64.sh; \ /opt/conda/bin/conda install conda-build conda-verify && \ /opt/conda/bin/conda clean -ya -# Optionally install HIP dependencies +# set MAKEFLAGS to speedup any dependency that uses make +ARG N_BUILD_CORES +ENV MAKEFLAGS "${MAKEFLAGS} -j${N_BUILD_CORES}" + +# Set env vars +ENV PATH /opt/conda/bin:$PATH +ENV CUDNN_DIR /usr/local/cuda +ENV CUDA_DIR /usr/local/cuda + +# GPU-specific dependencies +ARG FF_GPU_BACKEND "cuda" + +# Update NCCL if FF_GPU_BACKEND is cuda +RUN /bin/bash -c 'if [ "$FF_GPU_BACKEND" = "cuda" ]; then \ + echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. 
Updating NCCL"; \ + ubuntu_version=$(lsb_release -rs); \ + ubuntu_version=${ubuntu_version//./}; \ + wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.0-1_all.deb"; \ + DEBIAN_FRONTEND=noninteractive dpkg -i cuda-keyring_1.0-1_all.deb; \ + DEBIAN_FRONTEND=noninteractive apt-get update -y --allow-change-held-packages; \ + rm -f cuda-keyring_1.0-1_all.deb; \ + DEBIAN_FRONTEND=noninteractive apt install -y --allow-change-held-packages libnccl2 libnccl-dev; \ + else \ + echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Skipping updating NCCL"; \ + fi' + +# Install hip dependencies if FF_GPU_BACKEND is hip_cuda or hip_rocm # Note that amd's docs say to also install the `hip-runtime-nvidia` package. This # package attempts to re-install cuda even though cuda is already installed # in the container. It also attempts to install packages for a graphical install. # For our container, we don't need `hip-runtime-nvidia` -ARG FF_GPU_BACKEND "cuda" ARG hip_version "5.6" -ARG N_BUILD_CORES -# set MAKEFLAGS to speedup any dependency that uses make -ENV MAKEFLAGS "${MAKEFLAGS} -j${N_BUILD_CORES}" - RUN if [ "$FF_GPU_BACKEND" = "hip_cuda" ] || [ "$FF_GPU_BACKEND" = "hip_rocm" ]; then \ echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Installing HIP dependencies"; \ # Check that hip_version is one of 5.3,5.4,5.5,5.6 @@ -83,11 +104,6 @@ RUN if [ "$FF_GPU_BACKEND" = "hip_cuda" ] || [ "$FF_GPU_BACKEND" = "hip_rocm" ] fi RUN rm -rf /var/lib/apt/lists/* -# Set env vars -ENV PATH /opt/conda/bin:$PATH -ENV CUDNN_DIR /usr/local/cuda -ENV CUDA_DIR /usr/local/cuda - # Install python packages and other dependencies RUN conda install -c conda-forge cmake make pillow cmake-build-extension pybind11 numpy pandas keras-preprocessing # Install CPU-only Pytorch and related dependencies From 2e363c4955f2f80e965db4e2837b709597e83fe8 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sun, 22 Sep 2024 18:23:00 -0400 Subject: [PATCH 23/44] update --- docker/flexflow/Dockerfile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docker/flexflow/Dockerfile b/docker/flexflow/Dockerfile index 60f9d4d653..dff9259657 100644 --- a/docker/flexflow/Dockerfile +++ b/docker/flexflow/Dockerfile @@ -27,9 +27,7 @@ RUN for pair in $BUILD_CONFIGS; do \ # Build and install C++ and Python versions of FlexFlow RUN mkdir -p build && cd build && \ eval "$BUILD_CONFIGS" ../config/config.linux && \ - make -j $N_BUILD_CORES && \ - eval "$BUILD_CONFIGS" ../config/config.linux && \ - make install && \ + make -j $N_BUILD_CORES install && \ ldconfig ENTRYPOINT ["/bin/bash"] From 70e47b286370d2ff5feeb7949311881b987c0ac8 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 25 Sep 2024 19:07:54 +0000 Subject: [PATCH 24/44] remove outdated code --- src/ops/residual_layer_norm.cpp | 21 ++++++++++++--------- src/ops/residual_layer_norm.cu | 21 ++++++++++++--------- src/runtime/cuda_helper.cu | 16 ++++++++++++++++ 3 files changed, 40 insertions(+), 18 deletions(-) diff --git a/src/ops/residual_layer_norm.cpp b/src/ops/residual_layer_norm.cpp index 582e0752ef..ed973b4f71 100644 --- a/src/ops/residual_layer_norm.cpp +++ b/src/ops/residual_layer_norm.cpp @@ -176,6 +176,8 @@ void ResidualLayerNorm::inference_kernel(ResidualLayerNormMeta const *m, beta_ptr, output_ptr); } + +#ifdef DEADCODE template void save_inference_tensors(ResidualLayerNormMeta const *m) { if (m->inference_debugging) { @@ -206,6 +208,7 @@ void save_inference_tensors(ResidualLayerNormMeta const *m) { filename3.c_str()); } } +#endif 
/*static*/ void ResidualLayerNorm::inference_kernel_wrapper( @@ -314,15 +317,15 @@ void ResidualLayerNorm::inference_kernel_wrapper( } } - if (m->inference_debugging) { - if (m->input_type[0] == DT_FLOAT) { - save_inference_tensors(m); - } else if (m->input_type[0] == DT_HALF) { - save_inference_tensors(m); - } else { - assert(false && "unsupport datatype in layernorm"); - } - } + // if (m->inference_debugging) { + // if (m->input_type[0] == DT_FLOAT) { + // save_inference_tensors(m); + // } else if (m->input_type[0] == DT_HALF) { + // save_inference_tensors(m); + // } else { + // assert(false && "unsupport datatype in layernorm"); + // } + // } if (m->profiling) { checkCUDA(hipEventRecord(t_end, stream)); diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index 8cdf87a92c..50c81d2099 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -174,6 +174,8 @@ void ResidualLayerNorm::inference_kernel(ResidualLayerNormMeta const *m, beta_ptr, output_ptr); } + +#ifdef DEADCODE template void save_inference_tensors(ResidualLayerNormMeta const *m) { if (m->inference_debugging) { @@ -204,6 +206,7 @@ void save_inference_tensors(ResidualLayerNormMeta const *m) { filename3.c_str()); } } +#endif /*static*/ void ResidualLayerNorm::inference_kernel_wrapper( @@ -312,15 +315,15 @@ void ResidualLayerNorm::inference_kernel_wrapper( } } - if (m->inference_debugging) { - if (m->input_type[0] == DT_FLOAT) { - save_inference_tensors(m); - } else if (m->input_type[0] == DT_HALF) { - save_inference_tensors(m); - } else { - assert(false && "unsupport datatype in layernorm"); - } - } + // if (m->inference_debugging) { + // if (m->input_type[0] == DT_FLOAT) { + // save_inference_tensors(m); + // } else if (m->input_type[0] == DT_HALF) { + // save_inference_tensors(m); + // } else { + // assert(false && "unsupport datatype in layernorm"); + // } + // } if (m->profiling) { cudaEventRecord(t_end, stream); diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index 386a0c940b..42b3946f8c 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -278,6 +278,10 @@ __host__ void host_ptr, ptr, sizeof(float) * num_elements, cudaMemcpyDeviceToHost)); FILE *tensor_file; tensor_file = fopen(file_name, "w"); + if (!tensor_file) { + fprintf(stderr, "Error %i creating file %s\n", errno, file_name); + assert(false); + } assert(tensor_file != NULL); for (unsigned i = 0; i < num_elements; i++) { if (i < num_elements - 1) { @@ -299,6 +303,10 @@ __host__ void host_ptr, ptr, sizeof(half) * num_elements, cudaMemcpyDeviceToHost)); FILE *tensor_file; tensor_file = fopen(file_name, "w"); + if (!tensor_file) { + fprintf(stderr, "Error %i creating file %s\n", errno, file_name); + assert(false); + } assert(tensor_file != NULL); for (unsigned i = 0; i < num_elements; i++) { if (i < num_elements - 1) { @@ -321,6 +329,10 @@ __host__ void save_tensor(int32_t const *ptr, host_ptr, ptr, sizeof(int32_t) * num_elements, cudaMemcpyDeviceToHost)); FILE *tensor_file; tensor_file = fopen(file_name, "w"); + if (!tensor_file) { + fprintf(stderr, "Error %i creating file %s\n", errno, file_name); + assert(false); + } assert(tensor_file != NULL); for (unsigned i = 0; i < num_elements; i++) { if (i < num_elements - 1) { @@ -343,6 +355,10 @@ __host__ void save_tensor(int64_t const *ptr, host_ptr, ptr, sizeof(int64_t) * num_elements, cudaMemcpyDeviceToHost)); FILE *tensor_file; tensor_file = fopen(file_name, "w"); + if (!tensor_file) { + fprintf(stderr, "Error %i creating file 
%s\n", errno, file_name); + assert(false); + } assert(tensor_file != NULL); for (unsigned i = 0; i < num_elements; i++) { if (i < num_elements - 1) { From 9da554607063c3b17211238b3cc0e589d2cc50d9 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Wed, 25 Sep 2024 17:30:32 -0700 Subject: [PATCH 25/44] [Bug Fix] Update register interface (#1509) * minor bug fix * assign static variant ID to avoid mismatch between ranks --- src/runtime/model.cc | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/runtime/model.cc b/src/runtime/model.cc index f46630db3c..ceb9277b76 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -7443,12 +7443,13 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar.set_leaf(); if (pre_register) { Runtime::preregister_task_variant( - registrar, "Adam Parameter Server Update Task"); + registrar, "Adam Parameter Server Update Task", 111 /*variant ID*/); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant(registrar); + runtime->register_task_variant( + registrar, 111 /*variant ID*/); } } #ifdef FF_USE_NCCL @@ -7459,12 +7460,13 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( - registrar, "SGD NCCL Update Task"); + registrar, "SGD NCCL Update Task", 111 /*variant ID*/); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant(registrar); + runtime->register_task_variant( + registrar, 111 /*variant ID*/); } } { @@ -7473,13 +7475,13 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar.set_leaf(); if (pre_register) { Runtime::preregister_task_variant( - registrar, "Adam NCCL Update Task"); + registrar, "Adam NCCL Update Task", 111 /*variant ID*/); } else { if (enable_control_replication) { registrar.global_registration = false; } runtime->register_task_variant( - registrar); + registrar, 111 /*variant ID*/); } } #endif @@ -7610,13 +7612,13 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( - registrar, "NCCL Init Communicators Task"); + registrar, "NCCL Init Communicators Task", 111 /*variant ID*/); } else { if (enable_control_replication) { registrar.global_registration = false; } runtime->register_task_variant( - registrar); + registrar, 111 /*variant ID*/); } } { @@ -7626,12 +7628,13 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar.set_leaf(); if (pre_register) { Runtime::preregister_task_variant( - registrar, "NCCL Finish Communicators Task"); + registrar, "NCCL Finish Communicators Task", 111 /*variant ID*/); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant(registrar); + runtime->register_task_variant( + registrar, 111 /*variant ID*/); } } #endif From 64c258f3b43e19025889d728799d2bdedde9f732 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Fri, 27 Sep 2024 11:59:38 -0700 Subject: [PATCH 26/44] [FusedOp] Fix segment fault (#1511) * minor bug fix * fix --- src/ops/fused.cu | 69 ++++++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 34 deletions(-) diff --git a/src/ops/fused.cu b/src/ops/fused.cu index cab28181da..8f1212beb4 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -1678,77 +1678,77 @@ __host__ void FusedOp::backward_task(Task const *task, int sum = fused->numInputs 
+ fused->numWeights + fused->numOutputs; assert(sum * 2 == (int)regions.size()); } - GenericTensorAccessorR input_accessor[MAX_NUM_INPUTS]; - GenericTensorAccessorW input_grad_accessor[MAX_NUM_INPUTS]; - GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorW weight_grad_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorR output_accessor[MAX_NUM_OUTPUTS]; - GenericTensorAccessorW output_grad_accessor[MAX_NUM_OUTPUTS]; + std::vector input_accessor; + std::vector input_grad_accessor; + std::vector weight_accessor; + std::vector weight_grad_accessor; + std::vector output_accessor; + std::vector output_grad_accessor; int roff = 0; assert(fused->numInputs <= MAX_NUM_INPUTS); for (int i = 0; i < fused->numInputs; i++) { - input_accessor[i] = + input_accessor.push_back( helperGetGenericTensorAccessorRO(fused->input_data_types[i], regions[i], task->regions[i], FID_DATA, ctx, - runtime); + runtime)); } roff += fused->numInputs; assert(fused->numWeights <= MAX_NUM_WEIGHTS); for (int i = 0; i < fused->numWeights; i++) { - weight_accessor[i] = + weight_accessor.push_back( helperGetGenericTensorAccessorRO(fused->weight_data_types[i], regions[i + roff], task->regions[i + roff], FID_DATA, ctx, - runtime); + runtime)); } roff += fused->numWeights; assert(fused->numOutputs <= MAX_NUM_OUTPUTS); for (int i = 0; i < fused->numOutputs; i++) { - output_accessor[i] = + output_accessor.push_back( helperGetGenericTensorAccessorRO(fused->output_data_types[i], regions[i + roff], task->regions[i + roff], FID_DATA, ctx, - runtime); + runtime)); } roff += fused->numOutputs; for (int i = 0; i < fused->numInputs; i++) { - input_grad_accessor[i] = + input_grad_accessor.push_back( helperGetGenericTensorAccessorRW(fused->input_data_types[i], regions[i + roff], task->regions[i + roff], FID_DATA, ctx, - runtime); + runtime)); assert(input_grad_accessor[i].domain == input_accessor[i].domain); } roff += fused->numInputs; for (int i = 0; i < fused->numWeights; i++) { - weight_grad_accessor[i] = + weight_grad_accessor.push_back( helperGetGenericTensorAccessorRW(fused->weight_data_types[i], regions[i + roff], task->regions[i + roff], FID_DATA, ctx, - runtime); + runtime)); assert(weight_grad_accessor[i].domain.get_volume() == weight_accessor[i].domain.get_volume()); } roff += fused->numWeights; for (int i = 0; i < fused->numOutputs; i++) { - output_grad_accessor[i] = + output_grad_accessor.push_back( helperGetGenericTensorAccessorRW(fused->output_data_types[i], regions[i + roff], task->regions[i + roff], FID_DATA, ctx, - runtime); + runtime)); assert(output_grad_accessor[i].domain == output_accessor[i].domain); } roff += fused->numOutputs; @@ -1767,12 +1767,6 @@ __host__ void FusedOp::backward_task(Task const *task, } int ioff = 0, woff = 0, ooff = 0; - GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; - GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorR my_output_accessor[MAX_NUM_OUTPUTS]; - GenericTensorAccessorW my_input_grad_accessor[MAX_NUM_INPUTS]; - GenericTensorAccessorW my_weight_grad_accessor[MAX_NUM_WEIGHTS]; - GenericTensorAccessorW my_output_grad_accessor[MAX_NUM_OUTPUTS]; // Do backpropagation in the reverse ordering for (int op = 0; op < fused->numOperators; op++) { ioff += fused->op_num_inputs[op]; @@ -1781,18 +1775,24 @@ __host__ void FusedOp::backward_task(Task const *task, } for (int op = fused->numOperators - 1; op >= 0; op--) { + std::vector my_input_accessor; + std::vector my_weight_accessor; + std::vector my_output_accessor; + std::vector 
my_input_grad_accessor; + std::vector my_weight_grad_accessor; + std::vector my_output_grad_accessor; ioff -= fused->op_num_inputs[op]; woff -= fused->op_num_weights[op]; ooff -= fused->op_num_outputs[op]; for (int i = 0; i < fused->op_num_inputs[op]; i++) { int my_off = fused->op_input_idx[i + ioff]; if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { - my_input_accessor[i] = input_accessor[my_off]; - my_input_grad_accessor[i] = input_grad_accessor[my_off]; + my_input_accessor.push_back(input_accessor[my_off]); + my_input_grad_accessor.push_back(input_grad_accessor[my_off]); assert(my_input_grad_accessor[i].domain == my_input_accessor[i].domain); } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { - my_input_accessor[i] = output_accessor[my_off]; - my_input_grad_accessor[i] = output_grad_accessor[my_off]; + my_input_accessor.push_back(output_accessor[my_off]); + my_input_grad_accessor.push_back(output_grad_accessor[my_off]); assert(my_input_grad_accessor[i].domain == my_input_accessor[i].domain); } else { assert(false); @@ -1800,17 +1800,18 @@ __host__ void FusedOp::backward_task(Task const *task, } for (int i = 0; i < fused->op_num_weights[op]; i++) { assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); - my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; - my_weight_grad_accessor[i] = - weight_grad_accessor[fused->op_weight_idx[i + woff]]; + my_weight_accessor.push_back( + weight_accessor[fused->op_weight_idx[i + woff]]); + my_weight_grad_accessor.push_back( + weight_grad_accessor[fused->op_weight_idx[i + woff]]); assert(my_weight_grad_accessor[i].domain.get_volume() == my_weight_accessor[i].domain.get_volume()); } for (int i = 0; i < fused->op_num_outputs[op]; i++) { assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); int my_off = fused->op_output_idx[i + ooff]; - my_output_accessor[i] = output_accessor[my_off]; - my_output_grad_accessor[i] = output_grad_accessor[my_off]; + my_output_accessor.push_back(output_accessor[my_off]); + my_output_grad_accessor.push_back(output_grad_accessor[my_off]); assert(my_output_grad_accessor[i].domain == my_output_accessor[i].domain); } switch (fused->op_op_type[op]) { @@ -1880,7 +1881,7 @@ __host__ void FusedOp::backward_task(Task const *task, int num_inputs = fused->op_num_inputs[op]; Kernels::Concat::backward_kernel_wrapper(m, my_output_grad_accessor[0], - my_input_grad_accessor, + my_input_grad_accessor.data(), num_inputs, m->legion_axis); break; From c78cf04d348aa242c891c783e880e90806c88344 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 1 Oct 2024 20:03:18 -0700 Subject: [PATCH 27/44] enable disabling inference (#1516) --- .github/workflows/build.yml | 12 ++-- .github/workflows/gpu-ci.yml | 6 +- CMakeLists.txt | 105 ++++++++++++----------------------- config/config.inc | 20 +++---- config/config.linux | 6 +- spack/package.py | 4 +- src/c/flexflow_c.cc | 12 ++++ src/ops/beam_topk.cu | 2 +- src/runtime/model.cc | 4 ++ 9 files changed, 77 insertions(+), 94 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ef5961bc87..63e0b9037a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -79,13 +79,13 @@ jobs: export FF_CUDA_ARCH=70 export FF_HIP_ARCH=gfx1100,gfx1036 export hip_version=5.6 - export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON + export FF_BUILD_INFERENCE=ON if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then - export FF_BUILD_ALL_EXAMPLES=ON + export FF_BUILD_TRAINING_EXAMPLES=ON export FF_BUILD_UNIT_TESTS=ON else - export 
FF_BUILD_ALL_EXAMPLES=OFF + export FF_BUILD_TRAINING_EXAMPLES=OFF export FF_BUILD_UNIT_TESTS=OFF fi @@ -106,13 +106,13 @@ jobs: export FF_CUDA_ARCH=70 export FF_HIP_ARCH=gfx1100,gfx1036 export hip_version=5.6 - export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON + export FF_BUILD_INFERENCE=ON if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then - export FF_BUILD_ALL_EXAMPLES=ON + export FF_BUILD_TRAINING_EXAMPLES=ON export FF_BUILD_UNIT_TESTS=ON else - export FF_BUILD_ALL_EXAMPLES=OFF + export FF_BUILD_TRAINING_EXAMPLES=OFF export FF_BUILD_UNIT_TESTS=OFF fi diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 00ca2df603..6ca50027d1 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -175,7 +175,7 @@ jobs: export PATH=$CONDA_PREFIX/bin:$PATH export FF_HOME=$(pwd) export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion - export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON + export FF_BUILD_INFERENCE=ON mkdir build cd build ../config/config.linux @@ -262,8 +262,8 @@ jobs: run: | export PATH=$CONDA_PREFIX/bin:$PATH export FF_HOME=$(pwd) - export FF_BUILD_ALL_EXAMPLES=ON - export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON + export FF_BUILD_TRAINING_EXAMPLES=ON + export FF_BUILD_INFERENCE=ON export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion pip install . --verbose diff --git a/CMakeLists.txt b/CMakeLists.txt index f06969ae04..4e24e1e54b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -181,6 +181,14 @@ include(legion) # Not build FlexFlow if BUILD_LEGION_ONLY is ON if(NOT BUILD_LEGION_ONLY) + + # build binary options + option(FF_BUILD_INFERENCE "build all inference code and examples." ON) + option(FF_BUILD_TRAINING_EXAMPLES "build all training examples." OFF) + option(FF_BUILD_UNIT_TESTS "build non-operator unit tests" OFF) + option(FF_BUILD_SUBSTITUTION_TOOL "build substitution conversion tool" OFF) + option(FF_BUILD_VISUALIZATION_TOOL "build substitution visualization tool" OFF) + # NCCL if(FF_USE_NCCL) if(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "cuda") @@ -271,18 +279,23 @@ if(NOT BUILD_LEGION_ONLY) file(GLOB_RECURSE FLEXFLOW_HDR LIST_DIRECTORIES False ${FLEXFLOW_ROOT}/include/*.h) - - #list(APPEND FLEXFLOW_HDR ${FLEXFLOW_ROOT}/inference/file_loader.h) file(GLOB_RECURSE FLEXFLOW_SRC LIST_DIRECTORIES False ${FLEXFLOW_ROOT}/src/*.cc) - list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc") - #list(APPEND FLEXFLOW_SRC ${FLEXFLOW_ROOT}/inference/file_loader.cc) - set(FLEXFLOW_CPP_DRV_SRC - ${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc) + # exclude inference files if FF_BUILD_INFERENCE is off + if(NOT FF_BUILD_INFERENCE) + list(REMOVE_ITEM FLEXFLOW_HDR "${FLEXFLOW_ROOT}/include/request_manager.h") + list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/request_manager.cc") + list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/inference_manager.cc") + list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/batch_config.cc") + list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/beam_search_batch_config.cc") + list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/tree_verify_batch_config.cc") + endif() + + set(FLEXFLOW_CPP_DRV_SRC ${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc) add_library(substitution_loader SHARED ${FLEXFLOW_ROOT}/src/runtime/substitution_loader.cc) @@ -297,6 +310,10 @@ if(NOT BUILD_LEGION_ONLY) file(GLOB_RECURSE FLEXFLOW_GPU_SRC LIST_DIRECTORIES False ${FLEXFLOW_ROOT}/src/*.cu) + + if(NOT FF_BUILD_INFERENCE) + 
list(REMOVE_ITEM FLEXFLOW_GPU_SRC "${FLEXFLOW_ROOT}/src/runtime/request_manager.cu") + endif() add_compile_definitions(FF_USE_CUDA) @@ -452,27 +469,6 @@ if(NOT BUILD_LEGION_ONLY) set_property(TARGET flexflow PROPERTY CXX_STANDARD 14) endif() - # build binary - option(FF_BUILD_TOKENIZER "build tokenizer=cpp for LLM serving" OFF) - option(FF_BUILD_RESNET "build resnet example" OFF) - option(FF_BUILD_RESNEXT "build resnext example" OFF) - option(FF_BUILD_ALEXNET "build alexnet example" OFF) - option(FF_BUILD_DLRM "build DLRM example" OFF) - option(FF_BUILD_XDL "build XDL example" OFF) - option(FF_BUILD_INCEPTION "build inception example" OFF) - option(FF_BUILD_CANDLE_UNO "build candle uno example" OFF) - option(FF_BUILD_TRANSFORMER "build transformer example" OFF) - option(FF_BUILD_MOE "build mixture of experts example" OFF) - option(FF_BUILD_MLP_UNIFY "build mlp unify example" OFF) - option(FF_BUILD_SPLIT_TEST "build split test example" OFF) - option(FF_BUILD_SPLIT_TEST_2 "build split test 2 example" OFF) - option(FF_BUILD_MLP_UNIFY_INFERENCE "build mlp unify inference example" OFF) - option(FF_BUILD_ALL_INFERENCE_EXAMPLES "build all inference examples. Overrides others" OFF) - option(FF_BUILD_ALL_EXAMPLES "build all examples. Overrides others" OFF) - option(FF_BUILD_UNIT_TESTS "build non-operator unit tests" OFF) - option(FF_BUILD_SUBSTITUTION_TOOL "build substitution conversion tool" OFF) - option(FF_BUILD_VISUALIZATION_TOOL "build substitution visualization tool" OFF) - if(FF_BUILD_UNIT_TESTS) set(BUILD_GMOCK OFF) add_subdirectory(deps/googletest) @@ -488,89 +484,60 @@ if(NOT BUILD_LEGION_ONLY) add_subdirectory(tools/substitutions_to_dot) endif() - if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_TOKENIZER) + if(FF_BUILD_INFERENCE) + add_compile_definitions(FF_BUILD_INFERENCE) # Ensure Rust is installed execute_process(COMMAND rustc --version RESULT_VARIABLE RUST_COMMAND_RESULT OUTPUT_VARIABLE RUSTC_OUTPUT ERROR_QUIET) if(NOT RUST_COMMAND_RESULT EQUAL 0) - message(FATAL_ERROR "Rust is not installed on the system. Please install it by running: 'curl https://sh.rustup.rs -sSf | sh -s -- -y' and following the instructions on the screen.") + message(FATAL_ERROR + "Rust is not installed on the system. Please install it by running: \n" + "'curl https://sh.rustup.rs -sSf | sh -s -- -y' \n" + "and follow the instructions on the screen.") endif() # Ensure Cargo is installed execute_process(COMMAND cargo --version RESULT_VARIABLE CARGO_RESULT OUTPUT_QUIET ERROR_QUIET) if(NOT CARGO_RESULT EQUAL 0) - message(FATAL_ERROR "Rust is installed, but cargo is not. Please install it by running: 'curl https://sh.rustup.rs -sSf | sh -s -- -y' and following the instructions on the screen.") + message(FATAL_ERROR + "Rust is installed, but cargo is not. 
Please install it by running: \n" + "'curl https://sh.rustup.rs -sSf | sh -s -- -y' \n" + "and follow the instructions on the screen.") endif() set(MLC_ENABLE_SENTENCEPIECE_TOKENIZER ON) add_subdirectory(deps/tokenizers-cpp tokenizers EXCLUDE_FROM_ALL) target_include_directories(flexflow PUBLIC deps/tokenizers-cpp/include) target_link_libraries(flexflow tokenizers_cpp) endif() - if(FF_BUILD_RESNET OR FF_BUILD_ALL_EXAMPLES) + + if (FF_BUILD_TRAINING_EXAMPLES) add_subdirectory(examples/cpp/ResNet) - endif() - - if(FF_BUILD_RESNEXT OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/resnext50) - endif() - - if(FF_BUILD_ALEXNET OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/AlexNet) - endif() - - if(FF_BUILD_MLP_UNIFY OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/MLP_Unify) - endif() - - if(FF_BUILD_SPLIT_TEST OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/split_test) - endif() - - if(FF_BUILD_SPLIT_TEST_2 OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/split_test_2) - endif() - - if(FF_BUILD_INCEPTION OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/InceptionV3) - endif() - - #TODO: Once functional add to BUILD_ALL_EXAMPLES - if(FF_BUILD_CANDLE_UNO OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/candle_uno) - endif() - - if(FF_BUILD_DLRM OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/DLRM) - #add_executable(generate_dlrm_hetero_strategy src/runtime/dlrm_strategy_hetero.cc) #target_include_directories(generate_dlrm_hetero_strategy PUBLIC ${FLEXFLOW_INCLUDE_DIRS}) - #add_executable(generate_dlrm_strategy src/runtime/dlrm_strategy.cc) #target_include_directories(generate_dlrm_strategy PUBLIC ${FLEXFLOW_INCLUDE_DIRS}) - endif() - - if(FF_BUILD_XDL OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/XDL) - endif() - - if(FF_BUILD_TRANSFORMER OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/Transformer) - endif() - - if(FF_BUILD_MOE OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(examples/cpp/mixture_of_experts) endif() - if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) + if(FF_BUILD_INFERENCE) add_subdirectory(inference/spec_infer) add_subdirectory(inference/incr_decoding) add_subdirectory(inference/peft) endif() - # installation set(INCLUDE_DEST "include") set(LIB_DEST "lib") diff --git a/config/config.inc b/config/config.inc index 6431eaf136..011fe890fb 100644 --- a/config/config.inc +++ b/config/config.inc @@ -128,19 +128,19 @@ elif [ "$FF_LEGION_NETWORKS" = "ucx" ]; then fi # build C++ examples -if [ "$FF_BUILD_ALL_EXAMPLES" = "ON" ]; then - SET_EXAMPLES="-DFF_BUILD_ALL_EXAMPLES=ON" -elif [ "$FF_BUILD_ALL_EXAMPLES" = "OFF" ]; then - SET_EXAMPLES="-DFF_BUILD_ALL_EXAMPLES=OFF" +if [ "$FF_BUILD_TRAINING_EXAMPLES" = "ON" ]; then + SET_EXAMPLES="-DFF_BUILD_TRAINING_EXAMPLES=ON" +elif [ "$FF_BUILD_TRAINING_EXAMPLES" = "OFF" ]; then + SET_EXAMPLES="-DFF_BUILD_TRAINING_EXAMPLES=OFF" else - SET_EXAMPLES="-DFF_BUILD_ALL_EXAMPLES=ON" + SET_EXAMPLES="-DFF_BUILD_TRAINING_EXAMPLES=ON" fi -if [ "$FF_BUILD_ALL_INFERENCE_EXAMPLES" = "ON" ]; then - SET_INFERENCE_EXAMPLES="-DFF_BUILD_ALL_INFERENCE_EXAMPLES=ON" -elif [ "$FF_BUILD_ALL_INFERENCE_EXAMPLES" = "OFF" ]; then - SET_INFERENCE_EXAMPLES="-DFF_BUILD_ALL_INFERENCE_EXAMPLES=OFF" +if [ "$FF_BUILD_INFERENCE" = "ON" ]; then + SET_INFERENCE_EXAMPLES="-DFF_BUILD_INFERENCE=ON" +elif [ "$FF_BUILD_INFERENCE" = "OFF" ]; then + SET_INFERENCE_EXAMPLES="-DFF_BUILD_INFERENCE=OFF" else - SET_INFERENCE_EXAMPLES="-DFF_BUILD_ALL_INFERENCE_EXAMPLES=ON" + 
SET_INFERENCE_EXAMPLES="-DFF_BUILD_INFERENCE=ON" fi # enable C++ unit tests diff --git a/config/config.linux b/config/config.linux index acffc210f5..09976cfa03 100755 --- a/config/config.linux +++ b/config/config.linux @@ -65,8 +65,8 @@ FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT:-ibv} UCX_DIR=${UCX_DIR:-""} # build C++ examples -FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES:-OFF} -FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES:-ON} +FF_BUILD_TRAINING_EXAMPLES=${FF_BUILD_TRAINING_EXAMPLES:-OFF} +FF_BUILD_INFERENCE=${FF_BUILD_INFERENCE:-ON} # build C++ unit tests FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS:-OFF} @@ -108,7 +108,7 @@ fi function get_build_configs() { # Create a string with the values of the variables set in this script - BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDA_DIR=${CUDA_DIR} CUDNN_DIR=${CUDNN_DIR} CUBLAS_DIR=${CUBLAS_DIR} CURAND_DIR=${CURAND_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} UCX_DIR=${UCX_DIR} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND} INSTALL_DIR=${INSTALL_DIR}" + BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDA_DIR=${CUDA_DIR} CUDNN_DIR=${CUDNN_DIR} CUBLAS_DIR=${CUBLAS_DIR} CURAND_DIR=${CURAND_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} UCX_DIR=${UCX_DIR} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_TRAINING_EXAMPLES=${FF_BUILD_TRAINING_EXAMPLES} FF_BUILD_INFERENCE=${FF_BUILD_INFERENCE} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND} INSTALL_DIR=${INSTALL_DIR}" } if [[ -n "$1" && ( "$1" == "CMAKE_FLAGS" || "$1" == "CUDA_PATH" ) ]]; then diff --git a/spack/package.py b/spack/package.py index 273cb30951..12ff294e94 100644 --- a/spack/package.py +++ b/spack/package.py @@ -91,9 +91,9 @@ def cmake_args(self): options.append('-DFF_USE_NCCL=OFF') if '+examples' in spec: - options.append('-DFF_BUILD_ALL_EXAMPLES=ON') + options.append('-DFF_BUILD_TRAINING_EXAMPLES=ON') else: - options.append('-DFF_BUILD_ALL_EXAMPLES=OFF') + options.append('-DFF_BUILD_TRAINING_EXAMPLES=OFF') if '+avx2' in spec: options.append('-DFF_USE_AVX2=ON') diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index e39cb29037..532dd00198 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -16,7 +16,9 @@ #include "flexflow/flexflow_c.h" #include "flexflow/dataloader.h" #include "flexflow/mapper.h" +#ifdef FF_BUILD_INFERENCE #include "flexflow/request_manager.h" +#endif #include "flexflow/utils/file_loader.h" using namespace Legion; @@ -58,6 +60,7 @@ class FFCObjectWrapper { FF_NEW_OPAQUE_WRAPPER(flexflow_dlrm_config_t, DLRMConfig *); FF_NEW_OPAQUE_WRAPPER(flexflow_single_dataloader_t, SingleDataLoader *); // inference +#ifdef FF_BUILD_INFERENCE 
FF_NEW_OPAQUE_WRAPPER(flexflow_batch_config_t, BatchConfig *); FF_NEW_OPAQUE_WRAPPER(flexflow_tree_verify_batch_config_t, TreeVerifyBatchConfig *); @@ -74,6 +77,7 @@ class FFCObjectWrapper { // LoraAdamOptimizerConfig *); FF_NEW_OPAQUE_WRAPPER(flexflow_lora_linear_config_t, LoraLinearConfig *); FF_NEW_OPAQUE_WRAPPER(flexflow_peft_model_id_t, PEFTModelID *); +#endif }; Logger ffc_log("flexflow_c"); @@ -1549,6 +1553,7 @@ flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, return FFCObjectWrapper::wrap(tensor); } +#ifdef FF_BUILD_INFERENCE flexflow_peft_model_id_t flexflow_model_add_lora_layer( flexflow_model_t handle_, const flexflow_lora_linear_config_t peft_config_) { @@ -1563,6 +1568,7 @@ flexflow_peft_model_id_t flexflow_model_add_lora_layer( peft_model_id); return FFCObjectWrapper::wrap(peft_model_id); } +#endif void flexflow_model_set_sgd_optimizer(flexflow_model_t handle_, flexflow_sgd_optimizer_t optimizer_) { @@ -1617,6 +1623,7 @@ void flexflow_model_set_transformer_layer_id(flexflow_model_t handle_, int id) { handle->set_transformer_layer_id(id); } +#ifdef FF_BUILD_INFERENCE void flexflow_model_generate(flexflow_model_t handle_, int num_requests, enum RequestType *request_types, @@ -1697,6 +1704,7 @@ void flexflow_model_generate(flexflow_model_t handle_, } } } +#endif void flexflow_model_set_position_offset(flexflow_model_t handle_, int const offset) { @@ -2584,6 +2592,8 @@ void flexflow_perform_registration(void) { true /*global*/); } +#ifdef FF_BUILD_INFERENCE + // ----------------------------------------------------------------------- // BatchConfig // ----------------------------------------------------------------------- @@ -3052,3 +3062,5 @@ void flexflow_peft_model_id_destroy(flexflow_peft_model_id_t handle_) { DEBUG_PRINT("[PEFTModelID] delete %p", peft_model_id); delete peft_model_id; } + +#endif diff --git a/src/ops/beam_topk.cu b/src/ops/beam_topk.cu index bf4c23cad0..a7aee338e4 100644 --- a/src/ops/beam_topk.cu +++ b/src/ops/beam_topk.cu @@ -15,7 +15,7 @@ #include "flexflow/ffconst_utils.h" #include "flexflow/ops/beam_topk.h" -#include "flexflow/request_manager.h" +// #include "flexflow/request_manager.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index ceb9277b76..5213633e73 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -71,7 +71,9 @@ #include "flexflow/parallel_ops/partition.h" #include "flexflow/parallel_ops/reduction.h" #include "flexflow/parallel_ops/replicate.h" +#ifdef FF_BUILD_INFERENCE #include "flexflow/request_manager.h" +#endif #include "flexflow/substitution.h" #include "flexflow/utils/random_utils.h" #include "flexflow/utils/test_utils.h" @@ -4684,6 +4686,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } +#ifdef FF_BUILD_INFERENCE // RequestManager load_tokens { TaskVariantRegistrar registrar(RM_LOAD_TOKENS_TASK_ID, @@ -4837,6 +4840,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } +#endif // ElementUnary task { TaskVariantRegistrar registrar(ELEMENTUNARY_INIT_TASK_ID, From ca3dabf7d23cf2173fca830249c4cb9eeb6171bf Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sat, 5 Oct 2024 11:36:34 -0700 Subject: [PATCH 28/44] [AllReduce] make AllReduce tasks concurrent in FlexFlow (#1517) * minor bug fix * make AllReduce tasks concurrent * set concurrent=true for remaining operators --------- Co-authored-by: Gabriele Oliaro --- src/ops/fused.cc | 6 ++++++ src/ops/lora_linear.cc | 2 ++ 
src/parallel_ops/allreduce.cc | 5 +++++ src/parallel_ops/parallel_identity.cc | 4 ++++ src/runtime/model.cc | 23 +++++++++++++++++++++++ 5 files changed, 40 insertions(+) diff --git a/src/ops/fused.cc b/src/ops/fused.cc index 121139beb1..720d678a4a 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -476,6 +476,7 @@ void FusedOp::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + launcher.concurrent = true; FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); switch (domain.get_dim()) { @@ -570,6 +571,7 @@ void FusedOp::init_inference(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.concurrent = true; FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); switch (domain.get_dim()) { @@ -604,6 +606,7 @@ void FusedOp::forward(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + launcher.concurrent = true; int offset = 0; for (int i = 0; i < numInputs; i++) { assert(inputs[i]->part != LogicalPartition::NO_PART); @@ -659,6 +662,7 @@ FutureMap FusedOp::inference(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.concurrent = true; launcher.add_future(bc); int offset = 0; for (int i = 0; i < numInputs; i++) { @@ -735,6 +739,7 @@ FutureMap FusedOp::peft_bwd(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.concurrent = true; launcher.add_future(bc); int offset = 0; for (int i = 0; i < numInputs; i++) { @@ -787,6 +792,7 @@ void FusedOp::backward(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + launcher.concurrent = true; int idx = 0; for (int i = 0; i < numInputs; i++) { launcher.add_region_requirement(RegionRequirement(inputs[i]->part, diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index fde6bc2b28..513147f3b7 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -296,6 +296,7 @@ void LoraLinear::init_inference( false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -795,6 +796,7 @@ FutureMap LoraLinear::peft_bwd(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.concurrent = true; launcher.add_future(bc); launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc index 52c4ec2e28..dc43d80133 100644 --- a/src/parallel_ops/allreduce.cc +++ b/src/parallel_ops/allreduce.cc @@ -131,6 +131,7 @@ void AllReduce::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -164,6 +165,7 @@ void AllReduce::forward(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -212,6 +214,7 @@ void AllReduce::backward(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, inputs[0]->machine_view.hash()); + // launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, 0 /*projection id*/, READ_WRITE, @@ -265,6 +268,7 @@ void AllReduce::init_inference(FFModel const &ff, false /*must*/, 0 
/*mapper_id*/, machine_view_hash); + launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -306,6 +310,7 @@ FutureMap AllReduce::inference(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.concurrent = true; launcher.add_future(bc); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, diff --git a/src/parallel_ops/parallel_identity.cc b/src/parallel_ops/parallel_identity.cc index 883910ae09..7d68036709 100644 --- a/src/parallel_ops/parallel_identity.cc +++ b/src/parallel_ops/parallel_identity.cc @@ -133,6 +133,7 @@ void ParallelIdentity::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -214,6 +215,7 @@ void ParallelIdentity::backward(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, inputs[0]->machine_view.hash()); + launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, 0 /*projection id*/, READ_WRITE, @@ -268,6 +270,7 @@ void ParallelIdentity::init_inference( false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -381,6 +384,7 @@ FutureMap false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.concurrent = true; launcher.add_future(bc); launcher.add_region_requirement( RegionRequirement(batch_inputs[0]->part_grad, diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 5213633e73..52f1dd2220 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -6888,6 +6888,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(LORA_LINEAR_INIT_TASK_ID, "LoraLinear Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "LoraLinear Init Task"); @@ -6919,6 +6920,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, "LoraLinear PEFT Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "LoraLinear PEFT Backward Task"); @@ -6950,6 +6952,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(FUSEDOP_INIT_TASK_ID, "FusedOp Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedOp Init Task"); @@ -6964,6 +6967,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(FUSEDOP_INF_TASK_ID, "FusedOp Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedOp Inference Task"); @@ -6979,6 +6983,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, "FusedOp PEFT Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedOp PEFT Backward Task"); @@ -6994,6 
+6999,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(FUSEDOP_FWD_TASK_ID, "FusedOp Forward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedOp Forward Task"); @@ -7008,6 +7014,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(FUSEDOP_BWD_TASK_ID, "FusedOp Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedOp Backward Task"); @@ -7244,6 +7251,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(ALLREDUCE_INIT_TASK_ID, "AllReduce Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "AllReduce init Task"); @@ -7258,6 +7266,9 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(ALLREDUCE_FWD_TASK_ID, "AllReduce Forward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + // AllReduce forward and backward must run concurrently since they + // use ncclAllReduce internally + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "AllReduce Forward Task"); @@ -7272,6 +7283,9 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(ALLREDUCE_BWD_TASK_ID, "AllReduce Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + // AllReduce forward and backward must run concurrently since they + // use ncclAllReduce internally + // registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "AllReduce Backward Task"); @@ -7287,6 +7301,9 @@ void register_flexflow_internal_tasks(Runtime *runtime, "AllReduce Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + // AllReduce forward and backward must run concurrently since they + // use ncclAllReduce internally + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "AllReduce Inference Task"); @@ -7302,6 +7319,9 @@ void register_flexflow_internal_tasks(Runtime *runtime, "AllReduce PEFT Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + // AllReduce forward and backward must run concurrently since they + // use ncclAllReduce internally + // registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "AllReduce PEFT Backward Task"); @@ -7318,6 +7338,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, "ParallelIdentity Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "ParallelIdentity init Task"); @@ -7349,6 +7370,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, "ParallelIdentity Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "ParallelIdentity Backward Task"); @@ -7381,6 +7403,7 @@ void 
register_flexflow_internal_tasks(Runtime *runtime, "ParallelIdentity PEFT Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "ParallelIdentity PEFT Backward Task"); From 96628b360efb6a0299dd9a3a652a91249b722231 Mon Sep 17 00:00:00 2001 From: Yingcheng <135535812+yingchen21@users.noreply.github.com> Date: Thu, 10 Oct 2024 06:27:49 +0800 Subject: [PATCH 29/44] Attention projections (QKV, O) disaggregation (#1436) * merged attn-qkv-proj into peft. commented out some alignment test, but should be equivalent to the oriinal test. * restored and passed the alignement test * linting * rebased onto inference * Bug fixes, uploaded missing cpp implmentation * Code cleanup * clean up * fixed problem with mpt. * update * llama3.1 support * fix * support llama3.2 * fix opt bias? * opt alignment test stub * fix bias * update * fix non-fusion opt * update * fix * cleanup * delete file * cleanup * shellcheck * hip cleanup * fix * hip fixes --------- Co-authored-by: Gabriele Oliaro Co-authored-by: zhihao Co-authored-by: Gabriele Oliaro --- .gitignore | 3 + .../ops/inc_multihead_self_attention.py | 6 - .../inc_multihead_self_attention_verify.py | 6 - .../ops/inc_multiquery_self_attention.py | 6 - .../inc_multiquery_self_attention_verify.py | 6 - .../ops/spec_inc_multihead_self_attention.py | 6 - .../ops/spec_inc_multiquery_self_attention.py | 6 - include/flexflow/flexflow_c.h | 48 +- include/flexflow/inference.h | 39 +- include/flexflow/layer.h | 3 + include/flexflow/model.h | 146 +- include/flexflow/operator.h | 8 +- .../ops/inc_multihead_self_attention.h | 54 +- .../ops/inc_multihead_self_attention_params.h | 5 +- .../inc_multihead_self_attention_kernels.h | 49 +- .../ops/spec_inc_multihead_self_attention.h | 25 +- ...spec_inc_multihead_self_attention_params.h | 4 +- .../ops/tree_inc_multihead_self_attention.h | 26 +- ...tree_inc_multihead_self_attention_params.h | 4 +- inference/models/falcon.cc | 81 +- inference/models/falcon.h | 29 +- inference/models/llama.cc | 72 +- inference/models/llama.h | 29 +- inference/models/mpt.cc | 54 +- inference/models/mpt.h | 2 + inference/models/opt.cc | 62 +- inference/models/opt.h | 9 +- inference/models/starcoder.cc | 55 +- inference/models/starcoder.h | 4 +- inference/python/incr_decoding.py | 10 +- python/flexflow/core/flexflow_cffi.py | 161 +- python/flexflow/serve/models/falcon.py | 56 +- python/flexflow/serve/models/llama.py | 56 +- python/flexflow/serve/models/mpt.py | 46 +- python/flexflow/serve/models/opt.py | 45 +- python/flexflow/serve/models/starcoder.py | 32 +- src/c/flexflow_c.cc | 114 +- src/ops/add_bias_residual_layer_norm.cc | 14 +- src/ops/fused.cpp | 48 +- src/ops/fused.cu | 55 +- src/ops/inc_multihead_self_attention.cc | 496 +-- src/ops/inc_multihead_self_attention.cpp | 1646 ++++----- src/ops/inc_multihead_self_attention.cu | 2972 ++++++++--------- src/ops/kernels/linear_kernels.cu | 1 + src/ops/linear.cc | 6 +- src/ops/residual_layer_norm.cc | 17 +- src/ops/spec_inc_multihead_self_attention.cc | 415 +-- src/ops/spec_inc_multihead_self_attention.cpp | 1056 +++--- src/ops/spec_inc_multihead_self_attention.cu | 101 +- src/ops/tree_inc_multihead_self_attention.cc | 385 +-- src/ops/tree_inc_multihead_self_attention.cpp | 411 +-- src/ops/tree_inc_multihead_self_attention.cu | 409 +-- src/parallel_ops/allreduce.cc | 2 +- src/runtime/file_loader.cc | 406 ++- src/runtime/graph.cc | 107 +- 
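In short, this commit pulls the attention projections out of the fused attention operators: the packed QKV input projection and the output (O) projection become ordinary ff.dense layers named "...qkv_proj" and "...o_proj", the attention ops now take the pre-projected QKV tensor as input, and the per-op bias/add_bias_kv flags plus the single apply_rotary_embedding boolean are replaced by a RotaryEmbeddingMeta struct read from the model config. A minimal sketch of the resulting call pattern for one decoder layer, following the falcon/llama model changes further down; it assumes a model-construction function that receives FFModel &ff and the post-layer-norm tensor att_norm, and the dimensions and "layers.0..." names are placeholders, not part of this diff:

    // Placeholder dimensions standing in for the model-config values.
    int hidden_size = 4096, num_attention_heads = 32, num_key_value_heads = 8;
    int head_dim = hidden_size / num_attention_heads;
    RotaryEmbeddingMeta rope_meta; // populated from the HF config in the real models

    // Packed q/k/v projection as a plain dense layer (the hunks below note
    // that hidden_size * 3 assumes replicated heads).
    Tensor qkv_proj = ff.dense(att_norm,
                               hidden_size * 3,
                               AC_MODE_NONE,
                               false /*bias*/,
                               DT_NONE,
                               nullptr, nullptr, nullptr,
                               REG_MODE_NONE,
                               0.0f,
                               "layers.0.self_attn.qkv_proj");

    // Attention now consumes the projected tensor and carries the rotary
    // embedding settings instead of bias flags.
    Tensor o_proj = ff.inc_multiquery_self_attention(qkv_proj,
                                                     hidden_size,
                                                     num_attention_heads,
                                                     num_key_value_heads,
                                                     head_dim,
                                                     head_dim,
                                                     0.0f /*dropout*/,
                                                     false /*add_zero_attn*/,
                                                     DT_NONE,
                                                     nullptr /*kernel_initializer*/,
                                                     rope_meta,
                                                     false /*scaling_query*/,
                                                     1.0f /*scaling_factor*/,
                                                     true /*qk_prod_scaling*/,
                                                     false /*position_bias*/,
                                                     "layers.0.self_attn");

    // Output projection, also a plain dense layer.
    Tensor attn_out = ff.dense(o_proj,
                               hidden_size,
                               AC_MODE_NONE,
                               false /*bias*/,
                               DT_NONE,
                               nullptr, nullptr, nullptr,
                               REG_MODE_NONE,
                               0.0f,
                               "layers.0.self_attn.o_proj");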
src/runtime/inference_manager.cc | 1 + src/runtime/layer.cc | 17 + src/runtime/model.cc | 51 +- src/runtime/operator.cc | 12 + src/runtime/substitution.cc | 5 +- tests/fine_grained_alignment_test.sh | 106 + tests/inference/huggingface_inference.py | 49 +- tests/inference/inference_alignment_test.py | 817 +++++ tests/peft/alignment/align_test_utils.py | 13 +- tests/peft/hf_finetune.py | 2 +- tests/peft/hf_utils.py | 15 +- tests/peft/peft_alignment_test.py | 39 +- 67 files changed, 5146 insertions(+), 5895 deletions(-) create mode 100755 tests/fine_grained_alignment_test.sh create mode 100644 tests/inference/inference_alignment_test.py diff --git a/.gitignore b/.gitignore index cc34c1a7b6..c1e22fcaba 100644 --- a/.gitignore +++ b/.gitignore @@ -193,3 +193,6 @@ lora_training_logs Untitled-1.ipynb Untitled-2.ipynb tests/inference/python_test_configs/*.json + +core.* +fine_grained_alignment_config.json diff --git a/examples/python/native/ops/inc_multihead_self_attention.py b/examples/python/native/ops/inc_multihead_self_attention.py index dce7bd565d..ab80a5893c 100644 --- a/examples/python/native/ops/inc_multihead_self_attention.py +++ b/examples/python/native/ops/inc_multihead_self_attention.py @@ -11,8 +11,6 @@ def test_inc_multihead_self_attention( kdim: int = 0, vdim: int = 0, dropout: float = 0.0, - bias: bool = True, - add_bias_kv: bool = False, add_zero_attn: bool = False, data_type: DataType = DataType.DT_NONE, kernel_initializer=None, @@ -34,8 +32,6 @@ def test_inc_multihead_self_attention( kdim=kdim, vdim=vdim, dropout=dropout, - bias=bias, - add_bias_kv=add_bias_kv, add_zero_attn=add_zero_attn, data_type=data_type, kernel_initializer=kernel_initializer, @@ -85,8 +81,6 @@ def test_inc_multihead_self_attention( kdim=0, # Example value for kdim vdim=0, # Example value for vdim dropout=0.1, # Example value for dropout - bias=True, - add_bias_kv=False, add_zero_attn=False, data_type=DataType.DT_FLOAT, kernel_initializer=None, # Example value for kernel_initializer diff --git a/examples/python/native/ops/inc_multihead_self_attention_verify.py b/examples/python/native/ops/inc_multihead_self_attention_verify.py index f6dc8e3933..bc2ba5e977 100644 --- a/examples/python/native/ops/inc_multihead_self_attention_verify.py +++ b/examples/python/native/ops/inc_multihead_self_attention_verify.py @@ -11,8 +11,6 @@ def test_inc_multihead_self_attention_verify( kdim: int = 0, vdim: int = 0, dropout: float = 0.0, - bias: bool = True, - add_bias_kv: bool = False, add_zero_attn: bool = False, data_type: DataType = DataType.DT_NONE, kernel_initializer=None, @@ -34,8 +32,6 @@ def test_inc_multihead_self_attention_verify( kdim=kdim, vdim=vdim, dropout=dropout, - bias=bias, - add_bias_kv=add_bias_kv, add_zero_attn=add_zero_attn, data_type=data_type, kernel_initializer=kernel_initializer, @@ -85,8 +81,6 @@ def test_inc_multihead_self_attention_verify( kdim=0, # Example value for kdim vdim=0, # Example value for vdim dropout=0.1, # Example value for dropout - bias=True, - add_bias_kv=False, add_zero_attn=False, data_type=DataType.DT_FLOAT, kernel_initializer=None, # Example value for kernel_initializer diff --git a/examples/python/native/ops/inc_multiquery_self_attention.py b/examples/python/native/ops/inc_multiquery_self_attention.py index 33390ab1f6..424b46b0f4 100644 --- a/examples/python/native/ops/inc_multiquery_self_attention.py +++ b/examples/python/native/ops/inc_multiquery_self_attention.py @@ -12,8 +12,6 @@ def test_inc_multiquery_self_attention( kdim: int = 0, vdim: int = 0, dropout: float = 0.0, - 
bias: bool = True, - add_bias_kv: bool = False, add_zero_attn: bool = False, data_type: DataType = DataType.DT_NONE, kernel_initializer=None, @@ -36,8 +34,6 @@ def test_inc_multiquery_self_attention( kdim=kdim, vdim=vdim, dropout=dropout, - bias=bias, - add_bias_kv=add_bias_kv, add_zero_attn=add_zero_attn, data_type=data_type, kernel_initializer=kernel_initializer, @@ -89,8 +85,6 @@ def test_inc_multiquery_self_attention( kdim=0, # Example value for kdim vdim=0, # Example value for vdim dropout=0.1, # Example value for dropout - bias=True, - add_bias_kv=False, add_zero_attn=False, data_type=DataType.DT_FLOAT, kernel_initializer=None, # Example value for kernel_initializer diff --git a/examples/python/native/ops/inc_multiquery_self_attention_verify.py b/examples/python/native/ops/inc_multiquery_self_attention_verify.py index 69a76f68bf..b2c0e7dcf5 100644 --- a/examples/python/native/ops/inc_multiquery_self_attention_verify.py +++ b/examples/python/native/ops/inc_multiquery_self_attention_verify.py @@ -12,8 +12,6 @@ def test_inc_multiquery_self_attention_verify( kdim: int = 0, vdim: int = 0, dropout: float = 0.0, - bias: bool = True, - add_bias_kv: bool = False, add_zero_attn: bool = False, data_type: DataType = DataType.DT_NONE, kernel_initializer=None, @@ -36,8 +34,6 @@ def test_inc_multiquery_self_attention_verify( kdim=kdim, vdim=vdim, dropout=dropout, - bias=bias, - add_bias_kv=add_bias_kv, add_zero_attn=add_zero_attn, data_type=data_type, kernel_initializer=kernel_initializer, @@ -89,8 +85,6 @@ def test_inc_multiquery_self_attention_verify( kdim=0, # Example value for kdim vdim=0, # Example value for vdim dropout=0.1, # Example value for dropout - bias=True, - add_bias_kv=False, add_zero_attn=False, data_type=DataType.DT_FLOAT, kernel_initializer=None, # Example value for kernel_initializer diff --git a/examples/python/native/ops/spec_inc_multihead_self_attention.py b/examples/python/native/ops/spec_inc_multihead_self_attention.py index bd1aaa189b..d0fa5f7689 100644 --- a/examples/python/native/ops/spec_inc_multihead_self_attention.py +++ b/examples/python/native/ops/spec_inc_multihead_self_attention.py @@ -11,8 +11,6 @@ def test_spec_inc_multihead_self_attention( kdim: int = 0, vdim: int = 0, dropout: float = 0.0, - bias: bool = True, - add_bias_kv: bool = False, add_zero_attn: bool = False, data_type: DataType = DataType.DT_NONE, kernel_initializer=None, @@ -34,8 +32,6 @@ def test_spec_inc_multihead_self_attention( kdim=kdim, vdim=vdim, dropout=dropout, - bias=bias, - add_bias_kv=add_bias_kv, add_zero_attn=add_zero_attn, data_type=data_type, kernel_initializer=kernel_initializer, @@ -85,8 +81,6 @@ def test_spec_inc_multihead_self_attention( kdim=0, # Example value for kdim vdim=0, # Example value for vdim dropout=0.1, # Example value for dropout - bias=True, - add_bias_kv=False, add_zero_attn=False, data_type=DataType.DT_FLOAT, kernel_initializer=None, # Example value for kernel_initializer diff --git a/examples/python/native/ops/spec_inc_multiquery_self_attention.py b/examples/python/native/ops/spec_inc_multiquery_self_attention.py index 0b731c99e0..0d04f639c9 100644 --- a/examples/python/native/ops/spec_inc_multiquery_self_attention.py +++ b/examples/python/native/ops/spec_inc_multiquery_self_attention.py @@ -12,8 +12,6 @@ def test_spec_inc_multiquery_self_attention( kdim: int = 0, vdim: int = 0, dropout: float = 0.0, - bias: bool = True, - add_bias_kv: bool = False, add_zero_attn: bool = False, data_type: DataType = DataType.DT_NONE, kernel_initializer=None, @@ -36,8 +34,6 @@ def 
test_spec_inc_multiquery_self_attention( kdim=kdim, vdim=vdim, dropout=dropout, - bias=bias, - add_bias_kv=add_bias_kv, add_zero_attn=add_zero_attn, data_type=data_type, kernel_initializer=kernel_initializer, @@ -89,8 +85,6 @@ def test_spec_inc_multiquery_self_attention( kdim=0, # Example value for kdim vdim=0, # Example value for vdim dropout=0.1, # Example value for dropout - bias=True, - add_bias_kv=False, add_zero_attn=False, data_type=DataType.DT_FLOAT, kernel_initializer=None, # Example value for kernel_initializer diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 52b4b3d362..c1e18e660b 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -445,12 +445,16 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention( int kdim, int vdim, float dropout, - bool bias, - bool add_bias_kv, bool add_zero_attn, enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, @@ -465,12 +469,16 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention( int kdim, int vdim, float dropout, - bool bias, - bool add_bias_kv, bool add_zero_attn, enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, @@ -485,12 +493,16 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( int kdim, int vdim, float dropout, - bool bias, - bool add_bias_kv, bool add_zero_attn, enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, @@ -506,12 +518,16 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention( int kdim, int vdim, float dropout, - bool bias, - bool add_bias_kv, bool add_zero_attn, enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, @@ -527,12 +543,16 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention( int kdim, int vdim, float dropout, - bool bias, - bool add_bias_kv, bool add_zero_attn, enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, @@ -548,12 +568,16 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify( int kdim, int vdim, float dropout, - bool bias, - bool add_bias_kv, bool add_zero_attn, enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float 
rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h index ba4101c173..755df9f5cb 100644 --- a/include/flexflow/inference.h +++ b/include/flexflow/inference.h @@ -43,8 +43,43 @@ struct GenerationResult { std::vector finetuning_losses; }; -#include -#include +struct RotaryEmbeddingMeta { + bool apply_rotary_embedding = false; + float rope_theta = 10000.0f; + std::string rope_type = "default"; + float factor = 8.0f; + float low_freq_factor = 1.0f; + float high_freq_factor = 4.0f; + int original_max_position_embeddings = 8192; + + RotaryEmbeddingMeta(bool apply_rotary_embedding_ = false, + float rope_theta_ = 10000.0f, + std::string rope_type_ = "default", + float factor_ = 8.0f, + float low_freq_factor_ = 1.0f, + float high_freq_factor_ = 4.0f, + int original_max_position_embeddings_ = 8192) + : apply_rotary_embedding(apply_rotary_embedding_), + rope_theta(rope_theta_), rope_type(rope_type_), factor(factor_), + low_freq_factor(low_freq_factor_), high_freq_factor(high_freq_factor_), + original_max_position_embeddings(original_max_position_embeddings_) {} + + friend std::ostream &operator<<(std::ostream &os, + RotaryEmbeddingMeta const &meta) { + os << std::boolalpha // To print bool as true/false instead of 1/0 + << "RotaryEmbeddingMeta {\n" + << " apply_rotary_embedding: " << meta.apply_rotary_embedding << ",\n" + << " rope_theta: " << meta.rope_theta << ",\n" + << " rope_type: \"" << meta.rope_type << "\",\n" + << " factor: " << meta.factor << ",\n" + << " low_freq_factor: " << meta.low_freq_factor << ",\n" + << " high_freq_factor: " << meta.high_freq_factor << ",\n" + << " original_max_position_embeddings: " + << meta.original_max_position_embeddings << "\n" + << "}"; + return os; + } +}; std::string join_path(std::vector const &paths); diff --git a/include/flexflow/layer.h b/include/flexflow/layer.h index c3dbcac422..e18bad3982 100644 --- a/include/flexflow/layer.h +++ b/include/flexflow/layer.h @@ -32,11 +32,13 @@ class Layer { void add_float_property(std::string const &key, float value); void add_int_vector_property(std::string const &key, std::vector const &value); + void add_string_property(std::string const &key, std::string const &value); void add_initializer(std::string const &key, Initializer *initializer); bool get_int_property(std::string const &key, long long &value) const; bool get_float_property(std::string const &key, float &value) const; bool get_int_vector_property(std::string const &key, std::vector &value) const; + bool get_string_property(std::string const &key, std::string &value) const; bool get_initializer(std::string const &key, Initializer *&initializer) const; Tensor get_parameter(int index); void print(); @@ -59,6 +61,7 @@ class Layer { std::unordered_map float_properties; std::unordered_map initializers; std::unordered_map> int_vector_properties; + std::unordered_map string_properties; }; }; // namespace FlexFlow diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 4ad735ef7d..51b7950db8 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -733,41 +733,38 @@ class FFModel { DataType data_type = DT_NONE, Initializer *kernel_initializer = NULL, char const *name = NULL); - Tensor inc_multihead_self_attention(const Tensor input, - int embed_dim, - int num_heads, - int kdim = 0, - int vdim = 0, - float dropout = 0.0f, - bool bias = false, - bool 
add_bias_kv = false, - bool add_zero_attn = false, - DataType data_type = DT_NONE, - Initializer *kernel_initializer = NULL, - bool apply_rotary_embedding = false, - bool scaling_query = false, - float scaling_factor = 1.0f, - bool qk_prod_scaling = true, - bool position_bias = false, - char const *name = NULL); - Tensor - spec_inc_multihead_self_attention(const Tensor input, - int embed_dim, - int num_heads, - int kdim = 0, - int vdim = 0, - float dropout = 0.0f, - bool bias = false, - bool add_bias_kv = false, - bool add_zero_attn = false, - DataType data_type = DT_NONE, - Initializer *kernel_initializer = NULL, - bool apply_rotary_embedding = false, - bool scaling_query = false, - float scaling_factor = 1.0f, - bool qk_prod_scaling = true, - bool position_bias = false, - char const *name = NULL); + Tensor inc_multihead_self_attention( + const Tensor input, + int embed_dim, + int num_heads, + int kdim = 0, + int vdim = 0, + float dropout = 0.0f, + bool add_zero_attn = false, + DataType data_type = DT_NONE, + Initializer *kernel_initializer = NULL, + RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(), + bool scaling_query = false, + float scaling_factor = 1.0f, + bool qk_prod_scaling = true, + bool position_bias = false, + char const *name = NULL); + Tensor spec_inc_multihead_self_attention( + const Tensor input, + int embed_dim, + int num_heads, + int kdim = 0, + int vdim = 0, + float dropout = 0.0f, + bool add_zero_attn = false, + DataType data_type = DT_NONE, + Initializer *kernel_initializer = NULL, + RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(), + bool scaling_query = false, + float scaling_factor = 1.0f, + bool qk_prod_scaling = true, + bool position_bias = false, + char const *name = NULL); Tensor inc_multihead_self_attention_verify( const Tensor input, int embed_dim, @@ -775,54 +772,49 @@ class FFModel { int kdim = 0, int vdim = 0, float dropout = 0.0f, - bool bias = false, - bool add_bias_kv = false, bool add_zero_attn = false, DataType data_type = DT_NONE, Initializer *kernel_initializer = NULL, - bool apply_rotary_embedding = false, + RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(), + bool scaling_query = false, + float scaling_factor = 1.0f, + bool qk_prod_scaling = true, + bool position_bias = false, + char const *name = NULL); + Tensor inc_multiquery_self_attention( + const Tensor input, + int embed_dim, + int num_q_heads, + int num_kv_heads, + int kdim = 0, + int vdim = 0, + float dropout = 0.0f, + bool add_zero_attn = false, + DataType data_type = DT_NONE, + Initializer *kernel_initializer = NULL, + RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(), + bool scaling_query = false, + float scaling_factor = 1.0f, + bool qk_prod_scaling = true, + bool position_bias = false, + char const *name = NULL); + Tensor spec_inc_multiquery_self_attention( + const Tensor input, + int embed_dim, + int num_q_heads, + int num_kv_heads, + int kdim = 0, + int vdim = 0, + float dropout = 0.0f, + bool add_zero_attn = false, + DataType data_type = DT_NONE, + Initializer *kernel_initializer = NULL, + RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(), bool scaling_query = false, float scaling_factor = 1.0f, bool qk_prod_scaling = true, bool position_bias = false, char const *name = NULL); - Tensor inc_multiquery_self_attention(const Tensor input, - int embed_dim, - int num_q_heads, - int num_kv_heads, - int kdim = 0, - int vdim = 0, - float dropout = 0.0f, - bool bias = false, - bool add_bias_kv = false, - bool 
add_zero_attn = false, - DataType data_type = DT_NONE, - Initializer *kernel_initializer = NULL, - bool apply_rotary_embedding = false, - bool scaling_query = false, - float scaling_factor = 1.0f, - bool qk_prod_scaling = true, - bool position_bias = false, - char const *name = NULL); - Tensor - spec_inc_multiquery_self_attention(const Tensor input, - int embed_dim, - int num_q_heads, - int num_kv_heads, - int kdim = 0, - int vdim = 0, - float dropout = 0.0f, - bool bias = false, - bool add_bias_kv = false, - bool add_zero_attn = false, - DataType data_type = DT_NONE, - Initializer *kernel_initializer = NULL, - bool apply_rotary_embedding = false, - bool scaling_query = false, - float scaling_factor = 1.0f, - bool qk_prod_scaling = true, - bool position_bias = false, - char const *name = NULL); Tensor inc_multiquery_self_attention_verify( const Tensor input, int embed_dim, @@ -831,12 +823,10 @@ class FFModel { int kdim = 0, int vdim = 0, float dropout = 0.0f, - bool bias = false, - bool add_bias_kv = false, bool add_zero_attn = false, DataType data_type = DT_NONE, Initializer *kernel_initializer = NULL, - bool apply_rotary_embedding = false, + RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(), bool scaling_query = false, float scaling_factor = 1.0f, bool qk_prod_scaling = true, diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 1a5af67b36..007314797a 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -335,7 +335,13 @@ class Op { // only dump the weights in the forward pass, at the first step // note that we do not save the weight gradients, since we only support // finetuning LoRA weights, which are not FF tensors. - if (fwd_pass && m->decoding_step == 0) { + // Set FF_DEBG_NO_WEIGHTS=1 or to FF_DEBG_NO_WEIGHTS=true to disable saving + // weights + bool do_not_save_weights = + (std::getenv("FF_DEBG_NO_WEIGHTS") && + (std::string(std::getenv("FF_DEBG_NO_WEIGHTS")) == "1" || + std::string(std::getenv("FF_DEBG_NO_WEIGHTS")) == "true")); + if (fwd_pass && m->decoding_step == 0 && !do_not_save_weights) { fs::path dst_filepath_weights = get_dst_folder("weights", m->decoding_step, shard_id, before_kernel) / layername; diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index f77df7c456..4519cf8215 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -36,49 +36,40 @@ class IncMultiHeadSelfAttention : public Op { int _kdim, int _vdim, float _dropout, - bool _qkv_bias, - bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, bool _position_bias, - bool allocate_weights, DataType _quantization_type, bool _offload, int _tensor_parallelism_degree, char const *name); IncMultiHeadSelfAttention(FFModel &model, ParallelTensor const _input, - ParallelTensor const _weight, int _embed_dim, int _num_q_heads, int _num_kv_heads, int _kdim, int _vdim, float _dropout, - bool _qkv_bias, - bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, bool _position_bias, - bool allocate_weights, DataType _quantization_type, bool _offload, int _tensor_parallelism_degree, char const *name); IncMultiHeadSelfAttention(FFModel &model, IncMultiHeadSelfAttention const 
&other, - ParallelTensor const input, - bool allocate_weights); + ParallelTensor const input); IncMultiHeadSelfAttention(FFModel &model, Params const ¶ms, Input const &inputs, - bool allocate_weights = false, char const *name = nullptr); static Op * create_operator_from_layer(FFModel &model, @@ -125,24 +116,20 @@ class IncMultiHeadSelfAttention : public Op { BatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, - GenericTensorAccessorR const &weight, - GenericTensorAccessorW const &output, - GenericTensorAccessorR const &bias); - static void peft_bwd_kernel_wrapper(IncMultiHeadSelfAttentionMeta *m, - BatchConfig const *bc, - int shard_id, - GenericTensorAccessorW const &input_grad, - GenericTensorAccessorR const &weight, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorR const &bias); + GenericTensorAccessorW const &output); + static void + peft_bwd_kernel_wrapper(IncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + int shard_id, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad); Params get_params() const; public: int num_q_heads, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; - bool qkv_bias; - bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query, - qk_prod_scaling, position_bias; + bool add_zero_attn, scaling_query, qk_prod_scaling, position_bias; + RotaryEmbeddingMeta rotary_embedding_meta; int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; int qoSeqLength, kvSeqLength; DataType quantization_type; @@ -153,7 +140,6 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { public: IncMultiHeadSelfAttentionMeta(FFHandler handler, IncMultiHeadSelfAttention const *attn, - GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, int _num_q_heads, @@ -168,14 +154,11 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { int _kProjSize, int _vProjSize, int _oProjSize, - bool _apply_rotary_embedding, - bool _qkv_bias, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, bool _qk_prod_scaling, bool _position_bias, - bool _final_bias, float _scaling_factor, - GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, int _global_num_q_heads, @@ -188,30 +171,23 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { public: Realm::RegionInstance reserveInst; - size_t weights_params, weightSize, biasSize, reserveSpaceSize, - quantized_weightSize; + size_t reserveSpaceSize; int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; int global_num_q_heads, global_num_kv_heads, num_q_heads, num_kv_heads, hidden_size; - bool *has_load_weights; - bool *apply_rotary_embedding; - bool *qkv_bias; - bool *final_bias; + RotaryEmbeddingMeta *rotary_embedding_meta; bool *scaling_query; bool *qk_prod_scaling; bool *position_bias; float scaling_factor; - void *weight_ptr, *bias_ptr; // for weight offload void *devQKVProjArray, *keyCache, *valueCache; void *qk_prods, *qk_prods_softmax; void *attn_heads; - char *quantized_weight_ptr; BatchConfig::PerTokenInfo *token_infos; BatchConfig::PerRequestInfo *request_infos; DataType quantization_type; bool offload; #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) - // cudaStream_t task_local_stream; cudnnTensorDescriptor_t qk_tensor; cuFloatComplex *complex_input; #elif defined(FF_USE_HIP_ROCM) diff --git a/include/flexflow/ops/inc_multihead_self_attention_params.h b/include/flexflow/ops/inc_multihead_self_attention_params.h index 
58681069e2..9b0a26e5d7 100644 --- a/include/flexflow/ops/inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/inc_multihead_self_attention_params.h @@ -3,6 +3,7 @@ #include "flexflow/ffconst.h" #include "flexflow/fftype.h" +#include "flexflow/inference.h" #include "flexflow/parallel_tensor.h" namespace FlexFlow { @@ -12,8 +13,8 @@ struct IncMultiHeadSelfAttentionParams { int embed_dim, num_q_heads, kdim, vdim, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; - bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, - scaling_query, qk_prod_scaling, position_bias; + bool add_zero_attn, scaling_query, qk_prod_scaling, position_bias; + RotaryEmbeddingMeta rotary_embedding_meta; DataType quantization_type; bool offload; char name[MAX_OPNAME]; diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h index 26dcf12425..16d5915381 100644 --- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h @@ -14,6 +14,11 @@ namespace FlexFlow { namespace Kernels { namespace IncMultiHeadAttention { +template +void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + int shard_id, + ffStream_t stream); template void compute_attention_kernel_generation(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, @@ -21,14 +26,11 @@ void compute_attention_kernel_generation(IncMultiHeadSelfAttentionMeta const *m, ffStream_t stream); template -void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - DT *output_ptr, - DT const *weight_ptr, - DT const *bias_ptr, - int num_tokens, - ffStream_t stream); +void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT *output_ptr, + ffStream_t stream); template __global__ void apply_position_bias_qkprd(DT *input_ptr, @@ -38,27 +40,6 @@ __global__ void apply_position_bias_qkprd(DT *input_ptr, int global_num_q_heads, int shard_id); -template -__global__ void apply_proj_bias_w(DT *input_ptr, - DT const *bias_ptr, - int num_tokens, - int qkv_weight_size, - int oProjSize); - -template -__global__ void apply_proj_bias_qkv(DT *input_ptr, - DT const *bias_ptr, - int shard_id, - int num_tokens, - int qProjSize, - int kProjSize, - int vProjSize, - int num_heads, - int num_kv_heads, - bool scaling_query, - float scaling_factor, - int hidden_size); - #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) template __global__ void @@ -91,16 +72,6 @@ __global__ void bool q_tensor); #endif -template -void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - DT const *input_ptr, - DT const *weight_ptr, - DT *output_ptr, - DT const *bias_ptr, - ffStream_t stream); - template void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, GenericTensorAccessorR const weight, diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h index a0d01092bf..155132a7fe 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention.h +++ b/include/flexflow/ops/spec_inc_multihead_self_attention.h @@ -33,43 +33,34 @@ class SpecIncMultiHeadSelfAttention : public Op { int _kdim, int _vdim, float _dropout, - bool _qkv_bias, - bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + 
RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, bool _position_bias, - bool allocate_weights, char const *name); SpecIncMultiHeadSelfAttention(FFModel &model, const ParallelTensor _input, - const ParallelTensor _weight, int _embed_dim, int _num_q_heads, int _num_kv_heads, int _kdim, int _vdim, float _dropout, - bool _qkv_bias, - bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, bool _position_bias, - bool allocate_weights, char const *name); SpecIncMultiHeadSelfAttention(FFModel &model, SpecIncMultiHeadSelfAttention const &other, - const ParallelTensor input, - bool allocate_weights); + const ParallelTensor input); SpecIncMultiHeadSelfAttention(FFModel &model, Params const ¶ms, Input const &inputs, - bool allocate_weights = false, char const *name = nullptr); static Op * create_operator_from_layer(FFModel &model, @@ -112,17 +103,14 @@ class SpecIncMultiHeadSelfAttention : public Op { BeamSearchBatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, - GenericTensorAccessorR const &weight, - GenericTensorAccessorW const &output, - GenericTensorAccessorR const &bias); + GenericTensorAccessorW const &output); Params get_params() const; public: int num_q_heads, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; - bool qkv_bias; - bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query, - qk_prod_scaling, position_bias; + bool add_zero_attn, scaling_query, qk_prod_scaling, position_bias; + RotaryEmbeddingMeta rotary_embedding_meta; int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; int qoSeqLength, kvSeqLength; }; @@ -131,7 +119,6 @@ class SpecIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { public: SpecIncMultiHeadSelfAttentionMeta(FFHandler handler, SpecIncMultiHeadSelfAttention const *attn, - GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, int _num_q_heads, diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h index 1461224ba9..a0ae3fc4f2 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h @@ -11,8 +11,8 @@ struct SpecIncMultiHeadSelfAttentionParams { LayerID layer_guid; int embed_dim, num_q_heads, num_kv_heads, kdim, vdim; float dropout, scaling_factor; - bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, - scaling_query, qk_prod_scaling, position_bias; + bool add_zero_attn, scaling_query, qk_prod_scaling, position_bias; + RotaryEmbeddingMeta rotary_embedding_meta; char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; }; diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h index 168ad5f618..9755e62d42 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h @@ -33,49 +33,40 @@ class TreeIncMultiHeadSelfAttention : public Op { int _kdim, int _vdim, float _dropout, - bool _qkv_bias, - bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, bool _position_bias, - bool 
allocate_weights, DataType _quantization_type, bool _offload, int _tensor_parallelism_degree, char const *name); TreeIncMultiHeadSelfAttention(FFModel &model, const ParallelTensor _input, - const ParallelTensor _weight, int _embed_dim, int _num_q_heads, int _num_kv_heads, int _kdim, int _vdim, float _dropout, - bool _qkv_bias, - bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, bool _position_bias, - bool allocate_weights, DataType _quantization_type, bool _offload, int _tensor_parallelism_degree, char const *name); TreeIncMultiHeadSelfAttention(FFModel &model, TreeIncMultiHeadSelfAttention const &other, - const ParallelTensor input, - bool allocate_weights); + const ParallelTensor input); TreeIncMultiHeadSelfAttention(FFModel &model, Params const ¶ms, Input const &inputs, - bool allocate_weights = false, char const *name = nullptr); static Op * create_operator_from_layer(FFModel &model, @@ -114,18 +105,14 @@ class TreeIncMultiHeadSelfAttention : public Op { TreeVerifyBatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, - GenericTensorAccessorR const &weight, - GenericTensorAccessorW const &output, - GenericTensorAccessorR const &bias); - + GenericTensorAccessorW const &output); Params get_params() const; public: int num_q_heads, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; - bool qkv_bias; - bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query, - qk_prod_scaling, position_bias; + bool add_zero_attn, scaling_query, qk_prod_scaling, position_bias; + RotaryEmbeddingMeta rotary_embedding_meta; int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; int qoSeqLength, kvSeqLength; DataType quantization_type; @@ -136,7 +123,6 @@ class TreeIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { public: TreeIncMultiHeadSelfAttentionMeta(FFHandler handler, TreeIncMultiHeadSelfAttention const *attn, - GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, int _num_q_heads, diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention_params.h b/include/flexflow/ops/tree_inc_multihead_self_attention_params.h index d1a51b8b8f..b49db2c10d 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention_params.h @@ -12,8 +12,8 @@ struct TreeIncMultiHeadSelfAttentionParams { int embed_dim, num_q_heads, kdim, vdim, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; - bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, - scaling_query, qk_prod_scaling, position_bias; + bool add_zero_attn, scaling_query, qk_prod_scaling, position_bias; + RotaryEmbeddingMeta rotary_embedding_meta; DataType quantization_type; bool offload; char name[MAX_OPNAME]; diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index 195d6ba7e3..fd4da87b99 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -60,6 +60,7 @@ void FALCON::create_falcon_model(FFModel &ff, "word_embeddings"); Tensor mha = nullptr, mlp_output = nullptr; + Tensor qkv_proj = nullptr, o_proj = nullptr; Tensor res_ln_outputs[2] = {nullptr, nullptr}; for (int i = 0; i < falcon_config.n_layer; i++) { @@ -97,26 +98,41 @@ void FALCON::create_falcon_model(FFModel &ff, att_norm = res_ln_outputs[1]; } + qkv_proj = ff.dense( + att_norm, + falcon_config.hidden_size * 
+ 3, // q, k, v. need to change if want to remove replication. + // (q_heads + 2 * kv_heads) * proj_size + AC_MODE_NONE, + false, // seems like it does not use bias + DT_NONE, // what is this + nullptr, // ? + nullptr, // ? + nullptr, // ? + REG_MODE_NONE, // no regularization + 0.0f, // no dropout + std::string("layers." + std::to_string(i) + ".self_attention.qkv_proj") + .c_str()); + qkv_proj->print("qkv_proj"); + switch (mode) { case BEAM_SEARCH_MODE: { - mha = ff.spec_inc_multiquery_self_attention( - att_norm, + o_proj = ff.spec_inc_multiquery_self_attention( + qkv_proj, falcon_config.hidden_size, falcon_config.n_head, falcon_config.n_head_kv, falcon_config.hidden_size / falcon_config.n_head, falcon_config.hidden_size / falcon_config.n_head, 0.0f, /*dropout*/ - false, /*qkv_bias*/ - false, /*final_bias*/ false, /*add_zero_attn*/ DT_NONE, /*data_type*/ NULL, /*kernel_initializer*/ - true, /*apply_rotary_embedding*/ - false, /*scaling query*/ - 1.0f, /*scaling factor*/ - true, /*qk_prod_scaling*/ - false, /*position_bias*/ + falcon_config.rotary_embedding_meta, + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ std::string("layers." + std::to_string(i) + ".self_attention") .c_str() /*name*/ ); @@ -124,24 +140,22 @@ void FALCON::create_falcon_model(FFModel &ff, } case TREE_VERIFY_MODE: { - mha = ff.inc_multiquery_self_attention_verify( - att_norm, + o_proj = ff.inc_multiquery_self_attention_verify( + qkv_proj, falcon_config.hidden_size, falcon_config.n_head, falcon_config.n_head_kv, falcon_config.hidden_size / falcon_config.n_head, falcon_config.hidden_size / falcon_config.n_head, 0.0f, /*dropout*/ - false, /*qkv_bias*/ - false, /*final_bias*/ false, /*add_zero_attn*/ DT_NONE, /*data_type*/ nullptr, /*kernel_initializer*/ - true, /*apply_rotary_embedding*/ - false, /*scaling query*/ - 1.0f, /*scaling factor*/ - true, /*qk_prod_scaling*/ - false, /*position_bias*/ + falcon_config.rotary_embedding_meta, + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ std::string("layers." + std::to_string(i) + ".self_attention") .c_str() /*name*/ ); @@ -149,24 +163,22 @@ void FALCON::create_falcon_model(FFModel &ff, } case INC_DECODING_MODE: { - mha = ff.inc_multiquery_self_attention( - att_norm, + o_proj = ff.inc_multiquery_self_attention( + qkv_proj, falcon_config.hidden_size, falcon_config.n_head, falcon_config.n_head_kv, falcon_config.hidden_size / falcon_config.n_head, falcon_config.hidden_size / falcon_config.n_head, 0.0f, /*dropout*/ - false, /*qkv_bias*/ - false, /*final_bias*/ false, /*add_zero_attn*/ DT_NONE, /*data_type*/ nullptr, /*kernel_initializer*/ - true, /*apply_rotary_embedding*/ - false, /*scaling query*/ - 1.0f, /*scaling factor*/ - true, /*qk_prod_scaling*/ - false, /*position_bias*/ + falcon_config.rotary_embedding_meta, + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ std::string("layers." + std::to_string(i) + ".self_attention") .c_str() /*name*/ ); @@ -177,6 +189,21 @@ void FALCON::create_falcon_model(FFModel &ff, } } + mha = ff.dense( + o_proj, + falcon_config.hidden_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." 
+ std::to_string(i) + ".self_attention.o_proj") + .c_str()); + mha->print("mha"); + Tensor dense_h_to_4h = ff.dense( att_norm, falcon_config.hidden_size * 4, diff --git a/inference/models/falcon.h b/inference/models/falcon.h index fce2dade3f..565d7e5419 100644 --- a/inference/models/falcon.h +++ b/inference/models/falcon.h @@ -50,6 +50,26 @@ class FALCON { : model_config["num_hidden_layers"]; parallel_attn = model_config["parallel_attn"]; vocab_size = model_config["vocab_size"]; + rotary_embedding_meta.apply_rotary_embedding = true; + if (model_config.find("rope_theta") != model_config.end()) { + rotary_embedding_meta.rope_theta = model_config["rope_theta"]; + } else { + rotary_embedding_meta.rope_theta = 10000.0f; + } + if (model_config.find("scaling_factor") != model_config.end() && + !model_config["scaling_factor"].is_null()) { + rotary_embedding_meta.rope_type = + model_config["scaling_factor"]["rope_type"]; + rotary_embedding_meta.factor = + model_config["scaling_factor"]["factor"]; + rotary_embedding_meta.low_freq_factor = + model_config["scaling_factor"]["low_freq_factor"]; + rotary_embedding_meta.high_freq_factor = + model_config["scaling_factor"]["high_freq_factor"]; + rotary_embedding_meta.original_max_position_embeddings = + model_config["scaling_factor"] + ["original_max_position_embeddings"]; + } } catch (json::exception const &e) { std::cerr << "Error parsing JSON file: " << e.what() << std::endl; assert(false); @@ -59,8 +79,6 @@ class FALCON { << std::endl; assert(false); } - // max_seq_len = BatchConfig::MAX_SEQ_LENGTH; - // max_num_tokens = BatchConfig::MAX_NUM_TOKENS; max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; } @@ -76,9 +94,8 @@ class FALCON { std::cout << "\tn_layer: " << n_layer << std::endl; std::cout << "\tparallel_attn: " << parallel_attn << std::endl; std::cout << "\tvocab_size: " << vocab_size << std::endl; - - // std::cout << "\tmax_seq_len: " << max_seq_len << std::endl; - // std::cout << "\tmax_num_tokens: " << max_num_tokens << std::endl; + std::cout << "\trotary_embedding_meta: " << rotary_embedding_meta + << std::endl; std::cout << "\tmax_beam_width: " << max_beam_width << std::endl; std::cout << "\tmax_beam_depth: " << max_beam_depth << std::endl; } @@ -86,8 +103,8 @@ class FALCON { bool bias, multi_query, parallel_attn; int hidden_size, n_head, n_head_kv, n_layer, vocab_size; float layer_norm_epsilon; - // int max_seq_len, max_num_tokens; int max_beam_width, max_beam_depth; + RotaryEmbeddingMeta rotary_embedding_meta; }; static void create_falcon_model(FFModel &ff, diff --git a/inference/models/llama.cc b/inference/models/llama.cc index cf26194597..bd5243bd4b 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -91,28 +91,41 @@ void LLAMA::create_llama_model(FFModel &ff, token = token_att_norm[0]; att_norm = token_att_norm[1]; } + Tensor qkv_proj = ff.dense( + att_norm, + llama_config.hidden_size * + 3, // q, k, v. need to change if want to remove replication. + // (q_heads + 2 * kv_heads) * proj_size + AC_MODE_NONE, + false, // seems like llama does not use bias + DT_NONE, // what is this + nullptr, // ? + nullptr, // ? + nullptr, // ? + REG_MODE_NONE, // no regularization + 0.0f, // no dropout + std::string("layers." 
+ std::to_string(i) + ".self_attn.qkv_proj") + .c_str()); Tensor mha; switch (mode) { case BEAM_SEARCH_MODE: { mha = ff.spec_inc_multiquery_self_attention( - att_norm, + qkv_proj, llama_config.hidden_size, llama_config.num_attention_heads, llama_config.num_key_value_heads, llama_config.hidden_size / llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, 0.0f, /*dropout*/ - false, /*qkv_bias*/ - false, /*final_bias*/ false, /*add_zero_attn*/ DT_NONE, /*data_type*/ NULL, /*kernel_initializer*/ - true, /*apply_rotary_embedding*/ - false, /*scaling query*/ - 1.0f, /*scaling factor*/ - true, /*qk_prod_scaling*/ - false, /*position_bias*/ + llama_config.rotary_embedding_meta, + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); @@ -120,23 +133,21 @@ void LLAMA::create_llama_model(FFModel &ff, } case TREE_VERIFY_MODE: { mha = ff.inc_multiquery_self_attention_verify( - att_norm, + qkv_proj, llama_config.hidden_size, llama_config.num_attention_heads, llama_config.num_key_value_heads, llama_config.hidden_size / llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, 0.0f, /*dropout*/ - false, /*qkv_bias*/ - false, /*final_bias*/ false, /*add_zero_attn*/ DT_NONE, /*data_type*/ nullptr, /*kernel_initializer*/ - true, /*apply_rotary_embedding*/ - false, /*scaling query*/ - 1.0f, /*scaling factor*/ - true, /*qk_prod_scaling*/ - false, /*position_bias*/ + llama_config.rotary_embedding_meta, + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); @@ -144,23 +155,21 @@ void LLAMA::create_llama_model(FFModel &ff, } case INC_DECODING_MODE: { mha = ff.inc_multiquery_self_attention( - att_norm, + qkv_proj, llama_config.hidden_size, llama_config.num_attention_heads, llama_config.num_key_value_heads, llama_config.hidden_size / llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, 0.0f, /*dropout*/ - false, /*qkv_bias*/ - false, /*final_bias*/ false, /*add_zero_attn*/ DT_NONE, /*data_type*/ nullptr, /*kernel_initializer*/ - true, /*apply_rotary_embedding*/ - false, /*scaling query*/ - 1.0f, /*scaling factor*/ - true, /*qk_prod_scaling*/ - false, /*position_bias*/ + llama_config.rotary_embedding_meta, + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); @@ -171,6 +180,21 @@ void LLAMA::create_llama_model(FFModel &ff, } } + Tensor mha_input = mha; + mha = ff.dense( + mha_input, + llama_config.hidden_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." 
+ std::to_string(i) + ".self_attn.o_proj") + .c_str()); + // step 2: SILU activaion Tensor token_ff_norm[2] = {nullptr, nullptr}; ff.residual_rms_norm( diff --git a/inference/models/llama.h b/inference/models/llama.h index edb78f1300..853a51a999 100644 --- a/inference/models/llama.h +++ b/inference/models/llama.h @@ -44,6 +44,26 @@ class LLAMA { hidden_size = model_config["hidden_size"]; rms_norm_eps = model_config["rms_norm_eps"]; intermediate_size = model_config["intermediate_size"]; + rotary_embedding_meta.apply_rotary_embedding = true; + if (model_config.find("rope_theta") != model_config.end()) { + rotary_embedding_meta.rope_theta = model_config["rope_theta"]; + } else { + rotary_embedding_meta.rope_theta = 10000.0f; + } + if (model_config.find("scaling_factor") != model_config.end() && + !model_config["scaling_factor"].is_null()) { + rotary_embedding_meta.rope_type = + model_config["scaling_factor"]["rope_type"]; + rotary_embedding_meta.factor = + model_config["scaling_factor"]["factor"]; + rotary_embedding_meta.low_freq_factor = + model_config["scaling_factor"]["low_freq_factor"]; + rotary_embedding_meta.high_freq_factor = + model_config["scaling_factor"]["high_freq_factor"]; + rotary_embedding_meta.original_max_position_embeddings = + model_config["scaling_factor"] + ["original_max_position_embeddings"]; + } } catch (json::exception const &e) { std::cerr << "Error parsing LLAMA config from JSON file: " << e.what() << std::endl; @@ -54,8 +74,6 @@ class LLAMA { << std::endl; assert(false); } - // max_seq_len = BatchConfig::MAX_SEQ_LENGTH; - // max_num_tokens = BatchConfig::MAX_NUM_TOKENS; max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; } @@ -71,18 +89,17 @@ class LLAMA { std::cout << "\thidden_size: " << hidden_size << std::endl; std::cout << "\trms_norm_eps: " << rms_norm_eps << std::endl; std::cout << "\tintermediate_size: " << intermediate_size << std::endl; - - // std::cout << "\tmax_seq_len: " << max_seq_len << std::endl; - // std::cout << "\tmax_num_tokens: " << max_num_tokens << std::endl; + std::cout << "\trotary_embedding_meta: " << rotary_embedding_meta + << std::endl; std::cout << "\tmax_beam_width: " << max_beam_width << std::endl; std::cout << "\tmax_beam_depth: " << max_beam_depth << std::endl; } - // int max_seq_len, max_num_tokens; int max_beam_width, max_beam_depth; int num_hidden_layers, vocab_size, num_attention_heads, num_key_value_heads, hidden_size, intermediate_size; float rms_norm_eps; + RotaryEmbeddingMeta rotary_embedding_meta; }; static void create_llama_model(FFModel &ff, diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index e4a7e0056d..d02c0f3b82 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -93,22 +93,35 @@ void MPT::create_mpt_model(FFModel &ff, layernorm_output = res_ln_outputs[1]; } - Tensor attn_outputs; + Tensor qkv_proj = ff.dense( + layernorm_output, + mpt_config.hidden_size * + 3, // q, k, v. need to change if want to remove replication. + // (q_heads + 2 * kv_heads) * proj_size + AC_MODE_NONE, + false, // seems like it does not use bias + DT_NONE, // what is this + nullptr, // ? + nullptr, // ? + nullptr, // ? + REG_MODE_NONE, // no regularization + 0.0f, // no dropout + std::string("layers." 
+ std::to_string(i) + ".attn.qkv_proj").c_str()); + + Tensor o_proj; switch (mode) { case BEAM_SEARCH_MODE: { - attn_outputs = ff.spec_inc_multihead_self_attention( - layernorm_output, + o_proj = ff.spec_inc_multihead_self_attention( + qkv_proj, mpt_config.hidden_size, mpt_config.n_heads, mpt_config.hidden_size / mpt_config.n_heads, mpt_config.hidden_size / mpt_config.n_heads, 0.0f, false, - false, - false, DT_NONE, /*data_type*/ NULL, - false, + mpt_config.rotary_embedding_meta, /*scaling query*/ true, /*scaling factor*/ pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), @@ -120,19 +133,17 @@ void MPT::create_mpt_model(FFModel &ff, break; } case TREE_VERIFY_MODE: { - attn_outputs = ff.inc_multihead_self_attention_verify( - layernorm_output, + o_proj = ff.inc_multihead_self_attention_verify( + qkv_proj, mpt_config.hidden_size, mpt_config.n_heads, mpt_config.hidden_size / mpt_config.n_heads, mpt_config.hidden_size / mpt_config.n_heads, 0.0f, false, - false, - false, DT_NONE, /*data_type*/ NULL, - false, + mpt_config.rotary_embedding_meta, /*scaling query*/ true, /*scaling factor*/ pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), @@ -144,19 +155,17 @@ void MPT::create_mpt_model(FFModel &ff, break; } case INC_DECODING_MODE: { - attn_outputs = ff.inc_multihead_self_attention( - layernorm_output, + o_proj = ff.inc_multihead_self_attention( + qkv_proj, mpt_config.hidden_size, mpt_config.n_heads, mpt_config.hidden_size / mpt_config.n_heads, mpt_config.hidden_size / mpt_config.n_heads, 0.0f, false, - false, - false, DT_NONE, /*data_type*/ NULL, - false, + mpt_config.rotary_embedding_meta, /*scaling query*/ true, /*scaling factor*/ pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), @@ -172,6 +181,19 @@ void MPT::create_mpt_model(FFModel &ff, } } + Tensor attn_outputs = ff.dense( + o_proj, + mpt_config.hidden_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." + std::to_string(i) + ".attn.o_proj").c_str()); + ff.residual_layer_norm( attn_outputs, hidden_states, diff --git a/inference/models/mpt.h b/inference/models/mpt.h index 08597e1d75..3001420ad0 100644 --- a/inference/models/mpt.h +++ b/inference/models/mpt.h @@ -37,6 +37,7 @@ class MPT { n_heads = model_config["n_heads"]; n_layers = model_config["n_layers"]; vocab_size = model_config["vocab_size"]; + rotary_embedding_meta.apply_rotary_embedding = false; } catch (json::exception const &e) { std::cerr << "Error parsing JSON file: " << e.what() << std::endl; assert(false); @@ -63,6 +64,7 @@ class MPT { // int max_seq_len, max_num_tokens; int max_beam_width, max_beam_depth; int hidden_size, n_heads, n_layers, vocab_size; + RotaryEmbeddingMeta rotary_embedding_meta; }; static void create_mpt_model(FFModel &ff, diff --git a/inference/models/opt.cc b/inference/models/opt.cc index b3f2ef4e17..34a6bb0f02 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -101,23 +101,37 @@ void OPT::create_opt_model(FFModel &ff, Tensor residual = res_ln_outputs[0]; Tensor hidden_states = res_ln_outputs[1]; - Tensor mha; + Tensor qkv_proj = ff.dense( + hidden_states, + opt_config.hidden_size * + 3, // q, k, v. need to change if want to remove replication. + // (q_heads + 2 * kv_heads) * proj_size + AC_MODE_NONE, + true, // seems like it does not use bias + DT_NONE, // what is this + nullptr, // ? + nullptr, // ? + nullptr, // ? + REG_MODE_NONE, // no regularization + 0.0f, // no dropout + std::string("layers." 
+ std::to_string(i) + ".self_attn.qkv_proj") + .c_str()); + + Tensor o_proj; switch (mode) { case BEAM_SEARCH_MODE: { - mha = ff.spec_inc_multihead_self_attention( - hidden_states, + o_proj = ff.spec_inc_multihead_self_attention( + qkv_proj, opt_config.hidden_size, opt_config.num_attention_heads, opt_config.hidden_size / opt_config.num_attention_heads, opt_config.hidden_size / opt_config.num_attention_heads, 0.0f, /*dropout*/ - true, /*qkv_bias*/ - false, /*final_bias*/ false, /*add_zero_attn*/ DT_NONE, /*data_type*/ NULL, /*kernel_initializer*/ - false, /*apply_rotary_embedding*/ - true, /*scaling query*/ + opt_config.rotary_embedding_meta, + true, /*scaling query*/ pow((opt_config.hidden_size / opt_config.num_attention_heads), -0.5), /*scaling factor*/ false, /*qk_prod_scaling*/ @@ -128,20 +142,18 @@ void OPT::create_opt_model(FFModel &ff, break; } case TREE_VERIFY_MODE: { - mha = ff.inc_multihead_self_attention_verify( - hidden_states, + o_proj = ff.inc_multihead_self_attention_verify( + qkv_proj, opt_config.hidden_size, opt_config.num_attention_heads, opt_config.hidden_size / opt_config.num_attention_heads, opt_config.hidden_size / opt_config.num_attention_heads, 0.0f, /*dropout*/ - true, /*qkv_bias*/ - false, /*final_bias*/ false, /*add_zero_attn*/ DT_NONE, /*data_type*/ NULL, /*kernel_initializer*/ - false, /*apply_rotary_embedding*/ - true, /*scaling query*/ + opt_config.rotary_embedding_meta, + true, /*scaling query*/ pow((opt_config.hidden_size / opt_config.num_attention_heads), -0.5), /*scaling factor*/ false, /*qk_prod_scaling*/ @@ -152,20 +164,18 @@ void OPT::create_opt_model(FFModel &ff, break; } case INC_DECODING_MODE: { - mha = ff.inc_multihead_self_attention( - hidden_states, + o_proj = ff.inc_multihead_self_attention( + qkv_proj, opt_config.hidden_size, opt_config.num_attention_heads, opt_config.hidden_size / opt_config.num_attention_heads, opt_config.hidden_size / opt_config.num_attention_heads, 0.0f, /*dropout*/ - true, /*qkv_bias*/ - false, /*final_bias*/ false, /*add_zero_attn*/ DT_NONE, /*data_type*/ NULL, /*kernel_initializer*/ - false, /*apply_rotary_embedding*/ - true, /*scaling query*/ + opt_config.rotary_embedding_meta, + true, /*scaling query*/ pow((opt_config.hidden_size / opt_config.num_attention_heads), -0.5), /*scaling factor*/ false, /*qk_prod_scaling*/ @@ -180,6 +190,20 @@ void OPT::create_opt_model(FFModel &ff, } } + Tensor mha = ff.dense( + o_proj, + opt_config.hidden_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." 
+ std::to_string(i) + ".self_attn.o_proj") + .c_str()); + ff.add_bias_residual_layer_norm(mha, residual, res_ln_outputs, diff --git a/inference/models/opt.h b/inference/models/opt.h index 7c736a26d1..8b85f81aa6 100644 --- a/inference/models/opt.h +++ b/inference/models/opt.h @@ -45,6 +45,7 @@ class OPT { num_hidden_layers = model_config["num_hidden_layers"]; vocab_size = model_config["vocab_size"]; word_embed_proj_dim = model_config["word_embed_proj_dim"]; + rotary_embedding_meta.apply_rotary_embedding = false; } catch (json::exception const &e) { std::cerr << "Error parsing JSON file: " << e.what() << std::endl; assert(false); @@ -54,8 +55,6 @@ class OPT { << std::endl; assert(false); } - // max_seq_len = BatchConfig::MAX_SEQ_LENGTH; - // max_num_tokens = BatchConfig::MAX_NUM_TOKENS; max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; } @@ -78,9 +77,8 @@ class OPT { std::cout << "\tvocab_size: " << vocab_size << std::endl; std::cout << "\tword_embed_proj_dim: " << word_embed_proj_dim << std::endl; - - // std::cout << "\tmax_seq_len: " << max_seq_len << std::endl; - // std::cout << "\tmax_num_tokens: " << max_num_tokens << std::endl; + std::cout << "\trotary_embedding_meta: " << rotary_embedding_meta + << std::endl; std::cout << "\tmax_beam_width: " << max_beam_width << std::endl; std::cout << "\tmax_beam_depth: " << max_beam_depth << std::endl; } @@ -91,6 +89,7 @@ class OPT { float dropout; int ffn_dim, hidden_size, max_position_embeddings, num_attention_heads, num_hidden_layers, vocab_size, word_embed_proj_dim; + RotaryEmbeddingMeta rotary_embedding_meta; }; static void create_opt_model(FFModel &ff, diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index cd8bf3a9a7..2429b1ec1b 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -102,11 +102,28 @@ void STARCODER::create_starcoder_model( Tensor hidden_states = res_ln_outputs[0]; Tensor ln_1 = res_ln_outputs[1]; + Tensor qkv_proj = ff.dense( + ln_1, + startcoder_config.hidden_size * + 3, // q, k, v. need to change if want to remove replication. + // (q_heads + 2 * kv_heads) * proj_size + AC_MODE_NONE, + false, // seems like it does not use bias + DT_NONE, // what is this + nullptr, // ? + nullptr, // ? + nullptr, // ? + REG_MODE_NONE, // no regularization + 0.0f, // no dropout + std::string("layers." + std::to_string(i) + ".self_attention.qkv_proj") + .c_str()); + Tensor mha; + Tensor o_proj; switch (mode) { case INC_DECODING_MODE: { - mha = ff.inc_multiquery_self_attention( - ln_1, + o_proj = ff.inc_multiquery_self_attention( + qkv_proj, startcoder_config.hidden_size, startcoder_config.num_attention_heads, 1, @@ -114,17 +131,15 @@ void STARCODER::create_starcoder_model( startcoder_config.num_attention_heads, startcoder_config.hidden_size / startcoder_config.num_attention_heads, - startcoder_config.dropout_p, /*dropout*/ - true, /*bias*/ - false, /*add_bias_kv*/ - false, /*add_zero_attn*/ - DT_NONE, /*data_type*/ - nullptr, /*kernel_initializer*/ - false, /*apply_rotary_embedding*/ - false, /*scaling query*/ - 1.0f, /*scaling factor*/ - true, /*qk_prod_scaling*/ - false, /*position_bias*/ + startcoder_config.dropout_p, /*dropout*/ + false, /*add_zero_attn*/ + DT_NONE, /*data_type*/ + nullptr, /*kernel_initializer*/ + startcoder_config.rotary_embedding_meta, /*apply_rotary_embedding*/ + false, /*scaling query*/ + 1.0f, /*scaling factor*/ + true, /*qk_prod_scaling*/ + false, /*position_bias*/ std::string("layers." 
+ std::to_string(i) + ".attn.c_attn") .c_str() /*name*/ ); @@ -135,6 +150,20 @@ void STARCODER::create_starcoder_model( } } + mha = ff.dense( + o_proj, + startcoder_config.hidden_size, + AC_MODE_NONE, + true, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." + std::to_string(i) + ".self_attn.o_proj") + .c_str()); + ff.residual_layer_norm( hidden_states, mha, diff --git a/inference/models/starcoder.h b/inference/models/starcoder.h index 0e9577d569..7ff6f33770 100644 --- a/inference/models/starcoder.h +++ b/inference/models/starcoder.h @@ -41,6 +41,7 @@ class STARCODER { intermediate_size = model_config["n_inner"]; dropout_p = model_config["attn_pdrop"]; max_position_embeddings = model_config["n_positions"]; + rotary_embedding_meta.apply_rotary_embedding = false; } catch (json::exception const &e) { std::cerr << "Error parsing STARCODER config from JSON file: " << e.what() << std::endl; @@ -51,8 +52,6 @@ class STARCODER { << std::endl; assert(false); } - // max_seq_len = BatchConfig::MAX_SEQ_LENGTH; - // max_num_tokens = BatchConfig::MAX_NUM_TOKENS; max_beam_width = BeamSearchBatchConfig::MAX_BEAM_WIDTH; max_beam_depth = BeamSearchBatchConfig::MAX_BEAM_DEPTH; } @@ -64,6 +63,7 @@ class STARCODER { int num_hidden_layers, vocab_size, num_attention_heads, hidden_size, intermediate_size, max_position_embeddings; float layer_norm_epsilon, dropout_p; + RotaryEmbeddingMeta rotary_embedding_meta; }; static void create_starcoder_model(FFModel &ff, diff --git a/inference/python/incr_decoding.py b/inference/python/incr_decoding.py index f888982f2c..1df5a05a8f 100644 --- a/inference/python/incr_decoding.py +++ b/inference/python/incr_decoding.py @@ -111,9 +111,15 @@ def main(): if len(configs.prompt) > 0: prompts = [s for s in json.load(open(configs.prompt))] - results = llm.generate(prompts) + if "max_length" not in configs_dict: + results = llm.generate(prompts) + else: + results = llm.generate(prompts, max_length=configs.max_length) else: - result = llm.generate("Three tips for staying healthy are: ") + if "max_length" not in configs_dict: + result = llm.generate("Three tips for staying healthy are: ") + else: + result = llm.generate("Three tips for staying healthy are: ", max_length=configs.max_length) llm.stop_server() diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 7692ccb88f..a5aadc270e 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -41,6 +41,7 @@ from typing import Union, List from peft import LoraConfig import json +from dataclasses import dataclass def ffc(): @@ -2070,6 +2071,22 @@ def __init__( self.max_training_steps = max_training_steps +# ----------------------------------------------------------------------- +# RotaryEmbeddingMeta +# ----------------------------------------------------------------------- + + +@dataclass +class RotaryEmbeddingMeta: + apply_rotary_embedding: bool = False + rope_theta: float = 10000.0 + rope_type: str = "default" + factor: float = 8.0 + low_freq_factor: float = 1.0 + high_freq_factor: float = 4.0 + original_max_position_embeddings: int = 8192 + + # ----------------------------------------------------------------------- # FFModel # ----------------------------------------------------------------------- @@ -3509,12 +3526,10 @@ def inc_multihead_self_attention( kdim=0, vdim=0, dropout=0.0, - bias=True, - add_bias_kv=False, add_zero_attn=False, data_type=DataType.DT_NONE, kernel_initializer=None, - 
apply_rotary_embedding=False, + rotary_embedding_meta=RotaryEmbeddingMeta(), scaling_query=False, scaling_factor=1.0, qk_prod_scaling=True, @@ -3543,12 +3558,6 @@ def inc_multihead_self_attention( :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 :type dropout: float(0-1) - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool - - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. :type add_zero_attn: bool @@ -3558,8 +3567,8 @@ def inc_multihead_self_attention( :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. :type kernel_initializer: Initializer - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + :param rotary_embedding_meta: Metadata regarding the RoPE embedding, if used. + :type rotary_embedding_meta: RotaryEmbeddingMeta :param scaling_query: Whether to apply scaling query. Default is False. :type scaling_query: bool @@ -3589,12 +3598,16 @@ def inc_multihead_self_attention( kdim, vdim, dropout, - bias, - add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, - apply_rotary_embedding, + rotary_embedding_meta.apply_rotary_embedding, + rotary_embedding_meta.rope_theta, + get_c_name(rotary_embedding_meta.rope_type), + rotary_embedding_meta.factor, + rotary_embedding_meta.low_freq_factor, + rotary_embedding_meta.high_freq_factor, + rotary_embedding_meta.original_max_position_embeddings, scaling_query, scaling_factor, qk_prod_scaling, @@ -3612,12 +3625,10 @@ def spec_inc_multihead_self_attention( kdim=0, vdim=0, dropout=0.0, - bias=True, - add_bias_kv=False, add_zero_attn=False, data_type=DataType.DT_NONE, kernel_initializer=None, - apply_rotary_embedding=False, + rotary_embedding_meta=RotaryEmbeddingMeta(), scaling_query=False, scaling_factor=1.0, qk_prod_scaling=True, @@ -3646,12 +3657,6 @@ def spec_inc_multihead_self_attention( :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 :type dropout: float(0-1) - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool - - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. :type add_zero_attn: bool @@ -3661,8 +3666,8 @@ def spec_inc_multihead_self_attention( :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. :type kernel_initializer: Initializer - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + :param rotary_embedding_meta: Metadata regarding the RoPE embedding, if used. + :type rotary_embedding_meta: RotaryEmbeddingMeta :param scaling_query: Whether to apply scaling query. Default is False. 
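Taken together, the changes above replace the per-flag bias/add_bias_kv/apply_rotary_embedding arguments with a single RotaryEmbeddingMeta value and move the QKV and output projections out of the attention operator into explicit dense layers. A minimal usage sketch of the new Python API follows; it assumes an existing FFModel instance (ffmodel), an upstream layer-norm output (attn_norm), and illustrative sizes, and it assumes RotaryEmbeddingMeta and ActiMode are exported alongside the other core types. None of these names beyond the API calls themselves come from the patch.

    from flexflow.core import RotaryEmbeddingMeta, ActiMode

    # Illustrative values; a real model reads these from its HF config.
    hidden_size, num_heads = 4096, 32

    # Fused QKV projection feeding the attention op
    # (the projection weights now live in this dense layer, not in the attention op).
    qkv_proj = ffmodel.dense(
        attn_norm,                      # assumed upstream layer-norm output
        3 * hidden_size,
        ActiMode.AC_MODE_NONE,
        False,
        name="layers.0.self_attn.qkv_proj",
    )

    rope = RotaryEmbeddingMeta(
        apply_rotary_embedding=True,
        rope_theta=10000.0,
    )

    o_proj_in = ffmodel.inc_multihead_self_attention(
        qkv_proj,
        hidden_size,
        num_heads,
        rotary_embedding_meta=rope,     # replaces the old apply_rotary_embedding flag
        name="layers.0.self_attn",
    )

    # Output projection applied outside the attention op.
    attn_out = ffmodel.dense(
        o_proj_in,
        hidden_size,
        ActiMode.AC_MODE_NONE,
        False,
        name="layers.0.self_attn.o_proj",
    )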
:type scaling_query: bool @@ -3692,12 +3697,16 @@ def spec_inc_multihead_self_attention( kdim, vdim, dropout, - bias, - add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, - apply_rotary_embedding, + rotary_embedding_meta.apply_rotary_embedding, + rotary_embedding_meta.rope_theta, + get_c_name(rotary_embedding_meta.rope_type), + rotary_embedding_meta.factor, + rotary_embedding_meta.low_freq_factor, + rotary_embedding_meta.high_freq_factor, + rotary_embedding_meta.original_max_position_embeddings, scaling_query, scaling_factor, qk_prod_scaling, @@ -3715,12 +3724,10 @@ def inc_multihead_self_attention_verify( kdim=0, vdim=0, dropout=0.0, - bias=True, - add_bias_kv=False, add_zero_attn=False, data_type=DataType.DT_NONE, kernel_initializer=None, - apply_rotary_embedding=False, + rotary_embedding_meta=RotaryEmbeddingMeta(), scaling_query=False, scaling_factor=1.0, qk_prod_scaling=True, @@ -3749,12 +3756,6 @@ def inc_multihead_self_attention_verify( :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 :type dropout: float(0-1) - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool - - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. :type add_zero_attn: bool @@ -3764,8 +3765,8 @@ def inc_multihead_self_attention_verify( :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. :type kernel_initializer: Initializer - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + :param rotary_embedding_meta: Metadata regarding the RoPE embedding, if used. + :type rotary_embedding_meta: RotaryEmbeddingMeta :param scaling_query: Whether to apply scaling query. Default is False. :type scaling_query: bool @@ -3795,12 +3796,16 @@ def inc_multihead_self_attention_verify( kdim, vdim, dropout, - bias, - add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, - apply_rotary_embedding, + rotary_embedding_meta.apply_rotary_embedding, + rotary_embedding_meta.rope_theta, + get_c_name(rotary_embedding_meta.rope_type), + rotary_embedding_meta.factor, + rotary_embedding_meta.low_freq_factor, + rotary_embedding_meta.high_freq_factor, + rotary_embedding_meta.original_max_position_embeddings, scaling_query, scaling_factor, qk_prod_scaling, @@ -3819,12 +3824,10 @@ def inc_multiquery_self_attention( kdim=0, vdim=0, dropout=0.0, - bias=True, - add_bias_kv=False, add_zero_attn=False, data_type=DataType.DT_NONE, kernel_initializer=None, - apply_rotary_embedding=False, + rotary_embedding_meta=RotaryEmbeddingMeta(), scaling_query=False, scaling_factor=1.0, qk_prod_scaling=True, @@ -3856,12 +3859,6 @@ def inc_multiquery_self_attention( :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 :type dropout: float(0-1) - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool - - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. :type add_zero_attn: bool @@ -3871,8 +3868,8 @@ def inc_multiquery_self_attention( :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. 
:type kernel_initializer: Initializer - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + :param rotary_embedding_meta: Metadata regarding the RoPE embedding, if used. + :type rotary_embedding_meta: RotaryEmbeddingMeta :param scaling_query: Whether to apply scaling query. Default is False. :type scaling_query: bool @@ -3903,12 +3900,16 @@ def inc_multiquery_self_attention( kdim, vdim, dropout, - bias, - add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, - apply_rotary_embedding, + rotary_embedding_meta.apply_rotary_embedding, + rotary_embedding_meta.rope_theta, + get_c_name(rotary_embedding_meta.rope_type), + rotary_embedding_meta.factor, + rotary_embedding_meta.low_freq_factor, + rotary_embedding_meta.high_freq_factor, + rotary_embedding_meta.original_max_position_embeddings, scaling_query, scaling_factor, qk_prod_scaling, @@ -3927,12 +3928,10 @@ def spec_inc_multiquery_self_attention( kdim=0, vdim=0, dropout=0.0, - bias=True, - add_bias_kv=False, add_zero_attn=False, data_type=DataType.DT_NONE, kernel_initializer=None, - apply_rotary_embedding=False, + rotary_embedding_meta=RotaryEmbeddingMeta(), scaling_query=False, scaling_factor=1.0, qk_prod_scaling=True, @@ -3964,12 +3963,6 @@ def spec_inc_multiquery_self_attention( :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 :type dropout: float(0-1) - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool - - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. :type add_zero_attn: bool @@ -3979,8 +3972,8 @@ def spec_inc_multiquery_self_attention( :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. :type kernel_initializer: Initializer - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + :param rotary_embedding_meta: Metadata regarding the RoPE embedding, if used. + :type rotary_embedding_meta: RotaryEmbeddingMeta :param scaling_query: Whether to apply scaling query. Default is False. :type scaling_query: bool @@ -4011,12 +4004,16 @@ def spec_inc_multiquery_self_attention( kdim, vdim, dropout, - bias, - add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, - apply_rotary_embedding, + rotary_embedding_meta.apply_rotary_embedding, + rotary_embedding_meta.rope_theta, + get_c_name(rotary_embedding_meta.rope_type), + rotary_embedding_meta.factor, + rotary_embedding_meta.low_freq_factor, + rotary_embedding_meta.high_freq_factor, + rotary_embedding_meta.original_max_position_embeddings, scaling_query, scaling_factor, qk_prod_scaling, @@ -4035,12 +4032,10 @@ def inc_multiquery_self_attention_verify( kdim=0, vdim=0, dropout=0.0, - bias=True, - add_bias_kv=False, add_zero_attn=False, data_type=DataType.DT_NONE, kernel_initializer=None, - apply_rotary_embedding=False, + rotary_embedding_meta=RotaryEmbeddingMeta(), scaling_query=False, scaling_factor=1.0, qk_prod_scaling=True, @@ -4072,12 +4067,6 @@ def inc_multiquery_self_attention_verify( :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 :type dropout: float(0-1) - :param bias: Whether the dense layers use bias vectors. Default is True. 
- :type bias: bool - - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. :type add_zero_attn: bool @@ -4087,8 +4076,8 @@ def inc_multiquery_self_attention_verify( :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. :type kernel_initializer: Initializer - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + :param rotary_embedding_meta: Metadata regarding the RoPE embedding, if used. + :type rotary_embedding_meta: RotaryEmbeddingMeta :param scaling_query: Whether to apply scaling query. Default is False. :type scaling_query: bool @@ -4119,12 +4108,16 @@ def inc_multiquery_self_attention_verify( kdim, vdim, dropout, - bias, - add_bias_kv, add_zero_attn, c_data_type, kernel_init_handle, - apply_rotary_embedding, + rotary_embedding_meta.apply_rotary_embedding, + rotary_embedding_meta.rope_theta, + get_c_name(rotary_embedding_meta.rope_type), + rotary_embedding_meta.factor, + rotary_embedding_meta.low_freq_factor, + rotary_embedding_meta.high_freq_factor, + rotary_embedding_meta.original_max_position_embeddings, scaling_query, scaling_factor, qk_prod_scaling, diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index 0e8fbcbd7d..0c6102406f 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -41,6 +41,17 @@ def __init__(self, hf_config): ) self.parallel_attn = hf_config.parallel_attn self.vocab_size = hf_config.vocab_size + self.rotary_embedding_meta = RotaryEmbeddingMeta( + apply_rotary_embedding=True, + rope_theta=hf_config.rope_theta if "rope_theta" in hf_config.__dict__ else 10000.0, + ) + if "rope_scaling" in hf_config.__dict__: + if hf_config.rope_scaling is not None: + self.rotary_embedding_meta.rope_type = hf_config.rope_scaling["rope_type"] + self.rotary_embedding_meta.factor = hf_config.rope_scaling["factor"] + self.rotary_embedding_meta.low_freq_factor = hf_config.rope_scaling["low_freq_factor"] + self.rotary_embedding_meta.high_freq_factor = hf_config.rope_scaling["high_freq_factor"] + self.rotary_embedding_meta.original_max_position_embeddings = hf_config.rope_scaling["original_max_position_embeddings"] # Standardized FlexFlow num heads fields below self.num_attention_heads = self.n_head self.num_key_value_heads = self.n_head_kv @@ -54,8 +65,6 @@ def __init__( ffconfig, hf_config, data_type, - # max_batch_size=1, - # max_seq_length=256, max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", @@ -63,11 +72,8 @@ def __init__( self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - # self.max_batch_size = max_batch_size self.data_type = data_type self.falcon_config = FalconConfig(hf_config) - # self.falcon_config.max_seq_length = max_seq_length - # self.falcon_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 @@ -138,60 +144,70 @@ def build_model(self, max_tokens_per_batch): name=f"layers.{i}.input_layernorm", ) + qkv_proj = ffmodel.dense( + att_norm, + 3 * self.falcon_config.hidden_size, + ActiMode.AC_MODE_NONE, + False, + name=f"layers.{i}.self_attention.qkv_proj", + ) + if self.mode == InferenceMode.BEAM_SEARCH_MODE: - mha = 
ffmodel.spec_inc_multiquery_self_attention( - att_norm, + o_proj = ffmodel.spec_inc_multiquery_self_attention( + qkv_proj, self.falcon_config.hidden_size, self.falcon_config.n_head, self.falcon_config.n_head_kv, self.falcon_config.hidden_size // self.falcon_config.n_head, self.falcon_config.hidden_size // self.falcon_config.n_head, 0.0, # dropout - False, # qkv_bias - False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - True, # apply_rotary_embedding + self.falcon_config.rotary_embedding_meta, name=f"layers.{i}.self_attention", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: - mha = ffmodel.inc_multiquery_self_attention_verify( - att_norm, + o_proj = ffmodel.inc_multiquery_self_attention_verify( + qkv_proj, self.falcon_config.hidden_size, self.falcon_config.n_head, self.falcon_config.n_head_kv, self.falcon_config.hidden_size // self.falcon_config.n_head, self.falcon_config.hidden_size // self.falcon_config.n_head, 0.0, # dropout - False, # qkv_bias - False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - True, # apply_rotary_embedding + self.falcon_config.rotary_embedding_meta, name=f"layers.{i}.self_attention", ) elif self.mode == InferenceMode.INC_DECODING_MODE: - mha = ffmodel.inc_multiquery_self_attention( - att_norm, + o_proj = ffmodel.inc_multiquery_self_attention( + qkv_proj, self.falcon_config.hidden_size, self.falcon_config.n_head, self.falcon_config.n_head_kv, self.falcon_config.hidden_size // self.falcon_config.n_head, self.falcon_config.hidden_size // self.falcon_config.n_head, 0.0, # dropout - False, # qkv_bias - False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - True, # apply_rotary_embedding + self.falcon_config.rotary_embedding_meta, name=f"layers.{i}.self_attention", ) else: assert False + mha = ffmodel.dense( + o_proj, + self.falcon_config.hidden_size, + ActiMode.AC_MODE_NONE, + False, + name=f"layers.{i}.self_attention.o_proj" + ) + dense_h_to_4h = ffmodel.dense( att_norm, self.falcon_config.hidden_size * 4, diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index 96f0258572..e149834603 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -19,8 +19,6 @@ class LLAMAConfig: def __init__(self, hf_config): - # self.max_seq_len = 256 - # self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 self.max_spec_tree_token_num = 20 @@ -29,6 +27,17 @@ def __init__(self, hf_config): self.hidden_size = hf_config.hidden_size self.rms_norm_eps = hf_config.rms_norm_eps self.intermediate_size = hf_config.intermediate_size + self.rotary_embedding_meta = RotaryEmbeddingMeta( + apply_rotary_embedding=True, + rope_theta=hf_config.rope_theta if "rope_theta" in hf_config.__dict__ else 10000.0, + ) + if "rope_scaling" in hf_config.__dict__: + if hf_config.rope_scaling is not None: + self.rotary_embedding_meta.rope_type = hf_config.rope_scaling["rope_type"] + self.rotary_embedding_meta.factor = hf_config.rope_scaling["factor"] + self.rotary_embedding_meta.low_freq_factor = hf_config.rope_scaling["low_freq_factor"] + self.rotary_embedding_meta.high_freq_factor = hf_config.rope_scaling["high_freq_factor"] + self.rotary_embedding_meta.original_max_position_embeddings = hf_config.rope_scaling["original_max_position_embeddings"] # Standardized FlexFlow num heads fields below self.num_attention_heads = hf_config.num_attention_heads 
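The same rope_theta/rope_scaling parsing recurs in each serve model config (falcon.py above, llama.py here). A compact equivalent is sketched below, using getattr in place of the `in hf_config.__dict__` checks; the helper name `_rotary_meta_from_hf` is hypothetical and not part of FlexFlow, and RotaryEmbeddingMeta is assumed to be in scope as in these modules. The patch itself keeps the inline form.

    def _rotary_meta_from_hf(hf_config):
        # Mirrors the inline logic above: default theta 10000.0,
        # plus the optional HF rope_scaling dict when present.
        meta = RotaryEmbeddingMeta(
            apply_rotary_embedding=True,
            rope_theta=getattr(hf_config, "rope_theta", 10000.0),
        )
        scaling = getattr(hf_config, "rope_scaling", None)
        if scaling is not None:
            meta.rope_type = scaling["rope_type"]
            meta.factor = scaling["factor"]
            meta.low_freq_factor = scaling["low_freq_factor"]
            meta.high_freq_factor = scaling["high_freq_factor"]
            meta.original_max_position_embeddings = scaling[
                "original_max_position_embeddings"
            ]
        return meta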
self.num_key_value_heads = ( @@ -55,11 +64,8 @@ def __init__( self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - # self.max_batch_size = max_batch_size self.data_type = data_type self.llama_config = LLAMAConfig(hf_config) - # self.llama_config.max_seq_length = max_seq_length - # self.llama_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2 ** 31 - 1 @@ -128,9 +134,17 @@ def build_model(self, max_tokens_per_batch): name=f"layers.{i}.input_layernorm", ) + qkv_proj = ffmodel.dense( + attn_norm, + 3 * self.llama_config.hidden_size, + ActiMode.AC_MODE_NONE, + False, + name=f"layers.{i}.self_attn.qkv_proj", + ) + if self.mode == InferenceMode.BEAM_SEARCH_MODE: mha = ffmodel.spec_inc_multiquery_self_attention( - attn_norm, + qkv_proj, self.llama_config.hidden_size, self.llama_config.num_attention_heads, self.llama_config.num_key_value_heads, @@ -139,17 +153,15 @@ def build_model(self, max_tokens_per_batch): self.llama_config.hidden_size // self.llama_config.num_attention_heads, 0.0, # dropout - False, # qkv_bias - False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - True, # apply_rotary_embedding + self.llama_config.rotary_embedding_meta, name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: mha = ffmodel.inc_multiquery_self_attention_verify( - attn_norm, + qkv_proj, self.llama_config.hidden_size, self.llama_config.num_attention_heads, self.llama_config.num_key_value_heads, @@ -158,17 +170,15 @@ def build_model(self, max_tokens_per_batch): self.llama_config.hidden_size // self.llama_config.num_attention_heads, 0.0, # dropout - False, # qkv_bias - False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - True, # apply_rotary_embedding + self.llama_config.rotary_embedding_meta, name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.INC_DECODING_MODE: mha = ffmodel.inc_multiquery_self_attention( - attn_norm, + qkv_proj, self.llama_config.hidden_size, self.llama_config.num_attention_heads, self.llama_config.num_key_value_heads, @@ -177,20 +187,26 @@ def build_model(self, max_tokens_per_batch): self.llama_config.hidden_size // self.llama_config.num_attention_heads, 0.0, # dropout - False, # qkv_bias - False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - True, # apply_rotary_embedding + self.llama_config.rotary_embedding_meta, name=f"layers.{i}.self_attn", ) else: assert False + o_proj = ffmodel.dense( + mha, + self.llama_config.hidden_size, + ActiMode.AC_MODE_NONE, + False, + name=f"layers.{i}.self_attn.o_proj" + ) + token, ff_norm = ffmodel.residual_rms_norm( token, - mha, + o_proj, self.llama_config.rms_norm_eps, self.llama_config.hidden_size, name=f"layers.{i}.post_attention_layernorm", @@ -259,3 +275,7 @@ def convert_hf_model(model, dst_folder): for name, params in model.named_parameters(): name = FlexFlowLLAMA.convert_hf_weight_name(name) params.detach().cpu().numpy().tofile(f"{dst_folder}/{name}") + # LM head weight + model.lm_head.weight.detach().cpu().numpy().tofile( + os.path.join(dst_folder, "lm_head.weight") + ) diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py index b350ae106d..a0e70b381a 100644 --- a/python/flexflow/serve/models/mpt.py +++ b/python/flexflow/serve/models/mpt.py @@ -19,8 +19,6 @@ class MPTConfig: def __init__(self, hf_config): - # 
self.max_seq_len = 256 - # self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 self.max_spec_tree_token_num = 20 @@ -28,6 +26,7 @@ def __init__(self, hf_config): self.n_heads = hf_config.n_heads self.n_layers = hf_config.n_layers self.vocab_size = hf_config.vocab_size + self.rotary_embedding_meta = RotaryEmbeddingMeta(apply_rotary_embedding=False) # Standardized FlexFlow num heads fields below self.num_attention_heads = hf_config.n_heads self.num_key_value_heads = hf_config.n_heads @@ -50,11 +49,8 @@ def __init__( self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - # self.max_batch_size = max_batch_size self.data_type = data_type self.mpt_config = MPTConfig(hf_config) - # self.mpt_config.max_seq_length = max_seq_length - # self.mpt_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 @@ -129,20 +125,26 @@ def build_model(self, max_tokens_per_batch): name=f"layers.{i}.norm_1", ) + qkv_proj = ffmodel.dense( + layernorm_output, + 3 * self.mpt_config.hidden_size, + ActiMode.AC_MODE_NONE, + False, + name=f"layers.{i}.attn.qkv_proj", + ) + if self.mode == InferenceMode.BEAM_SEARCH_MODE: - attn_outputs = ffmodel.spec_inc_multihead_self_attention( - layernorm_output, + o_proj = ffmodel.spec_inc_multihead_self_attention( + qkv_proj, self.mpt_config.hidden_size, self.mpt_config.n_heads, self.mpt_config.hidden_size // self.mpt_config.n_heads, self.mpt_config.hidden_size // self.mpt_config.n_heads, 0.0, # dropout - False, # qkv_bias - False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - False, # apply_rotary_embedding + self.mpt_config.rotary_embedding_meta, True, # scaling_query (self.mpt_config.hidden_size / self.mpt_config.n_heads) ** (-0.5), # scaling_factor @@ -151,19 +153,17 @@ def build_model(self, max_tokens_per_batch): name=f"layers.{i}.attn", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: - attn_outputs = ffmodel.inc_multihead_self_attention_verify( - layernorm_output, + o_proj = ffmodel.inc_multihead_self_attention_verify( + qkv_proj, self.mpt_config.hidden_size, self.mpt_config.n_heads, self.mpt_config.hidden_size // self.mpt_config.n_heads, self.mpt_config.hidden_size // self.mpt_config.n_heads, 0.0, # dropout - False, # qkv_bias - False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - False, # apply_rotary_embedding + self.mpt_config.rotary_embedding_meta, True, # scaling_query (self.mpt_config.hidden_size / self.mpt_config.n_heads) ** (-0.5), # scaling_factor @@ -172,19 +172,17 @@ def build_model(self, max_tokens_per_batch): name=f"layers.{i}.attn", ) elif self.mode == InferenceMode.INC_DECODING_MODE: - attn_outputs = ffmodel.inc_multihead_self_attention( - layernorm_output, + o_proj = ffmodel.inc_multihead_self_attention( + qkv_proj, self.mpt_config.hidden_size, self.mpt_config.n_heads, self.mpt_config.hidden_size // self.mpt_config.n_heads, self.mpt_config.hidden_size // self.mpt_config.n_heads, 0.0, # dropout - False, # qkv_bias - False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - False, # apply_rotary_embedding + self.mpt_config.rotary_embedding_meta, True, # scaling_query (self.mpt_config.hidden_size / self.mpt_config.n_heads) ** (-0.5), # scaling_factor @@ -195,6 +193,14 @@ def build_model(self, max_tokens_per_batch): else: assert False + attn_outputs = ffmodel.dense( + 
o_proj, + self.mpt_config.hidden_size, + ActiMode.AC_MODE_NONE, + False, + name=f"layers.{i}.attn.o_proj" + ) + hidden_states, layernorm_output = ffmodel.residual_layer_norm( attn_outputs, hidden_states, diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index 02668abf59..ba2e21b690 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -34,6 +34,7 @@ def __init__(self, hf_config): self.num_hidden_layers = hf_config.num_hidden_layers self.vocab_size = hf_config.vocab_size self.word_embed_proj_dim = hf_config.word_embed_proj_dim + self.rotary_embedding_meta = RotaryEmbeddingMeta(apply_rotary_embedding=False) # Standardized FlexFlow num heads fields below self.num_attention_heads = hf_config.num_attention_heads self.num_key_value_heads = hf_config.num_attention_heads @@ -47,8 +48,6 @@ def __init__( ffconfig, hf_config, data_type, - # max_batch_size=1, - # max_seq_length=256, max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", @@ -56,11 +55,8 @@ def __init__( self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - # self.max_batch_size = max_batch_size self.data_type = data_type self.opt_config = OPTConfig(hf_config) - # self.opt_config.max_seq_length = max_seq_length - # self.opt_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 @@ -145,20 +141,26 @@ def build_model(self, max_tokens_per_batch): hidden_states = ffmodel.add(token, positional_embedding) residual = hidden_states + qkv_proj = ffmodel.dense( + hidden_states, + 3 * self.opt_config.hidden_size, + ActiMode.AC_MODE_NONE, + True, + name=f"layers.{i}.self_attn.qkv_proj", + ) + if self.mode == InferenceMode.BEAM_SEARCH_MODE: - mha = ffmodel.spec_inc_multihead_self_attention( - hidden_states, + o_proj = ffmodel.spec_inc_multihead_self_attention( + qkv_proj, self.opt_config.hidden_size, self.opt_config.num_attention_heads, self.opt_config.hidden_size // self.opt_config.num_attention_heads, self.opt_config.hidden_size // self.opt_config.num_attention_heads, 0.0, # dropout - True, # qkv_bias - False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - False, # apply_rotary_embedding + self.opt_config.rotary_embedding_meta, True, # scaling_query (self.opt_config.hidden_size / self.opt_config.num_attention_heads) ** (-0.5), # scaling_factor @@ -166,19 +168,17 @@ def build_model(self, max_tokens_per_batch): name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: - mha = ffmodel.inc_multihead_self_attention_verify( - hidden_states, + o_proj = ffmodel.inc_multihead_self_attention_verify( + qkv_proj, self.opt_config.hidden_size, self.opt_config.num_attention_heads, self.opt_config.hidden_size // self.opt_config.num_attention_heads, self.opt_config.hidden_size // self.opt_config.num_attention_heads, 0.0, # dropout - True, # qkv_bias - False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - False, # apply_rotary_embedding + self.opt_config.rotary_embedding_meta, True, # scaling_query (self.opt_config.hidden_size / self.opt_config.num_attention_heads) ** (-0.5), # scaling_factor @@ -186,19 +186,17 @@ def build_model(self, max_tokens_per_batch): name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.INC_DECODING_MODE: - mha = ffmodel.inc_multihead_self_attention( - hidden_states, + o_proj = 
ffmodel.inc_multihead_self_attention( + qkv_proj, self.opt_config.hidden_size, self.opt_config.num_attention_heads, self.opt_config.hidden_size // self.opt_config.num_attention_heads, self.opt_config.hidden_size // self.opt_config.num_attention_heads, 0.0, # dropout - True, # qkv_bias - False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - False, # apply_rotary_embedding + self.opt_config.rotary_embedding_meta, True, # scaling_query (self.opt_config.hidden_size / self.opt_config.num_attention_heads) ** (-0.5), # scaling_factor @@ -208,6 +206,13 @@ def build_model(self, max_tokens_per_batch): else: assert False + mha = ffmodel.dense( + o_proj, + self.opt_config.hidden_size, + ActiMode.AC_MODE_NONE, + False, + name=f"layers.{i}.self_attn.o_proj" + ) # This is either a before or after attention LayerNorm. In both cases, we need to compute the LN here. residual, ff_norm = ffmodel.add_bias_residual_layer_norm( mha, diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py index 2d4471201f..dc5faf175f 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -19,8 +19,6 @@ class STARCODERConfig: def __init__(self, hf_config): - # self.max_seq_len = 256 - # self.max_num_tokens = 64 self.max_beam_width = 1 self.max_beam_depth = 8 self.max_spec_tree_token_num = 20 @@ -32,6 +30,7 @@ def __init__(self, hf_config): self.vocab_size = hf_config.vocab_size self.intermediate_size = hf_config.n_inner self.n_head_kv = 1 if hf_config.multi_query else hf_config.n_head + self.rotary_embedding_meta = RotaryEmbeddingMeta(apply_rotary_embedding=False) # Standardized FlexFlow num heads fields below self.num_attention_heads = hf_config.n_head self.num_key_value_heads = self.n_head_kv @@ -45,8 +44,6 @@ def __init__( ffconfig, hf_config, data_type, - # max_batch_size=1, - # max_seq_length=256, max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", @@ -54,11 +51,8 @@ def __init__( self.mode = mode self.generation_config = generation_config self.ffconfig = ffconfig - # self.max_batch_size = max_batch_size self.data_type = data_type self.starcoder_config = STARCODERConfig(hf_config) - # self.starcoder_config.max_seq_length = max_seq_length - # self.starcoder_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 @@ -142,9 +136,17 @@ def build_model(self, max_tokens_per_batch): name=f"layers.{i}.ln_1", ) - assert self.mode == InferenceMode.INC_DECODING_MODE - mha = ffmodel.inc_multiquery_self_attention( + qkv_proj = ffmodel.dense( ln_1, + 3 * self.starcoder_config.hidden_size, + ActiMode.AC_MODE_NONE, + True, + name=f"layers.{i}.self_attn.qkv_proj", + ) + + assert self.mode == InferenceMode.INC_DECODING_MODE + o_proj = ffmodel.inc_multiquery_self_attention( + qkv_proj, self.starcoder_config.hidden_size, self.starcoder_config.num_attention_heads, self.starcoder_config.n_head_kv, @@ -153,15 +155,21 @@ def build_model(self, max_tokens_per_batch): self.starcoder_config.hidden_size // self.starcoder_config.num_attention_heads, 0.0, # dropout - True, # qkv_bias - False, # final_bias False, # add_zero_attn DataType.DT_NONE, # data_type None, # kernel initializer - False, # apply_rotary_embedding + self.starcoder_config.rotary_embedding_meta, name=f"layers.{i}.attn.c_attn", ) + mha = ffmodel.dense( + o_proj, + self.starcoder_config.hidden_size, + ActiMode.AC_MODE_NONE, + False, + 
name=f"layers.{i}.self_attn.o_proj" + ) + residual, l2_norm = ffmodel.residual_layer_norm( hidden_states, mha, diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 532dd00198..c6cf656ac0 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1209,12 +1209,16 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention( int kdim, int vdim, float dropout, - bool bias, - bool add_bias_kv, bool add_zero_attn, enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, @@ -1224,18 +1228,23 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention( Tensor input = FFCObjectWrapper::unwrap(input_); Initializer *kernel_initializer = FFCObjectWrapper::unwrap(kernel_initializer_); + RotaryEmbeddingMeta rotary_embedding_meta(apply_rotary_embedding, + rope_theta, + rope_type, + rope_factor, + low_freq_factor, + high_freq_factor, + original_max_position_embeddings); Tensor tensor = handle->inc_multihead_self_attention(input, embed_dim, num_heads, kdim, vdim, dropout, - bias, - add_bias_kv, add_zero_attn, data_type, kernel_initializer, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, @@ -1252,12 +1261,16 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention( int kdim, int vdim, float dropout, - bool bias, - bool add_bias_kv, bool add_zero_attn, enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, @@ -1267,6 +1280,13 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention( Tensor input = FFCObjectWrapper::unwrap(input_); Initializer *kernel_initializer = FFCObjectWrapper::unwrap(kernel_initializer_); + RotaryEmbeddingMeta rotary_embedding_meta(apply_rotary_embedding, + rope_theta, + rope_type, + rope_factor, + low_freq_factor, + high_freq_factor, + original_max_position_embeddings); Tensor tensor = handle->spec_inc_multihead_self_attention(input, embed_dim, @@ -1274,12 +1294,10 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention( kdim, vdim, dropout, - bias, - add_bias_kv, add_zero_attn, data_type, kernel_initializer, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, @@ -1296,12 +1314,16 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( int kdim, int vdim, float dropout, - bool bias, - bool add_bias_kv, bool add_zero_attn, enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, @@ -1311,6 +1333,13 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( Tensor input = FFCObjectWrapper::unwrap(input_); Initializer *kernel_initializer = FFCObjectWrapper::unwrap(kernel_initializer_); + RotaryEmbeddingMeta rotary_embedding_meta(apply_rotary_embedding, + rope_theta, + rope_type, + rope_factor, + 
low_freq_factor, + high_freq_factor, + original_max_position_embeddings); Tensor tensor = handle->inc_multihead_self_attention_verify(input, embed_dim, @@ -1318,12 +1347,10 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( kdim, vdim, dropout, - bias, - add_bias_kv, add_zero_attn, data_type, kernel_initializer, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, @@ -1341,12 +1368,16 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention( int kdim, int vdim, float dropout, - bool bias, - bool add_bias_kv, bool add_zero_attn, enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, @@ -1356,6 +1387,13 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention( Tensor input = FFCObjectWrapper::unwrap(input_); Initializer *kernel_initializer = FFCObjectWrapper::unwrap(kernel_initializer_); + RotaryEmbeddingMeta rotary_embedding_meta(apply_rotary_embedding, + rope_theta, + rope_type, + rope_factor, + low_freq_factor, + high_freq_factor, + original_max_position_embeddings); Tensor tensor = handle->inc_multiquery_self_attention(input, embed_dim, num_q_heads, @@ -1363,12 +1401,10 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention( kdim, vdim, dropout, - bias, - add_bias_kv, add_zero_attn, data_type, kernel_initializer, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, @@ -1386,12 +1422,16 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention( int kdim, int vdim, float dropout, - bool bias, - bool add_bias_kv, bool add_zero_attn, enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, @@ -1401,6 +1441,13 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention( Tensor input = FFCObjectWrapper::unwrap(input_); Initializer *kernel_initializer = FFCObjectWrapper::unwrap(kernel_initializer_); + RotaryEmbeddingMeta rotary_embedding_meta(apply_rotary_embedding, + rope_theta, + rope_type, + rope_factor, + low_freq_factor, + high_freq_factor, + original_max_position_embeddings); Tensor tensor = handle->spec_inc_multiquery_self_attention(input, embed_dim, @@ -1409,12 +1456,10 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention( kdim, vdim, dropout, - bias, - add_bias_kv, add_zero_attn, data_type, kernel_initializer, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, @@ -1432,12 +1477,16 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify( int kdim, int vdim, float dropout, - bool bias, - bool add_bias_kv, bool add_zero_attn, enum DataType data_type, flexflow_initializer_t kernel_initializer_, bool apply_rotary_embedding, + float rope_theta, + char const *rope_type, + float rope_factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, bool scaling_query, float scaling_factor, bool qk_prod_scaling, @@ -1447,6 +1496,13 @@ flexflow_tensor_t 
flexflow_model_add_inc_multiquery_self_attention_verify( Tensor input = FFCObjectWrapper::unwrap(input_); Initializer *kernel_initializer = FFCObjectWrapper::unwrap(kernel_initializer_); + RotaryEmbeddingMeta rotary_embedding_meta(apply_rotary_embedding, + rope_theta, + rope_type, + rope_factor, + low_freq_factor, + high_freq_factor, + original_max_position_embeddings); Tensor tensor = handle->inc_multiquery_self_attention_verify(input, embed_dim, @@ -1455,12 +1511,10 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify( kdim, vdim, dropout, - bias, - add_bias_kv, add_zero_attn, data_type, kernel_initializer, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index 7a1da2e974..7bfbe31aad 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -670,8 +670,18 @@ void AddBiasResidualLayerNorm::inference_task( AddBiasResidualLayerNormMeta *m = *((AddBiasResidualLayerNormMeta **)task->local_args); - assert(regions.size() == - 4 + (m->elementwise_affine ? (m->use_bias ? 2 : 1) : 0)); + int expected_regions = + 5; // input, attn_bias, residual (input), added_output, output + if (m->inplace_residual) { + expected_regions--; // input == added_output + } + if (m->elementwise_affine) { + expected_regions += 1; // gamma + if (m->use_bias) { + expected_regions += 1; // beta + } + } + assert(regions.size() == expected_regions); int rid = 0, tid = 0, did = 0; GenericTensorAccessorR input = diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index 9f826cd611..2cede662f3 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -439,21 +439,13 @@ __host__ void assert(fused->op_num_outputs[op] == 1); IncMultiHeadSelfAttentionMeta *m = (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } + assert(fused->op_num_weights[op] == 0); IncMultiHeadSelfAttention::inference_kernel_wrapper( m, bc, task->index_point.point_data[0], my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + my_output_accessor[0]); break; } case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { @@ -463,21 +455,13 @@ __host__ void (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op]; TreeVerifyBatchConfig const &tree_bc = Future(task->futures[0]).get_result(); - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } + assert(fused->op_num_weights[op] == 0); TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( m, &tree_bc, task->index_point.point_data[0], my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + my_output_accessor[0]); break; } case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { @@ -489,21 +473,13 @@ __host__ void // (BeamSearchBatchConfig *)task->args; BeamSearchBatchConfig const &beam_bc = Future(task->futures[0]).get_result(); - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } + 
assert(fused->op_num_weights[op] == 0); SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( m, &beam_bc, task->index_point.point_data[0], my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + my_output_accessor[0]); break; } case OP_LAYERNORM: { @@ -1025,21 +1001,13 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, assert(fused->op_num_outputs[op] == 1); IncMultiHeadSelfAttentionMeta *m = (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } + assert(fused->op_num_weights[op] == 0); IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( m, bc, task->index_point.point_data[0], my_input_grad_accessor[0], - my_weight_accessor[0], - my_output_grad_accessor[0], - biases); + my_output_grad_accessor[0]); break; } case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 8f1212beb4..5aed2cd69a 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -448,73 +448,49 @@ __host__ void case OP_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); + assert(fused->op_num_weights[op] == 0); IncMultiHeadSelfAttentionMeta *m = (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } IncMultiHeadSelfAttention::inference_kernel_wrapper( m, bc, task->index_point.point_data[0], my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + my_output_accessor[0]); break; } case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); + assert(fused->op_num_weights[op] == 0); TreeIncMultiHeadSelfAttentionMeta *m = (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op]; TreeVerifyBatchConfig const &tree_bc = Future(task->futures[0]).get_result(); - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( m, &tree_bc, task->index_point.point_data[0], my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + my_output_accessor[0]); break; } case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); + assert(fused->op_num_weights[op] == 0); SpecIncMultiHeadSelfAttentionMeta const *m = (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op]; // BeamSearchBatchConfig const *beam_bc = // (BeamSearchBatchConfig *)task->args; BeamSearchBatchConfig const &beam_bc = Future(task->futures[0]).get_result(); - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( m, &beam_bc, task->index_point.point_data[0], my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + 
my_output_accessor[0]); break; } case OP_LAYERNORM: { @@ -666,12 +642,7 @@ __host__ void assert(false && "Fusion currently does not support type"); } } - if (metas->meta[op]->inference_debugging && - !(fused->op_op_type[op] == OP_ALLREDUCE || - fused->op_op_type[op] == OP_PARALLEL_IDENTITY || - fused->op_op_type[op] == OP_REPLICATE || - fused->op_op_type[op] == OP_REPARTITION || - fused->op_op_type[op] == OP_COMBINE)) { + if (metas->meta[op]->inference_debugging) { std::vector input_accessors_to_save; std::vector weight_accessors_to_save; std::vector output_accessors_to_save; @@ -1048,21 +1019,15 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, assert(fused->op_num_outputs[op] == 1); IncMultiHeadSelfAttentionMeta *m = (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); + assert(fused->op_num_weights[op] == 0); GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( m, bc, task->index_point.point_data[0], my_input_grad_accessor[0], - my_weight_accessor[0], - my_output_grad_accessor[0], - biases); + my_output_grad_accessor[0]); + // biases); break; } case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 8219cf9e1f..8dbce00ebc 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -54,23 +54,22 @@ bool IncMultiHeadSelfAttentionParams::is_valid( return is_valid; } -Tensor FFModel::inc_multihead_self_attention(const Tensor input, - int embed_dim, - int num_heads, - int kdim, - int vdim, - float dropout, - bool qkv_bias, - bool final_bias, - bool add_zero_attn, - DataType data_type, - Initializer *kernel_initializer, - bool apply_rotary_embedding, - bool scaling_query, - float scaling_factor, - bool qk_prod_scaling, - bool position_bias, - char const *name) { +Tensor FFModel::inc_multihead_self_attention( + const Tensor input, + int embed_dim, + int num_heads, + int kdim, + int vdim, + float dropout, + bool add_zero_attn, + DataType data_type, + Initializer *kernel_initializer, + RotaryEmbeddingMeta rotary_embedding_meta, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, + bool position_bias, + char const *name) { return inc_multiquery_self_attention(input, embed_dim, num_heads, @@ -78,12 +77,10 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input, kdim, vdim, dropout, - qkv_bias, - final_bias, add_zero_attn, data_type, kernel_initializer, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, @@ -91,31 +88,29 @@ Tensor FFModel::inc_multihead_self_attention(const Tensor input, name); } -Tensor FFModel::inc_multiquery_self_attention(const Tensor input, - int embed_dim, - int num_q_heads, - int num_kv_heads, - int kdim, - int vdim, - float dropout, - bool qkv_bias, - bool final_bias, - bool add_zero_attn, - DataType data_type, - Initializer *kernel_initializer, - bool apply_rotary_embedding, - bool scaling_query, - float scaling_factor, - bool qk_prod_scaling, - bool position_bias, - char const *name) { +Tensor FFModel::inc_multiquery_self_attention( + const Tensor input, + int embed_dim, + int num_q_heads, + int num_kv_heads, + int kdim, + int vdim, + float dropout, + bool add_zero_attn, + DataType data_type, + Initializer *kernel_initializer, + 
RotaryEmbeddingMeta rotary_embedding_meta, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, + bool position_bias, + char const *name) { if (data_type == DT_NONE) { data_type = input->data_type; } DataType quantization_type = cpu_offload ? config.quantization_type : DT_NONE; bool offload = cpu_offload; Layer *li = nullptr; - int weight_num = (qkv_bias || final_bias) ? 2 : 1; if (data_type != input->data_type) { Tensor casted_input = cast(input, data_type, "type cast for IncMHA"); li = new Layer(this, @@ -123,7 +118,7 @@ Tensor FFModel::inc_multiquery_self_attention(const Tensor input, data_type, name, 1 /*inputs*/, - weight_num /*weights*/, + 0, 1 /*outputs*/, casted_input); } else { @@ -132,7 +127,7 @@ Tensor FFModel::inc_multiquery_self_attention(const Tensor input, data_type, name, 1 /*inputs*/, - weight_num /*weights*/, + 0, 1 /*outputs*/, input); } @@ -142,65 +137,30 @@ Tensor FFModel::inc_multiquery_self_attention(const Tensor input, for (int i = 0; i < numdims; i++) { dims[i] = input->dims[i]; } - dims[0] = embed_dim; + dims[0] = vdim * num_q_heads; // we now output o_proj_dim * o_heads li->outputs[0] = create_tensor_legion_ordering( numdims, dims, data_type, li, 0, true /*create_grad*/); } - // Compute weight size - int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim, - oProjSize = embed_dim; - int qSize = input->dims[0], kSize = input->dims[0], vSize = input->dims[0]; - int qParas = qProjSize * qSize; - int kParas = kProjSize * kSize; - int vParas = vProjSize * vSize; - int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize); - // allocate num_q_heads for key, value for replication - int weight_size = qParas * num_q_heads + kParas * num_q_heads + - vParas * num_q_heads + oParas * num_q_heads; - int one_head_size = qParas + kParas + vParas + oParas; - - { - // compress the weight size if quantization. - if (quantization_type != DT_NONE) { - one_head_size = get_quantization_to_byte_size( - data_type, quantization_type, one_head_size); - } - int dims[1] = {weight_size}; - li->weights[0] = create_weight_legion_ordering( - 1, - dims, - quantization_type == DT_NONE ? data_type : quantization_type, - li, - true /*create_grad*/, - kernel_initializer, - CHOSEN_SYNC_TYPE); - } - if (qkv_bias || final_bias) { - // q, k, v, o - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; - int dims[1] = {(qkv_bias ? qkv_bias_size : 0) + - (final_bias ? 
oProjSize : 0)}; - li->weights[1] = create_weight_legion_ordering(1, - dims, - data_type, - li, - true /*create_grad*/, - kernel_initializer, - CHOSEN_SYNC_TYPE); - } li->data_type = data_type; li->add_int_property("embed_dim", embed_dim); li->add_int_property("num_q_heads", num_q_heads); li->add_int_property("num_kv_heads", num_kv_heads); li->add_int_property("kdim", kdim); li->add_int_property("vdim", vdim); - li->add_int_property("qkv_bias", qkv_bias); - li->add_int_property("final_bias", final_bias); li->add_int_property("add_zero_attn", add_zero_attn); li->add_float_property("dropout", dropout); - li->add_int_property("apply_rotary_embedding", apply_rotary_embedding); + li->add_int_property("apply_rotary_embedding", + rotary_embedding_meta.apply_rotary_embedding); + li->add_float_property("rope_theta", rotary_embedding_meta.rope_theta); + li->add_string_property("rope_type", rotary_embedding_meta.rope_type); + li->add_float_property("factor", rotary_embedding_meta.factor); + li->add_float_property("low_freq_factor", + rotary_embedding_meta.low_freq_factor); + li->add_float_property("high_freq_factor", + rotary_embedding_meta.high_freq_factor); + li->add_int_property("original_max_position_embeddings", + rotary_embedding_meta.original_max_position_embeddings); li->add_int_property("scaling_query", scaling_query); li->add_float_property("scaling_factor", scaling_factor); li->add_int_property("qk_prod_scaling", qk_prod_scaling); @@ -231,14 +191,20 @@ Op *IncMultiHeadSelfAttention::create_operator_from_layer( int vdim = value; float dropout; layer->get_float_property("dropout", dropout); - layer->get_int_property("qkv_bias", value); - bool qkv_bias = (bool)value; - layer->get_int_property("final_bias", value); - bool final_bias = (bool)value; layer->get_int_property("add_zero_attn", value); bool add_zero_attn = (bool)value; + RotaryEmbeddingMeta rotary_embedding_meta; layer->get_int_property("apply_rotary_embedding", value); - bool apply_rotary_embedding = (bool)value; + rotary_embedding_meta.apply_rotary_embedding = (bool)value; + layer->get_float_property("rope_theta", rotary_embedding_meta.rope_theta); + layer->get_string_property("rope_type", rotary_embedding_meta.rope_type); + layer->get_float_property("factor", rotary_embedding_meta.factor); + layer->get_float_property("low_freq_factor", + rotary_embedding_meta.low_freq_factor); + layer->get_float_property("high_freq_factor", + rotary_embedding_meta.high_freq_factor); + layer->get_int_property("original_max_position_embeddings", value); + rotary_embedding_meta.original_max_position_embeddings = (int)value; layer->get_int_property("scaling_query", value); bool scaling_query = (bool)value; float scaling_factor; @@ -264,15 +230,12 @@ Op *IncMultiHeadSelfAttention::create_operator_from_layer( kdim, vdim, dropout, - qkv_bias, - final_bias, add_zero_attn, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, position_bias, - false /*allocate_weights*/, quantization_type, offload, tensor_parallelism_degree, @@ -289,15 +252,12 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( int _kdim, int _vdim, float _dropout, - bool _qkv_bias, - bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, bool _position_bias, - bool allocate_weights, DataType _quantization_type, bool _offload, int _tensor_parallelism_degree, @@ -308,13 +268,12 @@ 
IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( _input->data_type, name, 1 /*inputs*/, - (_qkv_bias || _final_bias ? 2 : 1), /*weights*/ + 0, 1 /*outputs*/, _input), num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), - qkv_bias(_qkv_bias), final_bias(_final_bias), add_zero_attn(_add_zero_attn), - apply_rotary_embedding(_apply_rotary_embedding), + rotary_embedding_meta(_rotary_embedding_meta), qSize(_input->dims[0].size), kSize(_input->dims[0].size), vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), vProjSize(_vdim), oProjSize(_embed_dim), @@ -334,86 +293,29 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( x *= _input->dims[i].size; } dims[0].size = _embed_dim; - // Currently require no parallelism along this dim - assert(dims[0].degree == 1); - if (allocate_weights) { - // Create weight tensor - int num_dims = inputs[0]->num_dims; - // Compute weight size - int qParas = this->qProjSize * this->qSize; - int kParas = this->kProjSize * this->kSize; - int vParas = this->vProjSize * this->vSize; - int oParas = - this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize); - ParallelDim dims[2]; - dims[0] = inputs[0]->dims[num_dims - 2]; - dims[0].size = dims[0].degree; - dims[1] = inputs[0]->dims[num_dims - 1]; - dims[1].size = this->num_q_heads * (qParas + oParas) + - this->num_q_heads * (kParas + vParas); - dims[1].is_replica_dim = false; - - if (quantization_type != DT_NONE) { - dims[1].size = get_quantization_to_byte_size( - data_type, quantization_type, (qParas + kParas + vParas + oParas)); - } - int seed = std::rand(); - Initializer *initializer = new GlorotUniform(seed); - weights[0] = model.create_parallel_weight<2>( - dims, - quantization_type == DT_NONE ? this->data_type : quantization_type, - nullptr /*owner_op*/, - model.config.computationMode == COMP_MODE_INFERENCE - ? false - : true /*create_grad*/, - initializer, - CHOSEN_SYNC_TYPE); - if (qkv_bias || final_bias) { - ParallelTensorShape bias_shape = _input->get_shape(); - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; - bias_shape.dims[0].size = - (qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
oProjSize : 0); - bias_shape.dims[1].size = bias_shape.dims[2].size = 1; - weights[1] = - model.create_parallel_weight_legion_ordering(bias_shape.num_dims, - bias_shape.dims, - this->data_type, - nullptr /*owner_op*/, - true /*create_grad*/, - initializer, - CHOSEN_SYNC_TYPE); - } - } + // Removed restriction that no parallelism along this dim + // assert(dims[0].degree == 1); outputs[0] = model.create_parallel_tensor_legion_ordering( _input->num_dims, dims, this->data_type, this); - /* for (int i = 0; i < numdim; i++) { */ - /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ - /* } */ - /* // Check correctness */ /* assert(check_output_input_weight_parallel_dims()); */ } IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( FFModel &model, const ParallelTensor _input, - const ParallelTensor _weight, int _embed_dim, int _num_q_heads, int _num_kv_heads, int _kdim, int _vdim, float _dropout, - bool _qkv_bias, - bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, bool _position_bias, - bool allocate_weights, DataType _quantization_type, bool _offload, int _tensor_parallelism_degree, @@ -424,14 +326,12 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( _input->data_type, name, 1 /*inputs*/, - (_qkv_bias || _final_bias ? 2 : 1), /*weights*/ + 0, 1 /*outputs*/, - _input, - _weight), + _input), num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), - qkv_bias(_qkv_bias), final_bias(_final_bias), add_zero_attn(_add_zero_attn), - apply_rotary_embedding(_apply_rotary_embedding), + rotary_embedding_meta(_rotary_embedding_meta), qSize(_input->dims[0].size), kSize(_input->dims[0].size), vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), vProjSize(_vdim), oProjSize(_embed_dim), @@ -439,9 +339,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( scaling_query(_scaling_query), scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias), quantization_type(_quantization_type), offload(_offload), - tensor_parallelism_degree(_tensor_parallelism_degree) -// bias_initializer(_bias_initializer) -{ + tensor_parallelism_degree(_tensor_parallelism_degree) { numOutputs = 1; int numdim = _input->num_dims; ParallelDim dims[MAX_TENSOR_DIM]; @@ -451,63 +349,10 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( dims[0].size = _embed_dim; // Currently require no parallelism along this dim assert(dims[0].degree == 1); - if (allocate_weights) { - // Create weight tensor - int num_dims = inputs[0]->num_dims; - // Compute weight size - int qParas = this->qProjSize * this->qSize; - int kParas = this->kProjSize * this->kSize; - int vParas = this->vProjSize * this->vSize; - int oParas = - this->oProjSize * (this->vProjSize > 0 ? 
this->vProjSize : this->vSize); - ParallelDim dims[2]; - dims[0] = inputs[0]->dims[num_dims - 2]; - dims[0].size = dims[0].degree; - dims[1] = inputs[0]->dims[num_dims - 1]; - dims[1].size = this->num_q_heads * (qParas + oParas) + - this->num_q_heads * (kParas + vParas); - dims[1].is_replica_dim = false; - // dims[2].size = this->num_q_heads * (qParas + oParas) + this->num_kv_heads - // * (kParas + vParas); - if (quantization_type != DT_NONE) { - dims[1].size = get_quantization_to_byte_size( - data_type, quantization_type, (qParas + kParas + vParas + oParas)); - } - int seed = std::rand(); - Initializer *initializer = new GlorotUniform(seed); - weights[0] = model.create_parallel_weight<2>( - dims, - quantization_type == DT_NONE ? this->data_type : quantization_type, - NULL /*owner_op*/, - true /*create_grad*/, - initializer, - CHOSEN_SYNC_TYPE); - if (qkv_bias || final_bias) { - ParallelTensorShape bias_shape = _input->get_shape(); - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; - bias_shape.dims[0].size = - (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0); - bias_shape.dims[1].size = bias_shape.dims[2].size = 1; - weights[1] = - model.create_parallel_weight_legion_ordering(bias_shape.num_dims, - bias_shape.dims, - this->data_type, - nullptr /*owner_op*/, - true /*create_grad*/, - initializer, - CHOSEN_SYNC_TYPE); - } - } outputs[0] = model.create_parallel_tensor_legion_ordering( _input->num_dims, dims, this->data_type, this); - /* for (int i = 0; i < numdim; i++) { */ - /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ - /* } */ - /* register_output_weight_parallel_dims(outputs[0], numdim-1, _weight, 1); */ - /* register_output_weight_parallel_dims(outputs[0], numdim-2, _weight, 2); */ // Check correctness /* assert(check_output_input_weight_parallel_dims()); */ } @@ -515,8 +360,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( FFModel &model, IncMultiHeadSelfAttention const &other, - const ParallelTensor input, - bool allocate_weights) + const ParallelTensor input) : IncMultiHeadSelfAttention(model, other.layer_guid, input, @@ -526,15 +370,12 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( other.qProjSize, other.vProjSize, other.dropout, - other.qkv_bias, - other.final_bias, other.add_zero_attn, - other.apply_rotary_embedding, + other.rotary_embedding_meta, other.scaling_query, other.scaling_factor, other.qk_prod_scaling, other.position_bias, - allocate_weights, other.quantization_type, other.offload, other.tensor_parallelism_degree, @@ -544,7 +385,6 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( FFModel &model, IncMultiHeadSelfAttentionParams const ¶ms, ParallelTensor const &input, - bool allocate_weights, char const *name) : IncMultiHeadSelfAttention(model, params.layer_guid, @@ -555,15 +395,12 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( params.kdim, params.vdim, params.dropout, - params.qkv_bias, - params.final_bias, params.add_zero_attn, - params.apply_rotary_embedding, + params.rotary_embedding_meta, params.scaling_query, params.scaling_factor, params.qk_prod_scaling, params.position_bias, - allocate_weights, params.quantization_type, params.offload, params.tensor_parallelism_degree, @@ -596,20 +433,12 @@ void IncMultiHeadSelfAttention::init_inference( EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(0, FID_DATA); - launcher.add_region_requirement( - RegionRequirement(weights[0]->part, - 0 /*projection 
id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region, - ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); - launcher.add_field(1, FID_DATA); launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[0]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(1, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); @@ -636,18 +465,12 @@ void IncMultiHeadSelfAttention::init(FFModel const &ff) { EXCLUSIVE, inputs[0]->region)); launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); - launcher.add_field(1, FID_DATA); launcher.add_region_requirement(RegionRequirement(outputs[0]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, outputs[0]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(1, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap(ff, fm); @@ -655,8 +478,7 @@ void IncMultiHeadSelfAttention::init(FFModel const &ff) { /* regions[0](I): input - regions[1](I): weight - regions[2](O): output + regions[1](O): output */ OpMeta *IncMultiHeadSelfAttention::init_task( Task const *task, @@ -675,17 +497,10 @@ OpMeta *IncMultiHeadSelfAttention::init_task( FID_DATA, ctx, runtime); - GenericTensorAccessorR weight = - helperGetGenericTensorAccessorRO(attn->weights[0]->data_type, - regions[1], - task->regions[1], - FID_DATA, - ctx, - runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(attn->outputs[0]->data_type, - regions[2], - task->regions[2], + regions[1], + task->regions[1], FID_DATA, ctx, runtime); @@ -698,8 +513,6 @@ OpMeta *IncMultiHeadSelfAttention::init_task( attn->num_kv_heads / attn->tensor_parallelism_degree + (attn->num_kv_heads % attn->tensor_parallelism_degree != 0); - assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1); - Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); if (attn->offload) { @@ -708,14 +521,8 @@ OpMeta *IncMultiHeadSelfAttention::init_task( gpu_mem_allocator.register_reserved_work_space( handle.offload_reserve_space, handle.offload_reserve_space_size); } - IncMultiHeadSelfAttentionMeta *m = - new IncMultiHeadSelfAttentionMeta(handle, - attn, - weight, - gpu_mem_allocator, - num_samples, - num_q_heads, - num_kv_heads); + IncMultiHeadSelfAttentionMeta *m = new IncMultiHeadSelfAttentionMeta( + handle, attn, gpu_mem_allocator, num_samples, num_q_heads, num_kv_heads); if (handle.offload_reserve_space == nullptr) { // assert that we didn't over allocate memory assert(gpu_mem_allocator.reserved_allocated_size == @@ -725,10 +532,6 @@ OpMeta *IncMultiHeadSelfAttention::init_task( m->inference_debugging = attn->inference_debugging; std::strcpy(m->op_name, attn->name); m->layer_guid = attn->layer_guid; - if (attn->quantization_type == DT_NONE) { - assert(weight.domain.get_volume() * data_type_size(weight.data_type) == - m->weightSize); - } return m; } @@ -770,14 +573,6 @@ FutureMap IncMultiHeadSelfAttention::inference( EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(idx++, FID_DATA); - launcher.add_region_requirement( - RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region, - ff.cpu_offload ? 
MAP_TO_ZC_MEMORY : 0)); - launcher.add_field(idx++, FID_DATA); launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, 0 /*projection id*/, WRITE_ONLY, @@ -785,23 +580,12 @@ FutureMap IncMultiHeadSelfAttention::inference( batch_outputs[0]->region)); launcher.add_field(idx++, FID_DATA); - if (qkv_bias || final_bias) { - launcher.add_region_requirement( - RegionRequirement(weights[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[1]->region, - ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); - launcher.add_field(idx++, FID_DATA); - } return runtime->execute_index_space(ctx, launcher); } /* regions[0](I): input - regions[3](I): weight - regions[4](O): output + regions[1](O): output */ void IncMultiHeadSelfAttention::inference_task( Task const *task, @@ -822,54 +606,31 @@ void IncMultiHeadSelfAttention::inference_task( IncMultiHeadSelfAttentionMeta *m = *((IncMultiHeadSelfAttentionMeta **)task->local_args); - assert(((*m->qkv_bias || *m->final_bias) ? regions.size() == 4 - : regions.size() == 3)); + assert(regions.size() == 2); // input and output GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - biases = helperGetGenericTensorAccessorRO(m->weight_type[1], - regions[3], - task->regions[3], - FID_DATA, - ctx, - runtime); - Domain bias_domain = runtime->get_index_space_domain( - ctx, task->regions[3].region.get_index_space()); - assert(bias_domain.get_dim() == 4); - } + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - Domain weight_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); Domain output_domain = runtime->get_index_space_domain( - ctx, task->regions[2].region.get_index_space()); + ctx, task->regions[1].region.get_index_space()); assert(input_domain.get_dim() == 4); - assert(weight_domain.get_dim() == 2); assert(output_domain.get_dim() == 4); assert(task->index_point.get_dim() == 1); IncMultiHeadSelfAttention::inference_kernel_wrapper( - m, bc, task->index_point.point_data[0], input, weight, output, biases); + m, bc, task->index_point.point_data[0], input, output); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; - std::vector weights_accessors; - weights_accessors.push_back(weight); - if (*m->qkv_bias || *m->final_bias) { - weights_accessors.push_back(biases); - } IncMultiHeadSelfAttention::save_inference_tensors_to_file( - m, shard_id, bc, {input}, weights_accessors, {output}); + m, shard_id, bc, {input}, {}, {output}); } } @@ -903,14 +664,6 @@ FutureMap IncMultiHeadSelfAttention::peft_bwd( EXCLUSIVE, batch_inputs[0]->region_grad)); launcher.add_field(idx++, FID_DATA); - launcher.add_region_requirement( - RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region, - ff.cpu_offload ? 
MAP_TO_ZC_MEMORY : 0)); - launcher.add_field(idx++, FID_DATA); launcher.add_region_requirement( RegionRequirement(batch_outputs[0]->part_grad, 0 /*projection id*/, @@ -918,23 +671,12 @@ FutureMap IncMultiHeadSelfAttention::peft_bwd( EXCLUSIVE, batch_outputs[0]->region_grad)); launcher.add_field(idx++, FID_DATA); - if (qkv_bias || final_bias) { - launcher.add_region_requirement( - RegionRequirement(weights[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[1]->region, - ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); - launcher.add_field(idx++, FID_DATA); - } return runtime->execute_index_space(ctx, launcher); } /* regions[0](I): input - regions[3](I): weight - regions[4](O): output + regions[1](O): output */ void IncMultiHeadSelfAttention::peft_bwd_task( Task const *task, @@ -954,55 +696,31 @@ void IncMultiHeadSelfAttention::peft_bwd_task( IncMultiHeadSelfAttentionMeta *m = *((IncMultiHeadSelfAttentionMeta **)task->local_args); - assert(((*m->qkv_bias || *m->final_bias) ? regions.size() == 4 - : regions.size() == 3)); + assert(regions.size() == 2); // input grad, output grad GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorW output_grad = helperGetGenericTensorAccessorRW( - m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - biases = helperGetGenericTensorAccessorRO(m->weight_type[1], - regions[3], - task->regions[3], - FID_DATA, - ctx, - runtime); - Domain bias_domain = runtime->get_index_space_domain( - ctx, task->regions[3].region.get_index_space()); - assert(bias_domain.get_dim() == 4); - } + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); Domain input_grad_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - Domain weight_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); Domain output_grad_domain = runtime->get_index_space_domain( - ctx, task->regions[2].region.get_index_space()); + ctx, task->regions[1].region.get_index_space()); assert(input_grad_domain.get_dim() == 4); - assert(weight_domain.get_dim() == 2); assert(output_grad_domain.get_dim() == 4); assert(task->index_point.get_dim() == 1); IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( - m, - bc, - task->index_point.point_data[0], - input_grad, - weight, - output_grad, - biases); + m, bc, task->index_point.point_data[0], input_grad, output_grad); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; IncMultiHeadSelfAttention::save_inference_tensors_to_file( - m, shard_id, bc, {input_grad}, {weight}, {output_grad}, false); + m, shard_id, bc, {input_grad}, {}, {output_grad}, false); } } @@ -1032,9 +750,20 @@ bool operator==(IncMultiHeadSelfAttentionParams const &lhs, return lhs.layer_guid == rhs.layer_guid && lhs.embed_dim == rhs.embed_dim && lhs.num_q_heads == rhs.num_q_heads && lhs.kdim == rhs.kdim && lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout && - lhs.qkv_bias == rhs.qkv_bias && lhs.final_bias == rhs.final_bias && lhs.add_zero_attn == rhs.add_zero_attn && - lhs.apply_rotary_embedding == rhs.apply_rotary_embedding && + lhs.rotary_embedding_meta.apply_rotary_embedding == + 
rhs.rotary_embedding_meta.apply_rotary_embedding && + lhs.rotary_embedding_meta.rope_theta == + rhs.rotary_embedding_meta.rope_theta && + lhs.rotary_embedding_meta.rope_type == + rhs.rotary_embedding_meta.rope_type && + lhs.rotary_embedding_meta.factor == rhs.rotary_embedding_meta.factor && + lhs.rotary_embedding_meta.low_freq_factor == + rhs.rotary_embedding_meta.low_freq_factor && + lhs.rotary_embedding_meta.high_freq_factor == + rhs.rotary_embedding_meta.high_freq_factor && + lhs.rotary_embedding_meta.original_max_position_embeddings == + rhs.rotary_embedding_meta.original_max_position_embeddings && lhs.scaling_query == rhs.scaling_query && lhs.scaling_factor == rhs.scaling_factor && lhs.qk_prod_scaling == rhs.qk_prod_scaling && @@ -1049,10 +778,8 @@ IncMultiHeadSelfAttentionParams IncMultiHeadSelfAttention::get_params() const { params.kdim = this->kProjSize; params.vdim = this->vProjSize; params.dropout = this->dropout; - params.qkv_bias = this->qkv_bias; - params.final_bias = this->final_bias; params.add_zero_attn = this->add_zero_attn; - params.apply_rotary_embedding = this->apply_rotary_embedding; + params.rotary_embedding_meta = this->rotary_embedding_meta; params.scaling_query = this->scaling_query; params.scaling_factor = this->scaling_factor; params.qk_prod_scaling = this->qk_prod_scaling; @@ -1081,10 +808,15 @@ size_t hash::operator()( hash_combine(key, params.kdim); hash_combine(key, params.vdim); hash_combine(key, params.dropout); - hash_combine(key, params.qkv_bias); - hash_combine(key, params.final_bias); hash_combine(key, params.add_zero_attn); - hash_combine(key, params.apply_rotary_embedding); + hash_combine(key, params.rotary_embedding_meta.apply_rotary_embedding); + hash_combine(key, params.rotary_embedding_meta.rope_theta); + hash_combine(key, params.rotary_embedding_meta.rope_type); + hash_combine(key, params.rotary_embedding_meta.factor); + hash_combine(key, params.rotary_embedding_meta.low_freq_factor); + hash_combine(key, params.rotary_embedding_meta.high_freq_factor); + hash_combine(key, + params.rotary_embedding_meta.original_max_position_embeddings); hash_combine(key, params.scaling_query); hash_combine(key, params.scaling_factor); hash_combine(key, params.qk_prod_scaling); diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index 826fea4347..a4604a11a2 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -19,6 +19,7 @@ #include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" #include "flexflow/utils/hip_helper.h" #include "hip/hip_complex.h" +#include #include namespace FlexFlow { @@ -52,6 +53,339 @@ __device__ __forceinline__ T #endif } +template +__global__ void store_kv_cache(DT const *devQKVProjArray, + DT *kCache_ptr, + DT *vCache_ptr, + BatchConfig::PerTokenInfo const *tokenInfos, + int num_tokens, + int max_seq_len, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / hidden_size; + int offset = i % hidden_size; + + size_t val_idx = + token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; + + DT kVal = devQKVProjArray[val_idx]; + DT vVal = devQKVProjArray[val_idx + hidden_size]; + int const req_id = tokenInfos[token_idx].request_index; + int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + + // key cache + kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + offset] = kVal; + vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + offset] = 
vVal; + } +} + +template +__global__ void store_query_cache(DT const *devQKVProjArray, + DT *qCache_ptr, + int num_tokens, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / hidden_size; + int offset = i % hidden_size; + + size_t val_idx = token_idx * QKV_WEIGHT_NUM * hidden_size + offset; + + DT qVal = devQKVProjArray[val_idx]; + + // query cache + qCache_ptr[i] = qVal; + } +} + +template +__global__ void fill_entries_above_diagonal(DT *matrix, + size_t num_rows, + size_t num_cols, + size_t num_q_heads, + size_t entries_above_diagonal, + DT value) { + CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_q_heads) { + size_t head_idx = i / entries_above_diagonal; + size_t entry_idx = i % entries_above_diagonal; + size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; + size_t x = entry_idx - y * (y + 1) / 2; + y += (num_cols - num_rows) + 1; + matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; + } +} + +template +void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + int shard_id, + hipStream_t stream) { + checkCUDA(hipblasSetStream(m->handle.blas, stream)); + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + hipblasDatatype_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + miopenDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); + assert(data_type_size(m->output_type[0]) == sizeof(DT)); + hipblasDatatype_t compute_type = cublas_data_type; + + int num_tokens = bc->num_active_tokens(); + int tokens_previous_requests = 0; + int q_block_size = m->qProjSize; + int kt_block_size = m->kProjSize; + int kt_req_block_size = + kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int vt_block_size = m->vProjSize; + int vt_req_block_size = + vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + assert(m->qProjSize == m->kProjSize); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i] || + (!bc->requestsInfo[i].prompt_phase && !bc->requestsInfo[i].peft_bwd)) { + continue; + } + int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + + bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + // Copy query to m->query_activation_buffer if we need to compute + // PEFT backward + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + sizeof(DT) * max_peft_tokens * m->num_q_heads * m->qProjSize; + if (activation_size_needed > m->allocated_peft_buffer_size1) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->query_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size1 = activation_size_needed; + } + int parallelism = m->hidden_size * num_tokens; + hipLaunchKernelGGL(HIP_KERNEL_NAME(store_query_cache), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + static_cast
<DT *>
(m->devQKVProjArray), + static_cast
<DT *>
(m->query_activation_buffer), + num_tokens, + m->hidden_size); + } + // Step 1: compute query-key product QK.T/sqrt(d_k) + { + // Scale by sqrt(d_k) as per the original attention paper + DT alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = static_cast
<DT>
(1.0f / sqrt(m->kProjSize)); + } + // after transpositions + int m_ = num_new_tokens; + int n = total_tokens; + int k = m->qProjSize; + // before transpositions + int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, + ldc = m_; + // N.B. strides are applied before transpose operations + int strideA = q_block_size; + int strideB = kt_block_size; + int strideC = num_new_tokens * total_tokens; + + // matrix A: devQKVProjArray + // matrix A's layout: [qProjSize, num_heads, 3, num_new_tokens] + // To get query projection, skip over Q entries from previous requests + DT const *A = static_cast
<DT *>
(m->devQKVProjArray) + + bc->requestsInfo[i].first_token_offset_in_batch * + m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; + // matrix B: key cache + // matrix B's layout: [kProjSize * num_heads, total_tokens] + // To get B, skip over K entries from previous requests (all heads + + // padding) + DT const *B = static_cast
<DT *>
(m->keyCache) + i * kt_req_block_size; + // matrix C: qk_prods + // matrix C's layout: [num_new_tokens, total_tokens, num_heads] + // To get C, skip over QK.T products from previous requests + DT *C = static_cast
<DT *>
(m->qk_prods); + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + // Step 2: Add alibi position bias to qk production + // matrix C: qk_prods + // matrix C's layout: [num_new_tokens, total_tokens, num_heads] + // To get C, skip over QK.T products from previous requests + DT *C = static_cast
<DT *>
(m->qk_prods); + if (*m->position_bias) { + size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_position_bias_qkprd), + GET_BLOCKS(parallelism), + min((size_t)CUDA_NUM_THREADS, parallelism), + 0, + stream, + C, + num_new_tokens, + total_tokens, + m->num_q_heads, + m->global_num_q_heads, + shard_id); + } + + // Step 3: Apply causal mask. Fill all elements above diagonal in qk prods + // with -inf to force causal attention. + assert(num_new_tokens <= total_tokens); + size_t entries_above_diagonal = num_new_tokens * (num_new_tokens - 1) / 2; + if (entries_above_diagonal > 0) { + size_t parallelism = m->num_q_heads * entries_above_diagonal; + hipLaunchKernelGGL(HIP_KERNEL_NAME(fill_entries_above_diagonal), + GET_BLOCKS(parallelism), + min((size_t)CUDA_NUM_THREADS, parallelism), + 0, + stream, + C, + num_new_tokens, + total_tokens, + m->num_q_heads, + entries_above_diagonal, + static_cast
<DT>
(-INFINITY)); + } + + // Step 4: Compute Softmax(QK.T/sqrt(d_k)) + { + // Before modifying the parameters below, make sure to read the following + // description of the HIPDNN_TENSOR_NCHW tensor layout, from + // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#hipdnnTensorFormat_t: + // This tensor format specifies that the data is laid out in the following + // order: batch size, feature maps, rows, columns. The strides are + // implicitly defined in such a way that the data are contiguous in memory + // with no padding between images, feature maps, rows, and columns; the + // columns are the inner dimension and the images are the outermost + // dimension. + int n_param = m->num_q_heads; + int c_param = total_tokens; + int h_param = 1; + int w_param = num_new_tokens; + checkCUDNN(miopenSet4dTensorDescriptor( + m->qk_tensor, cudnn_data_type, n_param, c_param, h_param, w_param)); + float softmax_alpha = 1.0f, softmax_beta = 0.0f; + DT *C_softmax = static_cast
<DT *>
(m->qk_prods_softmax); + // The softmax operation below is executed according to the + // MIOPEN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The + // softmax operation is computed per spatial location (H,W) per image (N) + // across dimension C. + checkCUDNN(miopenSoftmaxForward_V2(m->handle.dnn, + &softmax_alpha, + m->qk_tensor, + C, + &softmax_beta, + m->qk_tensor, + C_softmax, + MIOPEN_SOFTMAX_ACCURATE, + MIOPEN_SOFTMAX_MODE_CHANNEL)); + } + // Copy C_softmax to m->softmax_activation_buffer if we need to compute + // PEFT backward + if (bc->requestsInfo[i].peft_bwd) { + DT *C_softmax = static_cast
<DT *>
(m->qk_prods_softmax); + size_t activation_size_needed = + sizeof(DT) * max_peft_tokens * max_peft_tokens * m->num_q_heads; + if (activation_size_needed > m->allocated_peft_buffer_size2) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->softmax_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size2 = activation_size_needed; + } + checkCUDA(hipMemcpyAsync(m->softmax_activation_buffer, + C_softmax, + sizeof(DT) * total_tokens * num_new_tokens * + m->num_q_heads, + hipMemcpyDeviceToDevice, + stream)); + } + // Step 5: Matmul softmax(QK.T/sqrt(d_k)) by V. Implemented as V @ + // softmax(QK.T/sqrt(d_k)).T + { + DT alpha = 1.0f, beta = 0.0f; + // after transpositions + int m_ = m->vProjSize; + int n = num_new_tokens; + int k = total_tokens; + // before transpositions + int lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; + // N.B. strides are applied before transpose operations + int strideA = vt_block_size; + int strideB = num_new_tokens * total_tokens; + int strideC = m->vProjSize; + // matrix A: value cache + // matrix A's layout: [vProjSize, num_heads, total_tokens] + // To get A, skip over V.T entries from previous requests (all heads + + // padding) + DT *A = static_cast
<DT *>
(m->valueCache) + i * vt_req_block_size; + // matrix B: qk_prods_softmax + // matrix B's layout: [num_new_tokens, total_tokens, num_heads] + // To get B, skip over softmax(QK.T/sqrt(d_k)) entries from previous + // requests (all heads) + DT *B = static_cast
<DT *>
(m->qk_prods_softmax); + // matrix C: attn heads + // matrix C's layout: [vProjSize, num_heads, num_new_tokens] + // To get C, skip over softmax(QK.T/sqrt(d_k))V products from previous + // requests + // store the result attn heads, also skip the genration tokens + DT *C = static_cast
<DT *>
(m->attn_heads) + + (bc->requestsInfo[i].first_token_offset_in_batch) * + m->num_q_heads * m->vProjSize; + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + tokens_previous_requests += num_new_tokens; + } + if (tokens_previous_requests != (num_tokens - bc->num_generation_tokens)) { + bc->print(); + printf("tokens_previous_requests: %i\n", tokens_previous_requests); + printf("num_tokens: %i\n", num_tokens); + printf("bc->num_generation_tokens: %i\n", bc->num_generation_tokens); + } + assert(tokens_previous_requests == (num_tokens - bc->num_generation_tokens)); +} + // gridDim = num_heads // blockDim = num_tokens/num_request * head_size // QKV tensor layout: |QKV| * num_new_tokens. |Q=K=V=head_size * num_heads| @@ -334,63 +668,6 @@ __global__ void apply_position_bias_qkprd(DT *input_ptr, } } -template -__global__ void apply_proj_bias_w(DT *input_ptr, - DT const *bias_ptr, - int num_tokens, - int qkv_weight_size, - int oProjSize) { - CUDA_KERNEL_LOOP(i, num_tokens * oProjSize) { - int bias_idx = qkv_weight_size + i % oProjSize; - input_ptr[i] += bias_ptr[bias_idx]; - } -} - -template -__global__ void apply_proj_bias_qkv(DT *input_ptr, - DT const *bias_ptr, - int shard_id, - int num_tokens, - int qProjSize, - int kProjSize, - int vProjSize, - int global_num_q_heads, - int num_q_heads, - bool scaling_query, - float scaling_factor, - int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size * QKV_WEIGHT_NUM) { - // for simplicity, assume q, k, v is in same shape - // 0->q, 1->k, 2->v - // int qkv_index = i / (num_tokens * qProjSize) % 3; - - int token_idx = i / (hidden_size * QKV_WEIGHT_NUM); - size_t in_token_idx = i - token_idx * hidden_size * QKV_WEIGHT_NUM; - - int qkv_index = in_token_idx / hidden_size; - - int proj_size = qkv_index == 0 ? qProjSize : kProjSize; - - int head_idx = - (in_token_idx - qkv_index * num_q_heads * proj_size) / proj_size; - int global_head_idx = head_idx + shard_id * num_q_heads; - - size_t pre_length = - qkv_index == 0 - ? 0 - : (qkv_index == 1 ? qProjSize * global_num_q_heads - : qProjSize * global_num_q_heads * KV_WEIGHT_NUM); - - size_t bias_idx = pre_length + global_head_idx * proj_size + i % proj_size; - - input_ptr[i] += bias_ptr[bias_idx]; - - if (scaling_query && qkv_index == 0) { - input_ptr[i] *= scaling_factor; - } - } -} - template __global__ void scaling_query_kernel(DT *input_ptr, int qProjSize, @@ -405,60 +682,17 @@ __global__ void scaling_query_kernel(DT *input_ptr, } } -template -__global__ void - apply_rotary_embedding_native(DT *input_ptr, - hipFloatComplex *complex_input, - BatchConfig::PerTokenInfo const *tokenInfos, - int qProjSize, - int kProjSize, - int num_q_heads, - int num_tokens, - int num_kv_heads, - int q_block_size, - int k_block_size, - int q_array_size) { - CUDA_KERNEL_LOOP( - i, - num_tokens * (qProjSize * num_q_heads + kProjSize * num_kv_heads) / 2) { - // create complex number - bool q_tensor = i < (q_array_size / 2); - int proj_size = q_tensor ? qProjSize : kProjSize; - int real_i = q_tensor ? i : i - q_array_size / 2; - - int head_idx = real_i / (num_tokens * proj_size / 2); - int idx = real_i % (num_tokens * proj_size / 2); - int real_part_index = idx * 2 + - head_idx * (q_tensor ? q_block_size : k_block_size) + - (q_tensor ? 
0 : q_array_size); - - int complex_part_index = real_part_index + 1; - - complex_input[i] = {input_ptr[real_part_index], - input_ptr[complex_part_index]}; - - int token_idx = - (real_i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2); - size_t pos = tokenInfos[token_idx].abs_depth_in_request; - - // float before_real = complex_input[i].x, before_complex = - // complex_input[i].y; - - int pos_i = real_i % (proj_size / 2); - float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size)); - hipFloatComplex complex_pos = {cos(freq), sin(freq)}; - - complex_input[i] = hipCmulf(complex_input[i], complex_pos); - input_ptr[real_part_index] = complex_input[i].x; - input_ptr[complex_part_index] = complex_input[i].y; - } -} - template __global__ void apply_rotary_embedding_hf(DT *input_ptr, hipFloatComplex *complex_input, BatchConfig::PerTokenInfo const *tokenInfos, + float rope_theta, + bool llama3_rope, + float factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, int qProjSize, int kProjSize, int num_tokens, @@ -493,7 +727,29 @@ __global__ void // float before_real = complex_input[i].x, before_complex = int pos_i = real_i % (proj_size / 2); - float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size)); + + float freq = + pos * (1.0 / pow(rope_theta, (float)2 * pos_i / proj_size)); // θ_i + + if (llama3_rope) { + float pi = HIP_PI_F; + float wavelen = 2 * pi / freq; + float low_freq_wavelen = + original_max_position_embeddings / low_freq_factor; + float high_freq_wavelen = + original_max_position_embeddings / high_freq_factor; + if (wavelen < high_freq_wavelen) { + } else if (wavelen > low_freq_wavelen) { + freq = freq / factor; + } else { + assert(low_freq_wavelen != high_freq_wavelen); + float smooth = + (original_max_position_embeddings / wavelen - low_freq_factor) / + (high_freq_factor - low_freq_factor); + freq = ((1 - smooth) * freq / factor + smooth * freq); + } + } + hipFloatComplex complex_pos = {cos(freq), sin(freq)}; complex_input[i] = hipCmulf(complex_input[i], complex_pos); @@ -507,6 +763,12 @@ __global__ void apply_rotary_embedding_bwd(DT *input_ptr, hipFloatComplex *complex_input, BatchConfig::PerTokenInfo const *tokenInfos, + float rope_theta, + bool llama3_rope, + float factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, int proj_size, int num_tokens, int hidden_size) { @@ -533,7 +795,28 @@ __global__ void size_t pos = tokenInfos[token_idx].abs_depth_in_request; - float freq = pos * (1.0 / pow(10000.0, (float)2 * idx / proj_size)); + float freq = + pos * (1.0 / pow(rope_theta, (float)2 * idx / proj_size)); // θ_i + + if (llama3_rope) { + float pi = HIP_PI_F; + float wavelen = 2 * pi / freq; + float low_freq_wavelen = + original_max_position_embeddings / low_freq_factor; + float high_freq_wavelen = + original_max_position_embeddings / high_freq_factor; + if (wavelen < high_freq_wavelen) { + } else if (wavelen > low_freq_wavelen) { + freq = freq / factor; + } else { + assert(low_freq_wavelen != high_freq_wavelen); + float smooth = + (original_max_position_embeddings / wavelen - low_freq_factor) / + (high_freq_factor - low_freq_factor); + freq = ((1 - smooth) * freq / factor + smooth * freq); + } + } + hipFloatComplex complex_pos = {cos(freq), sin(freq)}; complex_input[i] = hipCmulf(complex_input[i], complex_pos); @@ -542,172 +825,59 @@ __global__ void } } -template -__global__ void fill_entries_above_diagonal(DT *matrix, - size_t num_rows, - size_t num_cols, - 
size_t num_q_heads, - size_t entries_above_diagonal, - DT value) { - CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_q_heads) { - size_t head_idx = i / entries_above_diagonal; - size_t entry_idx = i % entries_above_diagonal; - size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; - size_t x = entry_idx - y * (y + 1) / 2; - y += (num_cols - num_rows) + 1; - matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; - } -} - template void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, int shard_id, - DT const *input_ptr, - DT const *weight_ptr, DT *output_ptr, - DT const *bias_ptr, hipStream_t stream) { checkCUDA(hipblasSetStream(m->handle.blas, stream)); checkCUDNN(miopenSetStream(m->handle.dnn, stream)); assert(m->qSize == m->vSize && m->qSize == m->kSize); - hipblasDatatype_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); - hipblasDatatype_t compute_type = cublas_data_type; - // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - // hipblasDatatype_t compute_type = cublas_data_type; - // #else - // // For best performance, set the default cublas compute type to - // // CUBLAS_COMPUTE_16F for half precision and to - // // CUBLAS_COMPUTE_32F_FAST_16F for full precision - // hipblasDatatype_t compute_type = CUBLAS_COMPUTE_16F; - // if (m->output_type[0] == DT_FLOAT) { - // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - // } - // #endif - - // Step 1: Compute QKV projections - { - DT alpha = 1.0f, beta = 0.0f; - // after transpositions - int m_q = m->qProjSize * m->num_q_heads; - int m_k = m->kProjSize * m->num_q_heads; - int m_v = m->vProjSize * m->num_q_heads; - assert(m_q == m_k && m_k == m_v); // keep things simple for now - int n = bc->num_active_infr_tokens(); - int k = m->qSize; - int m_ = m_q * QKV_WEIGHT_NUM; - // before transpositions - int lda = k, ldb = k, ldc = m_; - // matrix A: QKV weights - // matrix A's layout: [qSize (hidden_dim), qProjSize, num_heads, 3] - // matrix B: input - // matrix B's layout: [qSize (hidden_dim), num_new_tokens] - // matrix C: devQKVProjArray - // matrix B's layout: [qProjSize, num_heads, 3, num_new_tokens] - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_N, - m_, - n, - k, - &alpha, - weight_ptr, - cublas_data_type, - lda, - input_ptr, - cublas_data_type, - ldb, - &beta, - output_ptr, - cublas_data_type, - ldc, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - } int num_tokens = bc->num_active_tokens(); int parallelism = m->kProjSize * num_tokens * m->num_q_heads; size_t q_array_size = m->qProjSize * num_tokens * m->num_q_heads; - // Step 2: apply bias for QKV, or scale the query - if (*m->qkv_bias) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_qkv), - GET_BLOCKS(parallelism), - min(CUDA_NUM_THREADS, parallelism), - 0, - stream, - output_ptr, - bias_ptr, - shard_id, - num_tokens, - m->qProjSize, - m->kProjSize, - m->vProjSize, - m->global_num_q_heads, - m->num_q_heads, - *m->scaling_query, - m->scaling_factor, - m->hidden_size); - } else if (m->scaling_query) { + if (m->scaling_query) { hipLaunchKernelGGL(HIP_KERNEL_NAME(scaling_query_kernel), GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), 0, stream, output_ptr, + m->qProjSize, num_tokens, m->num_q_heads, - m->qProjSize, m->scaling_factor, m->hidden_size); } // Step 3: apply rotary embedding if needed - if (*m->apply_rotary_embedding) { + if (m->rotary_embedding_meta->apply_rotary_embedding) { /*q&k*/ parallelism = num_tokens * m->hidden_size; - 
hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_rotary_embedding_hf), - GET_BLOCKS(parallelism), - min(CUDA_NUM_THREADS, parallelism), - 0, - stream, - output_ptr, - m->complex_input, - m->token_infos, - m->qProjSize, - m->kProjSize, - num_tokens, - q_array_size, - m->hidden_size); - } -} - -template -__global__ void store_kv_cache(DT const *devQKVProjArray, - DT *kCache_ptr, - DT *vCache_ptr, - BatchConfig::PerTokenInfo const *tokenInfos, - int num_tokens, - int max_seq_len, - int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { - int token_idx = i / hidden_size; - int offset = i % hidden_size; - - size_t val_idx = - token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; - - DT kVal = devQKVProjArray[val_idx]; - DT vVal = devQKVProjArray[val_idx + hidden_size]; - int const req_id = tokenInfos[token_idx].request_index; - int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - - // key cache - kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + - offset] = kVal; - vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + - offset] = vVal; + hipLaunchKernelGGL( + HIP_KERNEL_NAME(apply_rotary_embedding_hf), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + output_ptr, + m->complex_input, + m->token_infos, + m->rotary_embedding_meta->rope_theta, + (m->rotary_embedding_meta->rope_type == "llama3"), + m->rotary_embedding_meta->factor, + m->rotary_embedding_meta->low_freq_factor, + m->rotary_embedding_meta->high_freq_factor, + m->rotary_embedding_meta->original_max_position_embeddings, + m->qProjSize, + m->kProjSize, + num_tokens, + q_array_size, + m->hidden_size); } } @@ -723,91 +893,13 @@ void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, min(CUDA_NUM_THREADS, parallelism), 0, stream, - static_cast
<DT *>
<DT *>(m->devQKVProjArray), - static_cast
<DT *>(m->keyCache), - static_cast<DT *>
(m->valueCache), - m->token_infos, - num_tokens, - BatchConfig::max_sequence_length(), - m->hidden_size); - } -} - -template -void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - DT *output_ptr, - DT const *weight_ptr, - DT const *bias_ptr, - int num_tokens, - hipStream_t stream) { - hipblasDatatype_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); - miopenDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); - assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - hipblasDatatype_t compute_type = HIPBLAS_R_16F; -#else - hipblasDatatype_t compute_type = cublas_data_type; -#endif - // Project to output, save result directly on output tensor - { - DT alpha = 1.0f, beta = 0.0f; - // after transpositions - int m_ = m->oProjSize; - int k = m->vProjSize * m->num_q_heads; - int n = num_tokens; - // before transpositions - int lda = k, ldb = k, ldc = m_; - // matrix A: output projection weight - // matrix A's layout: [vProjSize * num_heads, oProjSize] - DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_q_heads + - m->vProjSize * m->num_q_heads); - // matrix B: attn heads - // matrix B's layout: [vProjSize * num_heads, num_new_tokens] - DT const *B = static_cast
<DT *>(m->attn_heads); - // matrix B: output - // matrix B's layout: [oProjSize, num_new_tokens] - DT *C = static_cast<DT *>
(output_ptr); - - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - B, - cublas_data_type, - ldb, - &beta, - C, - cublas_data_type, - ldc, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - } - // Add final output bias - if (*m->final_bias && shard_id == 0) { - int parallelism = m->oProjSize * num_tokens; - int qkv_weight_size = m->qProjSize * m->global_num_q_heads + - m->kProjSize * m->global_num_q_heads + - m->vProjSize * m->global_num_q_heads; - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_w), - GET_BLOCKS(parallelism), - min(CUDA_NUM_THREADS, parallelism), - 0, - stream, - output_ptr, - bias_ptr, + static_cast
<DT *>(m->devQKVProjArray), + static_cast
<DT *>(m->keyCache), + static_cast<DT *>
(m->valueCache), + m->token_infos, num_tokens, - qkv_weight_size, - m->oProjSize); + BatchConfig::max_sequence_length(), + m->hidden_size); } } @@ -856,93 +948,43 @@ void compute_attention_kernel_generation(IncMultiHeadSelfAttentionMeta const *m, } } -template -void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, - GenericTensorAccessorR const weight, - DataType data_type, - hipStream_t stream) { - // additional processing for weight uploading - // Note that we update weight_ptr and bias_ptr when uploading weight and - // bias - if (m->quantization_type != DT_NONE) { - // copy weight_ptr to quantized_weight_ptr, do compression and store in - // m->weight_ptr - checkCUDA(hipMemcpyAsync(m->quantized_weight_ptr, - weight.get_byte_ptr(), - m->quantized_weightSize, - hipMemcpyHostToDevice, - stream)); - - if (m->quantization_type == DT_INT4) { - int parallelism = m->qProjSize * m->qSize * m->num_q_heads / 2; - hipLaunchKernelGGL(HIP_KERNEL_NAME(decompress_int4_attention_weights), - GET_BLOCKS(parallelism), - min(CUDA_NUM_THREADS, parallelism), - 0, - stream, - m->quantized_weight_ptr, - static_cast
(m->weight_ptr), - m->qProjSize, - m->qSize, - m->num_q_heads); - } else { - assert(m->quantization_type == DT_INT8); - int parallelism = m->qProjSize * m->qSize * m->num_q_heads; - hipLaunchKernelGGL(HIP_KERNEL_NAME(decompress_int8_attention_weights), - GET_BLOCKS(parallelism), - min(CUDA_NUM_THREADS, parallelism), - 0, - stream, - m->quantized_weight_ptr, - static_cast
(m->weight_ptr), - m->qProjSize, - m->qSize, - m->num_q_heads); - } - } else { - if (data_type == DT_FLOAT) { - checkCUDA(hipMemcpyAsync(m->weight_ptr, - weight.get_float_ptr(), - m->weightSize, - hipMemcpyHostToDevice, - stream)); - } else if (data_type == DT_HALF) { - checkCUDA(hipMemcpyAsync(m->weight_ptr, - weight.get_half_ptr(), - m->weightSize, - hipMemcpyHostToDevice, - stream)); - } else { - assert(false); - } +std::string get_fwd_dbg_folder(IncMultiHeadSelfAttentionMeta const *m, + int shard_id) { + std::string op_name_without_uid = + IncMultiHeadSelfAttention::get_op_name_without_uid(m); + fs::path dst_filepath = get_dst_folder("fwd", m->decoding_step, shard_id); + if (m->layer_guid.model_id > 0) { + assert(false && "Model ID > 0 not supported yet"); } + std::string layername = "layers." + + std::to_string(m->layer_guid.transformer_layer_id) + + "." + op_name_without_uid; + dst_filepath /= layername; + return dst_filepath.string(); } template void inference_kernel(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, - DT const *input_ptr, - DT const *weight_ptr, + DT const *qkv_ptr, DT *output_ptr, - DT const *bias_ptr, hipStream_t stream) { - if (m->offload && m->biasSize > 0) { - checkCUDA(hipMemcpyAsync( - m->bias_ptr, bias_ptr, m->biasSize, hipMemcpyHostToDevice, stream)); - bias_ptr = static_cast
(m->bias_ptr); - } + // phase 0: copy calculated qkv into devQKVProjArray + // [qProjSize, num_heads, 3, num_new_tokens] + size_t qkv_proj_size = + m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens(); - // phase 1: Implement kernel to compute KQV for input tokens - compute_qkv_kernel(m, - bc, - shard_id, - input_ptr, - weight_ptr, - static_cast
(m->devQKVProjArray), - bias_ptr, - stream); + hipMemcpyAsync(m->devQKVProjArray, + qkv_ptr, + qkv_proj_size * sizeof(DT), + hipMemcpyDeviceToDevice, + stream); + + // phase 1: Implement kernel to apply rotary embedding and scaling + compute_qkv_kernel( + m, bc, shard_id, static_cast
<DT *>(m->devQKVProjArray), stream); update_kv_cache_kernel<DT>
(m, bc, stream); if (bc->num_generation_tokens > 0) { @@ -953,14 +995,16 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m, if (bc->num_tokens > bc->num_generation_tokens) { // phase 4: Compute attention score for prompt tokens; - compute_attention_kernel_prompt( - m, bc, shard_id, bias_ptr, weight_ptr, stream); + compute_attention_kernel_prompt
(m, bc, shard_id, stream); } // compute output production and bias together for all tokens int num_tokens = bc->num_active_tokens(); - compute_o_prod_bias( - m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); + hipMemcpyAsync(output_ptr, + m->attn_heads, + m->oProjSize * num_tokens * sizeof(DT), + hipMemcpyDeviceToDevice, + stream); } std::string get_peft_dbg_folder(IncMultiHeadSelfAttentionMeta const *m, @@ -978,14 +1022,75 @@ std::string get_peft_dbg_folder(IncMultiHeadSelfAttentionMeta const *m, return dst_filepath.string(); } +__global__ void transposeAdd_half_kernel( + half *out, half const *in, int width, int height, half alpha, half beta) { + int t_id = blockIdx.x * blockDim.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + for (int i = t_id; i < width * height; i += num_threads) { + int row = i / width; + int col = i % width; + out[col * height + row] = + alpha * in[row * width + col] + beta * out[col * height + row]; + } +} + +__global__ void transposeAdd_float_kernel(float *out, + float const *in, + int width, + int height, + float alpha, + float beta) { + int t_id = blockIdx.x * blockDim.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + for (int i = t_id; i < width * height; i += num_threads) { + int row = i / width; + int col = i % width; + out[col * height + row] = + alpha * in[row * width + col] + beta * out[col * height + row]; + } +} + +template +void transposeAdd(DT *out, + const DT *in, + int width, + int height, + float alpha, + float beta, + hipStream_t stream) { + assert(false && "Unsupported data type"); +} + +template <> +void transposeAdd(float *out, + float const *in, + int width, + int height, + float alpha, + float beta, + hipStream_t stream) { + transposeAdd_float_kernel<<<4, 1024, 0, stream>>>( + out, in, width, height, alpha, beta); +} + +template <> +void transposeAdd(half *out, + half const *in, + int width, + int height, + float alpha, + float beta, + hipStream_t stream) { + transposeAdd_half_kernel<<<4, 1024, 0, stream>>>( + out, in, width, height, __float2half(alpha), __float2half(beta)); +} + template void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, int shard_id, DT *input_grad_ptr, - DT const *weight_ptr, DT const *output_grad_ptr, - DT const *bias_ptr, hipStream_t stream) { assert(!m->offload); checkCUDA(hipblasSetStream(m->handle.blas, stream)); @@ -994,17 +1099,6 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, miopenDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); hipblasDatatype_t compute_type = cublas_data_type; - // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - // hipblasDatatype_t compute_type = cublas_data_type; - // #else - // // For best performance, set the default cublas compute type to - // // CUBLAS_COMPUTE_16F for half precision and to - // // CUBLAS_COMPUTE_32F_FAST_16F for full precision - // hipblasDatatype_t compute_type = CUBLAS_COMPUTE_16F; - // if (m->output_type[0] == DT_FLOAT) { - // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - // } - // #endif for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { @@ -1026,47 +1120,18 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, int vt_req_block_size = vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); assert(m->qProjSize == m->kProjSize && m->kProjSize == m->vProjSize); - // Step 1: compute gradients before final projection + // Step 1: copy 
gradient before final projection into workspace { int m_ = m->vProjSize * m->num_q_heads; int n_ = num_tokens; - int k_ = m->oProjSize; - int lda = m_; - int ldb = k_; - int ldc = m_; - float alpha = 1.0f, beta = 0.0f; - // matrix A: output projection weight - // matrix A's layout: [vProjSize * num_heads, oProjSize] - DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_q_heads + - m->vProjSize * m->num_q_heads); - // matrix B: output gradients - // matrix B's layout: [oProjSize, num_new_tokens] - DT const *B = - output_grad_ptr + - bc->requestsInfo[i].first_token_offset_in_batch * m->oProjSize; - // matrix C: attn_heads gradients - // matrix C's layout: [vProjSize * num_heads, num_new_tokens] DT *C = static_cast
(m->handle.workSpace); - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_N, - HIPBLAS_OP_N, - m_, - n_, - k_, - &alpha, - A, - cublas_data_type, - lda, - B, - cublas_data_type, - ldb, - &beta, - C, - cublas_data_type, - ldc, - compute_type, - HIPBLAS_GEMM_DEFAULT)); + hipMemcpyAsync(C, + output_grad_ptr + + bc->requestsInfo[i].first_token_offset_in_batch * + m->oProjSize, + m_ * n_ * sizeof(DT), + hipMemcpyDeviceToDevice, + stream); if (m->inference_debugging) { // save result to file for checking std::string filename = @@ -1331,264 +1396,15 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, int lda = num_tokens; // num_new_tokens int ldb = m->qProjSize * m->num_q_heads; int ldc = num_tokens; - int strideA = num_tokens * num_tokens; - int strideB = m->qProjSize; - int strideC = num_tokens * m->qProjSize; - checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, - HIPBLAS_OP_N, - HIPBLAS_OP_T, - m_, - n_, - k_, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - if (m->inference_debugging) { - std::string filename = - get_peft_dbg_folder(m, shard_id) + ".devQKVPRojArray_pre"; - save_tensor(C, - num_tokens * m->qProjSize * m->num_q_heads * 3, - filename.c_str()); - } - } - - // Step 7: perform rotary position embeddings (RoPE) bwd - { - if (*m->apply_rotary_embedding) { - assert(m->hidden_size == m->qProjSize * m->num_q_heads); - assert(m->qProjSize == m->kProjSize); - /*q&k*/ - int parallelism = num_tokens * m->hidden_size; - DT *A = static_cast
(m->devQKVProjArray); - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_rotary_embedding_bwd), - GET_BLOCKS(parallelism), - min(CUDA_NUM_THREADS, parallelism), - 0, - stream, - A, - m->complex_input, - m->token_infos, - m->qProjSize, - num_tokens, - m->hidden_size); - DT *C = static_cast
(m->devQKVProjArray); - if (m->inference_debugging) { - std::string filename = - get_peft_dbg_folder(m, shard_id) + ".devQKVPRojArray"; - save_tensor(C, - num_tokens * m->qProjSize * m->num_q_heads * 3, - filename.c_str()); - } - } - - // matrix C: gradients for key (saved as part of m->devQKVProjArray) - // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] - DT *C = - static_cast
(m->devQKVProjArray) + - num_tokens * - (m->qProjSize * - m->num_q_heads); // skip over regions reserved for Q gradients - if (m->inference_debugging) { - std::string filename = get_peft_dbg_folder(m, shard_id) + ".devkproj"; - save_tensor( - C, num_tokens * (m->qProjSize * m->num_q_heads), filename.c_str()); - } - } - - // Step 8: compute gradients w.r.t. input - { - float alpha = 1.0f, beta = 0.0f; - if (!m->reset_input_grads[0]) { - beta = 1.0f; - } - // matrix A: QKV projection weights - // matrix A's layout: [qSize, qProjSize * num_q_heads, 3] - DT const *A = weight_ptr; - // matrix B: gradients w.r.t. QKV (concatenated in devQKVArray) - // matrix B's layout: [num_tokens, qProjsize * num_heads, 3] - DT const *B = static_cast
(m->devQKVProjArray); - // matrix C: gradients w.r.t. input - // matrix C's layout: [m->qSize, num_tokens] - DT *C = input_grad_ptr + - bc->requestsInfo[i].first_token_offset_in_batch * m->qSize; - int m_ = m->qSize; - int n_ = num_tokens; - int k_ = m->num_q_heads * (m->qProjSize + m->kProjSize + m->vProjSize); - int lda = m_; - int ldb = n_; - int ldc = m_; - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_N, - HIPBLAS_OP_T, - m_, - n_, - k_, - &alpha, - A, - cublas_data_type, - lda, - B, - cublas_data_type, - ldb, - &beta, - C, - cublas_data_type, - ldc, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - if (m->inference_debugging) { - std::string filename = - get_peft_dbg_folder(m, shard_id) + ".self_attn.input_gradient_0"; - save_tensor(C, num_tokens * m->qSize, filename.c_str()); - } - } - } -} - -} // namespace IncMultiHeadAttention -} // namespace Kernels - -using namespace Kernels::IncMultiHeadAttention; - -template -__global__ void store_query_cache(DT const *devQKVProjArray, - DT *qCache_ptr, - int num_tokens, - int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { - int token_idx = i / hidden_size; - int offset = i % hidden_size; - - size_t val_idx = token_idx * QKV_WEIGHT_NUM * hidden_size + offset; - - DT qVal = devQKVProjArray[val_idx]; - - // query cache - qCache_ptr[i] = qVal; - } -} - -template -void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, - BatchConfig const *bc, - int shard_id, - DT const *bias_ptr, - DT const *weight_ptr, - hipStream_t stream) { - checkCUDA(hipblasSetStream(m->handle.blas, stream)); - checkCUDNN(miopenSetStream(m->handle.dnn, stream)); - hipblasDatatype_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); - miopenDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); - assert(data_type_size(m->output_type[0]) == sizeof(DT)); - hipblasDatatype_t compute_type = cublas_data_type; - // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - // hipblasDatatype_t compute_type = cublas_data_type; - // #else - // // For best performance, set the default cublas compute type to - // // CUBLAS_COMPUTE_16F for half precision and to - // // CUBLAS_COMPUTE_32F_FAST_16F for full precision - // hipblasDatatype_t compute_type = CUBLAS_COMPUTE_16F; - // if (m->output_type[0] == DT_FLOAT) { - // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - // } - // #endif - // int num_requests = bc->num_active_requests(); - int num_tokens = bc->num_active_tokens(); - int tokens_previous_requests = 0; - int q_block_size = m->qProjSize; - int kt_block_size = m->kProjSize; - int kt_req_block_size = - kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); - int vt_block_size = m->vProjSize; - int vt_req_block_size = - vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); - assert(m->qProjSize == m->kProjSize); - - for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i] || - (!bc->requestsInfo[i].prompt_phase && !bc->requestsInfo[i].peft_bwd)) { - continue; - } - int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + - bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; - // Copy query to m->query_activation_buffer if we need to compute - // PEFT backward - if (bc->requestsInfo[i].peft_bwd) { - size_t activation_size_needed = - sizeof(DT) * max_peft_tokens * m->num_q_heads * m->qProjSize; - if (activation_size_needed > 
m->allocated_peft_buffer_size1) { - MemoryAllocator *allocator = m->handle.peft_activation_allocator; - m->query_activation_buffer = - allocator->allocate_instance_untyped(activation_size_needed); - m->allocated_peft_buffer_size1 = activation_size_needed; - } - int parallelism = m->hidden_size * num_tokens; - hipLaunchKernelGGL(HIP_KERNEL_NAME(store_query_cache), - GET_BLOCKS(parallelism), - min(CUDA_NUM_THREADS, parallelism), - 0, - stream, - static_cast
<DT *>(m->devQKVProjArray), - static_cast<DT *>
(m->query_activation_buffer), - num_tokens, - m->hidden_size); - } - // Step 1: compute query-key product QK.T/sqrt(d_k) - { - // Scale by sqrt(d_k) as per the original attention paper - DT alpha = 1.0f, beta = 0.0f; - if (*m->qk_prod_scaling) { - alpha = static_cast
(1.0f / sqrt(m->kProjSize)); - } - // after transpositions - int m_ = num_new_tokens; - int n = total_tokens; - int k = m->qProjSize; - // before transpositions - int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, - ldc = m_; - // N.B. strides are applied before transpose operations - int strideA = q_block_size; - int strideB = kt_block_size; - int strideC = num_new_tokens * total_tokens; - - // matrix A: devQKVProjArray - // matrix A's layout: [qProjSize, num_heads, 3, num_new_tokens] - // To get query projection, skip over Q entries from previous requests - DT const *A = static_cast
(m->devQKVProjArray) + - bc->requestsInfo[i].first_token_offset_in_batch * - m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; - // matrix B: key cache - // matrix B's layout: [kProjSize * num_heads, total_tokens] - // To get B, skip over K entries from previous requests (all heads + - // padding) - DT const *B = static_cast
(m->keyCache) + i * kt_req_block_size; - // matrix C: qk_prods - // matrix C's layout: [num_new_tokens, total_tokens, num_heads] - // To get C, skip over QK.T products from previous requests - DT *C = static_cast
(m->qk_prods); + int strideA = num_tokens * num_tokens; + int strideB = m->qProjSize; + int strideC = num_tokens * m->qProjSize; checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, - HIPBLAS_OP_T, HIPBLAS_OP_N, + HIPBLAS_OP_T, m_, - n, - k, + n_, + k_, &alpha, A, cublas_data_type, @@ -1606,177 +1422,111 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, m->num_q_heads, compute_type, HIPBLAS_GEMM_DEFAULT)); - } - // Step 2: Add alibi position bias to qk production - // matrix C: qk_prods - // matrix C's layout: [num_new_tokens, total_tokens, num_heads] - // To get C, skip over QK.T products from previous requests - DT *C = static_cast
(m->qk_prods); - if (*m->position_bias) { - size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_position_bias_qkprd), - GET_BLOCKS(parallelism), - min((size_t)CUDA_NUM_THREADS, parallelism), - 0, - stream, - C, - num_new_tokens, - total_tokens, - m->num_q_heads, - m->global_num_q_heads, - shard_id); - } - - // Step 3: Apply causal mask. Fill all elements above diagonal in qk prods - // with -inf to force causal attention. - assert(num_new_tokens <= total_tokens); - size_t entries_above_diagonal = num_new_tokens * (num_new_tokens - 1) / 2; - if (entries_above_diagonal > 0) { - size_t parallelism = m->num_q_heads * entries_above_diagonal; - hipLaunchKernelGGL(HIP_KERNEL_NAME(fill_entries_above_diagonal), - GET_BLOCKS(parallelism), - min((size_t)CUDA_NUM_THREADS, parallelism), - 0, - stream, - C, - num_new_tokens, - total_tokens, - m->num_q_heads, - entries_above_diagonal, - static_cast
(-INFINITY)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".devQKVPRojArray_pre"; + save_tensor(C, + num_tokens * m->qProjSize * m->num_q_heads * 3, + filename.c_str()); + } } - // Step 4: Compute Softmax(QK.T/sqrt(d_k)) + // Step 7: perform rotary position embeddings (RoPE) bwd { - // Before modifying the parameters below, make sure to read the following - // description of the HIPDNN_TENSOR_NCHW tensor layout, from - // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#hipdnnTensorFormat_t: - // This tensor format specifies that the data is laid out in the following - // order: batch size, feature maps, rows, columns. The strides are - // implicitly defined in such a way that the data are contiguous in memory - // with no padding between images, feature maps, rows, and columns; the - // columns are the inner dimension and the images are the outermost - // dimension. - int n_param = m->num_q_heads; - int c_param = total_tokens; - int h_param = 1; - int w_param = num_new_tokens; - checkCUDNN(miopenSet4dTensorDescriptor( - m->qk_tensor, cudnn_data_type, n_param, c_param, h_param, w_param)); - float softmax_alpha = 1.0f, softmax_beta = 0.0f; - DT *C_softmax = static_cast
(m->qk_prods_softmax); - // The softmax operation below is executed according to the - // MIOPEN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The - // softmax operation is computed per spatial location (H,W) per image (N) - // across dimension C. - checkCUDNN(miopenSoftmaxForward_V2(m->handle.dnn, - &softmax_alpha, - m->qk_tensor, - C, - &softmax_beta, - m->qk_tensor, - C_softmax, - MIOPEN_SOFTMAX_ACCURATE, - MIOPEN_SOFTMAX_MODE_CHANNEL)); - } - // Copy C_softmax to m->softmax_activation_buffer if we need to compute - // PEFT backward - if (bc->requestsInfo[i].peft_bwd) { - DT *C_softmax = static_cast
(m->qk_prods_softmax); - size_t activation_size_needed = - sizeof(DT) * max_peft_tokens * max_peft_tokens * m->num_q_heads; - if (activation_size_needed > m->allocated_peft_buffer_size2) { - MemoryAllocator *allocator = m->handle.peft_activation_allocator; - m->softmax_activation_buffer = - allocator->allocate_instance_untyped(activation_size_needed); - m->allocated_peft_buffer_size2 = activation_size_needed; + if (m->rotary_embedding_meta->apply_rotary_embedding) { + assert(m->hidden_size == m->qProjSize * m->num_q_heads); + assert(m->qProjSize == m->kProjSize); + /*q&k*/ + int parallelism = num_tokens * m->hidden_size; + DT *A = static_cast
(m->devQKVProjArray); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(apply_rotary_embedding_bwd), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + A, + m->complex_input, + m->token_infos, + m->rotary_embedding_meta->rope_theta, + (m->rotary_embedding_meta->rope_type == "llama3"), + m->rotary_embedding_meta->factor, + m->rotary_embedding_meta->low_freq_factor, + m->rotary_embedding_meta->high_freq_factor, + m->rotary_embedding_meta->original_max_position_embeddings, + m->qProjSize, + num_tokens, + m->hidden_size); + DT *C = static_cast
(m->devQKVProjArray); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".devQKVPRojArray"; + save_tensor(C, + num_tokens * m->qProjSize * m->num_q_heads * 3, + filename.c_str()); + } + } + + // matrix C: gradients for key (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = + static_cast
(m->devQKVProjArray) + + num_tokens * + (m->qProjSize * + m->num_q_heads); // skip over regions reserved for Q gradients + if (m->inference_debugging) { + std::string filename = get_peft_dbg_folder(m, shard_id) + ".devkproj"; + save_tensor( + C, num_tokens * (m->qProjSize * m->num_q_heads), filename.c_str()); } - checkCUDA(hipMemcpyAsync(m->softmax_activation_buffer, - C_softmax, - sizeof(DT) * total_tokens * num_new_tokens * - m->num_q_heads, - hipMemcpyDeviceToDevice, - stream)); } - // Step 5: Matmul softmax(QK.T/sqrt(d_k)) by V. Implemented as V @ - // softmax(QK.T/sqrt(d_k)).T + + // Step 8: compute gradients w.r.t. input { - DT alpha = 1.0f, beta = 0.0f; - // after transpositions - int m_ = m->vProjSize; - int n = num_new_tokens; - int k = total_tokens; - // before transpositions - int lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; - // N.B. strides are applied before transpose operations - int strideA = vt_block_size; - int strideB = num_new_tokens * total_tokens; - int strideC = m->vProjSize; - // matrix A: value cache - // matrix A's layout: [vProjSize, num_heads, total_tokens] - // To get A, skip over V.T entries from previous requests (all heads + - // padding) - DT *A = static_cast
(m->valueCache) + i * vt_req_block_size; - // matrix B: qk_prods_softmax - // matrix B's layout: [num_new_tokens, total_tokens, num_heads] - // To get B, skip over softmax(QK.T/sqrt(d_k)) entries from previous - // requests (all heads) - DT *B = static_cast
<DT *>(m->qk_prods_softmax); - // matrix C: attn heads - // matrix C's layout: [vProjSize, num_heads, num_new_tokens] - // To get C, skip over softmax(QK.T/sqrt(d_k))V products from previous - // requests - // store the result attn heads, also skip the genration tokens - DT *C = static_cast<DT *>
(m->attn_heads) + - (bc->requestsInfo[i].first_token_offset_in_batch) * - m->num_q_heads * m->vProjSize; - checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, - HIPBLAS_OP_N, - HIPBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - HIPBLAS_GEMM_DEFAULT)); + float alpha = 1.0f, beta = 0.0f; + if (!m->reset_input_grads[0]) { + beta = 1.0f; + } + // matrix B: gradients w.r.t. QKV (concatenated in devQKVArray) + // matrix B's layout: [num_tokens, qProjsize * num_heads, 3] + DT const *B = static_cast
(m->devQKVProjArray); + // matrix C: gradients w.r.t. input + // matrix C's layout: [m->qSize, num_tokens] + DT *C = input_grad_ptr + + bc->requestsInfo[i].first_token_offset_in_batch * m->qSize; + // int m_ = m->qSize; + int n_ = num_tokens; + int k_ = m->num_q_heads * (m->qProjSize + m->kProjSize + m->vProjSize); + + // The original version uses existing result and attention's projection to + // do further calculation in a way different than the usual dense layer, + // they are off by a transpose. So an explicit transpose is needed here. + // The add here is just for gradient accumulation. + transposeAdd(C, B, n_, k_, alpha, beta, stream); + + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".self_attn.input_gradient_0"; + save_tensor(C, num_tokens * m->qSize, filename.c_str()); + } } - tokens_previous_requests += num_new_tokens; - } - if (tokens_previous_requests != (num_tokens - bc->num_generation_tokens)) { - bc->print(); - printf("tokens_previous_requests: %i\n", tokens_previous_requests); - printf("num_tokens: %i\n", num_tokens); - printf("bc->num_generation_tokens: %i\n", bc->num_generation_tokens); } - assert(tokens_previous_requests == (num_tokens - bc->num_generation_tokens)); } +} // namespace IncMultiHeadAttention +} // namespace Kernels + +using namespace Kernels::IncMultiHeadAttention; + /*static*/ void IncMultiHeadSelfAttention::inference_kernel_wrapper( IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, - GenericTensorAccessorR const &weight, - GenericTensorAccessorW const &output, - GenericTensorAccessorR const &bias) { + GenericTensorAccessorW const &output) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - bool use_bias = *m->qkv_bias || *m->final_bias; hipEvent_t t_start, t_end; if (m->profiling) { @@ -1785,43 +1535,14 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( checkCUDA(hipEventRecord(t_start, stream)); } - // assert(input.data_type == weight.data_type); assert(input.data_type == output.data_type); - if (use_bias) { - assert(input.data_type == bias.data_type); - } if (input.data_type == DT_HALF) { - if (m->offload) { - pre_build_weight_kernel(m, weight, input.data_type, stream); - } - half const *bias_ptr = - use_bias ? bias.get_half_ptr() : static_cast(nullptr); Kernels::IncMultiHeadAttention::inference_kernel( - m, - bc, - shard_id, - input.get_half_ptr(), - m->offload ? static_cast(m->weight_ptr) : weight.get_half_ptr(), - output.get_half_ptr(), - bias_ptr, - stream); + m, bc, shard_id, input.get_half_ptr(), output.get_half_ptr(), stream); } else if (input.data_type == DT_FLOAT) { - if (m->offload) { - pre_build_weight_kernel(m, weight, input.data_type, stream); - } - float const *bias_ptr = - use_bias ? bias.get_float_ptr() : static_cast(nullptr); Kernels::IncMultiHeadAttention::inference_kernel( - m, - bc, - shard_id, - input.get_float_ptr(), - m->offload ? 
static_cast(m->weight_ptr) - : weight.get_float_ptr(), - output.get_float_ptr(), - bias_ptr, - stream); + m, bc, shard_id, input.get_float_ptr(), output.get_float_ptr(), stream); } else { assert(false && "Unspported data type"); } @@ -1843,12 +1564,9 @@ void IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( BatchConfig const *bc, int shard_id, GenericTensorAccessorW const &input_grad, - GenericTensorAccessorR const &weight, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorR const &bias) { + GenericTensorAccessorR const &output_grad) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - bool use_bias = *m->qkv_bias || *m->final_bias; hipEvent_t t_start, t_end; if (m->profiling) { @@ -1857,35 +1575,23 @@ void IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( checkCUDA(hipEventRecord(t_start, stream)); } - // assert(input.data_type == weight.data_type); assert(input_grad.data_type == output_grad.data_type); - if (use_bias) { - assert(input_grad.data_type == bias.data_type); - } if (input_grad.data_type == DT_HALF) { assert(!m->offload); - half const *bias_ptr = - use_bias ? bias.get_half_ptr() : static_cast(nullptr); Kernels::IncMultiHeadAttention::peft_bwd_kernel(m, bc, shard_id, input_grad.get_half_ptr(), - weight.get_half_ptr(), output_grad.get_half_ptr(), - bias_ptr, stream); } else if (input_grad.data_type == DT_FLOAT) { assert(!m->offload); - float const *bias_ptr = - use_bias ? bias.get_float_ptr() : static_cast(nullptr); Kernels::IncMultiHeadAttention::peft_bwd_kernel(m, bc, shard_id, input_grad.get_float_ptr(), - weight.get_float_ptr(), output_grad.get_float_ptr(), - bias_ptr, stream); } else { assert(false && "Unspported data type"); @@ -1904,7 +1610,6 @@ void IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( FFHandler handler, IncMultiHeadSelfAttention const *attn, - GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, int _num_q_heads, @@ -1919,14 +1624,11 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( attn->kProjSize, attn->vProjSize, attn->oProjSize, - attn->apply_rotary_embedding, - attn->qkv_bias, + attn->rotary_embedding_meta, attn->scaling_query, attn->qk_prod_scaling, attn->position_bias, - attn->final_bias, attn->scaling_factor, - weight, gpu_mem_allocator, num_samples, attn->num_q_heads, @@ -1947,14 +1649,11 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( int _kProjSize, int _vProjSize, int _oProjSize, - bool _apply_rotary_embedding, - bool _qkv_bias, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, bool _qk_prod_scaling, bool _position_bias, - bool _final_bias, float _scaling_factor, - GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, int _global_num_q_heads, @@ -1963,7 +1662,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( int _num_kv_heads, DataType _quantization_type, bool _offload) - : OpMeta(handler, attn), weight_ptr(nullptr), bias_ptr(nullptr) { + : OpMeta(handler, attn) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(miopenSetStream(handler.dnn, stream)); @@ -1989,29 +1688,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( num_kv_heads = _num_kv_heads; hidden_size = num_q_heads * qProjSize; - weightSize = - ((qSize * qProjSize + oProjSize * (vProjSize > 0 ? 
vProjSize : vSize)) * - num_q_heads + - (kSize * kProjSize + vSize * vProjSize) * num_q_heads) * - size_of_dt; - if (quantization_type != DT_NONE) { - quantized_weightSize = get_quantization_to_byte_size( - attn->data_type, quantization_type, weightSize); - } - // biasSize = _bias ? oProjSize * size_of_dt * 4 : 0; - - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; - int final_bias_size = oProjSize; - biasSize = - (_qkv_bias ? qkv_bias_size : 0) + (final_bias ? final_bias_size : 0); - - // has_load_weights = (bool *)calloc(1, sizeof(bool)); - //*has_load_weights = false; - apply_rotary_embedding = (bool *)calloc(1, sizeof(bool)); - *apply_rotary_embedding = _apply_rotary_embedding; - qkv_bias = (bool *)calloc(1, sizeof(bool)); - *qkv_bias = _qkv_bias; + rotary_embedding_meta = + (RotaryEmbeddingMeta *)calloc(1, sizeof(RotaryEmbeddingMeta)); + *rotary_embedding_meta = _rotary_embedding_meta; scaling_query = (bool *)calloc(1, sizeof(bool)); *scaling_query = _scaling_query; scaling_factor = _scaling_factor; @@ -2019,14 +1698,6 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( *qk_prod_scaling = _qk_prod_scaling; position_bias = (bool *)calloc(1, sizeof(bool)); *position_bias = _position_bias; - final_bias = (bool *)calloc(1, sizeof(bool)); - *final_bias = _final_bias; - - // allocate weight and bias in the reserve space for cpu offloading - if (offload) { - weight_ptr = gpu_mem_allocator.allocate_reserved_untyped(weightSize); - bias_ptr = gpu_mem_allocator.allocate_reserved_untyped(biasSize); - } // allocate memory for the seqArray and reserve space { @@ -2092,9 +1763,6 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( ? key_cache_size + value_cache_size + qkv_max_proj_size : key_cache_size + value_cache_size); - if (quantization_type != DT_NONE) { - totalSharedSize += quantized_weightSize; - } assert(gpu_mem_allocator.reserved_total_size - gpu_mem_allocator.reserved_allocated_size >= totalSharedSize); @@ -2125,29 +1793,15 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( handler.batch_config_metadata->requestsInfo); if (offload) { - // token_infos = - // gpu_mem_allocator.allocate_reserved( - // tokeninfo_size); - // offset += sizeof(BatchConfig::PerTokenInfo) * tokeninfo_size; qk_prods = gpu_mem_allocator.allocate_reserved_untyped(qk_prod_size * size_of_dt); - // offset += qk_prod_size * size_of_dt; qk_prods_softmax = gpu_mem_allocator.allocate_reserved_untyped( qk_prod_size * size_of_dt); - // offset += qk_prod_size * size_of_dt; attn_heads = gpu_mem_allocator.allocate_reserved_untyped(attn_heads_size * size_of_dt); - // offset += attn_heads_size * size_of_dt; complex_input = gpu_mem_allocator.allocate_reserved(complex_size); - // offset += complex_size * sizeof(hipFloatComplex); - // request_infos = - // gpu_mem_allocator.allocate_reserved( - // requestinfo_size); } else { - // token_infos = - // gpu_mem_allocator.allocate_instance( - // tokeninfo_size); qk_prods = gpu_mem_allocator.allocate_instance_untyped(qk_prod_size * size_of_dt); qk_prods_softmax = gpu_mem_allocator.allocate_instance_untyped( @@ -2156,16 +1810,11 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( size_of_dt); complex_input = gpu_mem_allocator.allocate_instance(complex_size); - // request_infos = - // gpu_mem_allocator.allocate_instance( - // requestinfo_size); } // allocate more size for quantization data if (quantization_type != DT_NONE) { assert(offload); - quantized_weight_ptr = - 
gpu_mem_allocator.allocate_reserved(quantized_weightSize); } if (!offload) { assert(gpu_mem_allocator.reserved_total_size == @@ -2183,49 +1832,32 @@ IncMultiHeadSelfAttentionMeta::~IncMultiHeadSelfAttentionMeta(void) { } } -template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( - IncMultiHeadSelfAttentionMeta const *m, - GenericTensorAccessorR const weight, - DataType data_type, - hipStream_t stream); +template void + Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + float *output_ptr, + hipStream_t stream); -template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( - IncMultiHeadSelfAttentionMeta const *m, - GenericTensorAccessorR const weight, - DataType data_type, - hipStream_t stream); +template void + Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + half *output_ptr, + hipStream_t stream); -template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( +template void Kernels::IncMultiHeadAttention::compute_qkv_kernel( IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, int shard_id, float *output_ptr, - float const *weight_ptr, - float const *bias_ptr, - int num_tokens, hipStream_t stream); -template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( +template void Kernels::IncMultiHeadAttention::compute_qkv_kernel( IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, int shard_id, half *output_ptr, - half const *weight_ptr, - half const *bias_ptr, - int num_tokens, hipStream_t stream); -template void - Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( - IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - float *output_ptr, - hipStream_t stream); - -template void - Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( - IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - half *output_ptr, - hipStream_t stream); }; // namespace FlexFlow diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index b278611b60..2802dd41b6 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -19,6 +19,7 @@ #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" #include "flexflow/utils/cuda_helper.h" +#include namespace FlexFlow { @@ -31,1075 +32,162 @@ using Legion::Memory; namespace Kernels { namespace IncMultiHeadAttention { -// gridDim = num_heads -// blockDim = num_tokens/num_request * head_size -// QKV tensor layout: |QKV| * num_new_tokens. |Q=K=V=head_size * num_heads| -// one thread process one head_size -template -__global__ void compute_attention_kernel_generation_kernel( - DT const *query, - DT const *key_cache, - DT const *value_cache, - DT *output_ptr, - float const scale, - int max_seq_length, - int per_head_size, - int hidden_size, - BatchConfig::PerRequestInfo *request_infos) { - - // q, k - using Q_vec = typename VEC_K::Type; - using K_vec = typename VEC_K::Type; - using V_vec = typename VEC_V
::Type; - using Out_sum = typename Vec_fp32_::Type; - - constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; - - // eg. if head_size = 128, thread_per_key = 4, with float32 precision - // then K_VEC_SIZE = 1, QK_VEC_SIZE = 4 - // K_ELTS_PER_THREAD = 128 / 4 = 32 - // K_VECS_PER_THREAD = 32 / 1 = 32 - constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(DT); - // constexpr int QK_VEC_SIZE = 16 / sizeof(DT); - // // constexpr int QK_VEC_SIZE = sizeof(Qk_vec_k) / sizeof(DT); - constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY; - constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; - // constexpr int QK_ELTS_IN_16B = 16 / sizeof(DT); - - // thread id - int const tidx = threadIdx.x; - // head id - int const head_idx = blockIdx.x; - // request idx - int const request_idx = blockIdx.y; - - int const batch_config_request_id = - request_infos[request_idx].batch_config_request_id; - - int const first_step = 0; +template +__global__ void store_kv_cache(DT const *devQKVProjArray, + DT *kCache_ptr, + DT *vCache_ptr, + BatchConfig::PerTokenInfo const *tokenInfos, + int num_tokens, + int max_seq_len, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / hidden_size; + int offset = i % hidden_size; - int const tlength = - request_infos[batch_config_request_id].first_token_depth_in_request + - request_infos[batch_config_request_id].num_tokens_in_batch; + size_t val_idx = + token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; - // shared memory objects - extern __shared__ char smem_[]; + DT kVal = devQKVProjArray[val_idx]; + DT vVal = devQKVProjArray[val_idx + hidden_size]; + int const req_id = tokenInfos[token_idx].request_index; + int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - float *qk_smem = reinterpret_cast(smem_); - float *out_smem = reinterpret_cast(smem_); + // key cache + kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + offset] = kVal; + vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + offset] = vVal; + } +} - float qk_max = -FLT_MAX; +template +__global__ void store_query_cache(DT const *devQKVProjArray, + DT *qCache_ptr, + int num_tokens, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / hidden_size; + int offset = i % hidden_size; - // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum - __shared__ float red_smem[WARPS_PER_BLOCK * 2]; + size_t val_idx = token_idx * QKV_WEIGHT_NUM * hidden_size + offset; - const DT *q_ptr = query + request_idx * hidden_size * QKV_WEIGHT_NUM + - head_idx * per_head_size; - __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; - // DT const *q_ptr = - // query + request_idx * Dh * QKV_WEIGHT_NUM + head_idx * per_head_size; + DT qVal = devQKVProjArray[val_idx]; - // q tensor in this thread - // if THREADS_PER_KEY is 4, first thread load 0, 4, 8, 12..., total - // K_VECS_PER_THREAD elements - // QK_vec_k: 32->1, 64->2, 128->4... head_size - // K_vec_k: 4->1, 2->2, 1->4 threads_per_key + // query cache + qCache_ptr[i] = qVal; + } +} - // the start offset of the element eg. (0, 1, 2, 3) * K_VEC_SIZE - int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE; - int ki_o = tidx % THREADS_PER_KEY; - // the first key's offset for this thread - // ko = 0, 0, 0, 0, 1, 1, 1, 1, .... 
- int ko = tidx / THREADS_PER_KEY; - // load q tensor - Q_vec q_vec[K_VECS_PER_THREAD]; -#pragma unroll - for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { - q_vecs[ki_o][ii] = *reinterpret_cast( - q_ptr + ki + ii * THREADS_PER_KEY * K_VEC_SIZE); +template +__global__ void fill_entries_above_diagonal(DT *matrix, + size_t num_rows, + size_t num_cols, + size_t num_q_heads, + size_t entries_above_diagonal, + DT value) { + CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_q_heads) { + size_t head_idx = i / entries_above_diagonal; + size_t entry_idx = i % entries_above_diagonal; + size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; + size_t x = entry_idx - y * (y + 1) / 2; + y += (num_cols - num_rows) + 1; + matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; } - __syncthreads(); - // first iter = 128 / 4 = 32 - // K_VECS_PER_THREAD = 32 - // K_PER_ITER how many keys in this loop - // The number of timesteps loaded per iteration. - constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY; - // // The number of keys per warp. - constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; +} - DT const *k_cache_batch = - key_cache + batch_config_request_id * max_seq_length * hidden_size + ki; +template +void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + int shard_id, + cudaStream_t stream) { + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); + assert(data_type_size(m->output_type[0]) == sizeof(DT)); + cudaDataType_t compute_type = cublas_data_type; - int ti_end = - div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; - // get k, perform qk proj + int num_tokens = bc->num_active_tokens(); + int tokens_previous_requests = 0; + int q_block_size = m->qProjSize; + int kt_block_size = m->kProjSize; + int kt_req_block_size = + kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int vt_block_size = m->vProjSize; + int vt_req_block_size = + vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + assert(m->qProjSize == m->kProjSize); - for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { - K_vec k[K_VECS_PER_THREAD]; - int const ti_circ = ti % max_seq_length; -#pragma unroll - for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { - int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; - if (ti < tlength) { - k[ii] = *reinterpret_cast(k_cache_batch + - ti_circ * hidden_size + - head_idx * per_head_size + jj); - } - // Compute dot product. - // This includes a reduction across the threads in the same thread group. + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i] || + (!bc->requestsInfo[i].prompt_phase && !bc->requestsInfo[i].peft_bwd)) { + continue; } - float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); - // // todo add positional embedding to the qk production - // // Store the product to shared memory. There's one qk value per - // timestep. - // // Update the max. 
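// --- Illustrative aside (not part of this patch): the qk_max / exp_sum
// reductions in this kernel implement a numerically stable softmax over the
// per-timestep attention logits. A scalar CPU equivalent, shown only as a
// reference for what the warp/block reductions compute:
#include <cmath>
#include <vector>

void stable_softmax(std::vector<float> &logits) {
  float max_logit = -INFINITY;
  for (float v : logits) {
    max_logit = fmaxf(max_logit, v); // subtract the max so expf cannot overflow
  }
  float sum = 0.f;
  for (float &v : logits) {
    v = expf(v - max_logit);
    sum += v;
  }
  float inv_sum = 1.f / (sum + 1.e-6f); // same epsilon as the kernel's __fdividef
  for (float &v : logits) {
    v *= inv_sum;
  }
}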
- if (ti < tlength && tidx % THREADS_PER_KEY == 0) { - // todo add alobi here - bool const mask = ti_circ >= tlength; - if (mask) { - assert(false); + int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + + bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + // Copy query to m->query_activation_buffer if we need to compute + // PEFT backward + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + sizeof(DT) * max_peft_tokens * m->num_q_heads * m->qProjSize; + if (activation_size_needed > m->allocated_peft_buffer_size1) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->query_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size1 = activation_size_needed; } - qk_max = mask ? qk_max : fmaxf(qk_max, qk); - qk_smem[ti - first_step] = mask ? 0.f : qk; + int parallelism = m->hidden_size * num_tokens; + store_query_cache<<>>( + static_cast
<DT *>(m->devQKVProjArray), + static_cast<DT *>
(m->query_activation_buffer), + num_tokens, + m->hidden_size); } - } - - __syncthreads(); - -#pragma unroll - for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { - qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); - } - - // Decompose the thread index into warp and lane. - int const warp = tidx / WARP_SIZE; - int const lane = tidx % WARP_SIZE; + // Step 1: compute query-key product QK.T/sqrt(d_k) + { + // Scale by sqrt(d_k) as per the original attention paper + DT alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = static_cast
(1.0f / sqrt(m->kProjSize)); + } + // after transpositions + int m_ = num_new_tokens; + int n = total_tokens; + int k = m->qProjSize; + // before transpositions + int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, + ldc = m_; + // N.B. strides are applied before transpose operations + int strideA = q_block_size; + int strideB = kt_block_size; + int strideC = num_new_tokens * total_tokens; - // The warp leader writes the max to shared memory. - if (lane == 0) { - red_smem[warp] = qk_max; - } - - // Make sure the products are in shared memory. - __syncthreads(); - - // The warps finalize the reduction. - qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; -#pragma unroll - for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { - qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); - } - - // Broadcast to all the threads in the warp. - qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); - - float exp_sum = 0.f; - for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { - float logit = __expf(qk_smem[ti - first_step] - qk_max); - exp_sum += logit; - qk_smem[ti - first_step] = logit; - } - - // Compute the sum. - exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); - - // softmax - float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); - for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { - qk_smem[ti - first_step] *= inv_sum; - } - - __syncthreads(); - // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { - // printf("softmax %.10f\n", qk_smem[0]); - // } - - // value projection - constexpr int V_VEC_SIZE = 16 / sizeof(DT); - // A vector of V elements for the current timestep. - // using V_vec_k = typename V_vec_k_::Type; - // using V_vec_acum = typename V_vec_acum_fp32_::Type; - - // The value computed by this thread. - int vo = tidx / THREADS_PER_VALUE; - // The hidden dimensions computed by this particular thread. - int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; - constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; - - Out_sum out; - zero(out); - - // The base pointer for the value in the cache buffer. - DT const *v_cache_batch = - value_cache + batch_config_request_id * max_seq_length * hidden_size + vi; - - if (Dh == Dh_MAX || vi < Dh) { - for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) { - // Load the values from the cache. - int const ti_circ = ti % max_seq_length; - - V_vec v = *reinterpret_cast( - v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); - float logit = qk_smem[ti - first_step]; - out = FlexFlow::fma(logit, cast_to_float(v), out); - } - } - - // // Make sure we can start writing to shared memory. - __syncthreads(); - - // Run the final reduction amongst the different groups computing different - // partial outputs. - if (Dh == Dh_MAX || vi < Dh) { -#pragma unroll - for (int active_groups = V_PER_ITER; active_groups >= 2; - active_groups /= 2) { - - // The midpoint in the number of active groups. - int midpoint = active_groups / 2; - - // The upper part of active threads store to shared memory. - if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) { - *reinterpret_cast(out_smem + (vo - midpoint) * Dh + vi) = - out; - } - __syncthreads(); - - // The bottom warps update their values. - if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) { - out = add(*reinterpret_cast(out_smem + vo * Dh + vi), - out); - } - __syncthreads(); - } - } - - // Output the final values. 
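// --- Illustrative aside (not part of this patch): the strided-batched GEMM and
// causal masking set up in compute_attention_kernel_prompt compute, per head,
// scores[i][j] = (q_i . k_j) / sqrt(d_k) with future key positions masked out.
// A scalar reference under that assumption (names are illustrative only):
#include <cmath>
#include <cstddef>
#include <vector>

std::vector<std::vector<float>> scaled_causal_qk_scores(
    std::vector<std::vector<float>> const &Q, // [num_new_tokens][d_k]
    std::vector<std::vector<float>> const &K, // [total_tokens][d_k]
    std::size_t prompt_offset) {              // total_tokens - num_new_tokens
  std::size_t d_k = Q.empty() ? 0 : Q[0].size();
  std::vector<std::vector<float>> S(Q.size(),
                                    std::vector<float>(K.size(), 0.f));
  for (std::size_t i = 0; i < Q.size(); i++) {
    for (std::size_t j = 0; j < K.size(); j++) {
      if (j > i + prompt_offset) {
        S[i][j] = -INFINITY; // causal mask: token i cannot attend future keys
        continue;
      }
      float dot = 0.f;
      for (std::size_t d = 0; d < d_k; d++) {
        dot += Q[i][d] * K[j][d];
      }
      S[i][j] = dot / sqrtf((float)d_k);
    }
  }
  return S;
}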
- if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { - convert_from_float( - *reinterpret_cast(output_ptr + request_idx * hidden_size + - head_idx * per_head_size + vi), - out); - } -} - -// only used by MPT model. https://arxiv.org/abs/2108.12409 -template -__global__ void apply_position_bias_qkprd(DT *input_ptr, - int num_tokens, - int num_total_tokens, - int num_heads, - int global_num_q_heads, - int shard_id) { - CUDA_KERNEL_LOOP(i, num_tokens * num_total_tokens * num_heads) { - // get head_idx, - int head_idx = i / (num_tokens * num_total_tokens) + (num_heads * shard_id); - int position_idx = (i / num_tokens) % num_total_tokens; - position_idx = position_idx + 1 - num_total_tokens; - // 8 is alibi_bias_max in - // https://huggingface.co/mosaicml/mpt-30b/blob/main/config.json - float base = (float)(head_idx + 1) * 8 / global_num_q_heads; - float slopes = 1.0 / pow(2, base); - // if(i == 0){ - // printf("see position: %d, %f, %f, %f\n", position_idx, base, slopes, - // position_idx * slopes); - // } - input_ptr[i] += static_cast
(position_idx * slopes); - } -} - -template -__global__ void apply_proj_bias_w(DT *input_ptr, - DT const *bias_ptr, - int num_tokens, - int qkv_weight_size, - int oProjSize) { - CUDA_KERNEL_LOOP(i, num_tokens * oProjSize) { - int bias_idx = qkv_weight_size + i % oProjSize; - input_ptr[i] += bias_ptr[bias_idx]; - } -} - -template -__global__ void apply_proj_bias_qkv(DT *input_ptr, - DT const *bias_ptr, - int shard_id, - int num_tokens, - int qProjSize, - int kProjSize, - int vProjSize, - int global_num_q_heads, - int num_q_heads, - bool scaling_query, - float scaling_factor, - int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size * QKV_WEIGHT_NUM) { - // for simplicity, assume q, k, v is in same shape - // 0->q, 1->k, 2->v - // int qkv_index = i / (num_tokens * qProjSize) % 3; - - int token_idx = i / (hidden_size * QKV_WEIGHT_NUM); - size_t in_token_idx = i - token_idx * hidden_size * QKV_WEIGHT_NUM; - - int qkv_index = in_token_idx / hidden_size; - - int proj_size = qkv_index == 0 ? qProjSize : kProjSize; - - int head_idx = - (in_token_idx - qkv_index * num_q_heads * proj_size) / proj_size; - int global_head_idx = head_idx + shard_id * num_q_heads; - - size_t pre_length = - qkv_index == 0 - ? 0 - : (qkv_index == 1 ? qProjSize * global_num_q_heads - : qProjSize * global_num_q_heads * KV_WEIGHT_NUM); - - size_t bias_idx = pre_length + global_head_idx * proj_size + i % proj_size; - - input_ptr[i] += bias_ptr[bias_idx]; - - if (scaling_query && qkv_index == 0) { - input_ptr[i] *= scaling_factor; - } - } -} - -template -__global__ void scaling_query_kernel(DT *input_ptr, - int qProjSize, - int num_tokens, - int num_q_heads, - float scaling_factor, - int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { - int token_idx = i / hidden_size; - input_ptr[i % hidden_size + token_idx * hidden_size * QKV_WEIGHT_NUM] *= - scaling_factor; - } -} - -template -__global__ void - apply_rotary_embedding_native(DT *input_ptr, - cuFloatComplex *complex_input, - BatchConfig::PerTokenInfo const *tokenInfos, - int qProjSize, - int kProjSize, - int num_q_heads, - int num_tokens, - int num_kv_heads, - int q_block_size, - int k_block_size, - int q_array_size) { - CUDA_KERNEL_LOOP( - i, - num_tokens * (qProjSize * num_q_heads + kProjSize * num_kv_heads) / 2) { - // create complex number - bool q_tensor = i < (q_array_size / 2); - int proj_size = q_tensor ? qProjSize : kProjSize; - int real_i = q_tensor ? i : i - q_array_size / 2; - - int head_idx = real_i / (num_tokens * proj_size / 2); - int idx = real_i % (num_tokens * proj_size / 2); - int real_part_index = idx * 2 + - head_idx * (q_tensor ? q_block_size : k_block_size) + - (q_tensor ? 
0 : q_array_size); - - int complex_part_index = real_part_index + 1; - - complex_input[i] = {input_ptr[real_part_index], - input_ptr[complex_part_index]}; - - int token_idx = - (real_i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2); - size_t pos = tokenInfos[token_idx].abs_depth_in_request; - - // float before_real = complex_input[i].x, before_complex = - // complex_input[i].y; - - int pos_i = real_i % (proj_size / 2); - float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size)); - cuFloatComplex complex_pos = {cos(freq), sin(freq)}; - - complex_input[i] = cuCmulf(complex_input[i], complex_pos); - input_ptr[real_part_index] = complex_input[i].x; - input_ptr[complex_part_index] = complex_input[i].y; - } -} - -template -__global__ void - apply_rotary_embedding_hf(DT *input_ptr, - cuFloatComplex *complex_input, - BatchConfig::PerTokenInfo const *tokenInfos, - int qProjSize, - int kProjSize, - int num_tokens, - size_t q_array_size, - int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { - // create complex number - bool q_tensor = i < (q_array_size / 2); - int proj_size = q_tensor ? qProjSize : kProjSize; - int real_i = q_tensor ? i : i - q_array_size / 2; - - int token_idx = real_i / (hidden_size / 2); - int idx = real_i % (proj_size / 2); - int head_idx = (real_i - (token_idx * (hidden_size / 2))) / (proj_size / 2); - - int real_part_index = idx + head_idx * proj_size + - token_idx * hidden_size * QKV_WEIGHT_NUM + - hidden_size * (q_tensor ? 0 : 1); - int complex_part_index = real_part_index + (proj_size / 2); - - complex_input[i] = {input_ptr[real_part_index], - input_ptr[complex_part_index]}; - - // get the freq_cis: shape 1 * (qProjSize/2) = 1 * 64 - // apply a Cartesian coordinate transformation - // multiple with input & /copy back to q/k - - // get position of token - - // size_t pos = id_map[token_idx].token_position; - size_t pos = tokenInfos[token_idx].abs_depth_in_request; - - // float before_real = complex_input[i].x, before_complex = - int pos_i = real_i % (proj_size / 2); - float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size)); - cuFloatComplex complex_pos = {cos(freq), sin(freq)}; - - complex_input[i] = cuCmulf(complex_input[i], complex_pos); - input_ptr[real_part_index] = complex_input[i].x; - input_ptr[complex_part_index] = complex_input[i].y; - } -} - -template -__global__ void - apply_rotary_embedding_bwd(DT *input_ptr, - cuFloatComplex *complex_input, - BatchConfig::PerTokenInfo const *tokenInfos, - int proj_size, - int num_tokens, - int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { - // compute indexes to visit first half proj_size of each of q/k tensor. - // devQKVProj has shape [num_tokens, qProjSize, num_heads, 3] in peft_bwd - bool q_tensor = i < (num_tokens * hidden_size / 2); - int real_i = q_tensor ? i : i - num_tokens * hidden_size / 2; - assert(hidden_size % proj_size == 0); - int num_heads = hidden_size / proj_size; - - int token_idx = real_i % num_tokens; - int idx = (real_i / num_tokens) % (proj_size / 2); - int head_idx = real_i / (num_tokens * proj_size / 2); - assert(head_idx < num_heads); - - int complex_part_index = (q_tensor ? 
0 : 1) * num_tokens * hidden_size + - head_idx * num_tokens * proj_size + - idx * num_tokens + token_idx; - int real_part_index = complex_part_index + (proj_size / 2) * num_tokens; - - complex_input[i] = {input_ptr[real_part_index], - input_ptr[complex_part_index]}; - - size_t pos = tokenInfos[token_idx].abs_depth_in_request; - - float freq = pos * (1.0 / pow(10000.0, (float)2 * idx / proj_size)); - cuFloatComplex complex_pos = {cos(freq), sin(freq)}; - - complex_input[i] = cuCmulf(complex_input[i], complex_pos); - input_ptr[real_part_index] = complex_input[i].x; - input_ptr[complex_part_index] = complex_input[i].y; - } -} - -template -__global__ void fill_entries_above_diagonal(DT *matrix, - size_t num_rows, - size_t num_cols, - size_t num_q_heads, - size_t entries_above_diagonal, - DT value) { - CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_q_heads) { - size_t head_idx = i / entries_above_diagonal; - size_t entry_idx = i % entries_above_diagonal; - size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; - size_t x = entry_idx - y * (y + 1) / 2; - y += (num_cols - num_rows) + 1; - matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; - } -} - -template -void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - DT const *input_ptr, - DT const *weight_ptr, - DT *output_ptr, - DT const *bias_ptr, - cudaStream_t stream) { - - checkCUDA(cublasSetStream(m->handle.blas, stream)); - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - assert(m->qSize == m->vSize && m->qSize == m->kSize); - cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); - cudaDataType_t compute_type = cublas_data_type; - // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - // cudaDataType_t compute_type = cublas_data_type; - // #else - // // For best performance, set the default cublas compute type to - // // CUBLAS_COMPUTE_16F for half precision and to - // // CUBLAS_COMPUTE_32F_FAST_16F for full precision - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - // if (m->output_type[0] == DT_FLOAT) { - // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - // } - // #endif - - // Step 1: Compute QKV projections - { - DT alpha = 1.0f, beta = 0.0f; - // after transpositions - int m_q = m->qProjSize * m->num_q_heads; - int m_k = m->kProjSize * m->num_q_heads; - int m_v = m->vProjSize * m->num_q_heads; - assert(m_q == m_k && m_k == m_v); // keep things simple for now - int n = bc->num_active_infr_tokens(); - int k = m->qSize; - int m_ = m_q * QKV_WEIGHT_NUM; - // before transpositions - int lda = k, ldb = k, ldc = m_; - // matrix A: QKV weights - // matrix A's layout: [qSize (hidden_dim), qProjSize, num_heads, 3] - // matrix B: input - // matrix B's layout: [qSize (hidden_dim), num_new_tokens] - // matrix C: devQKVProjArray - // matrix B's layout: [qProjSize, num_heads, 3, num_new_tokens] - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - weight_ptr, - cublas_data_type, - lda, - input_ptr, - cublas_data_type, - ldb, - &beta, - output_ptr, - cublas_data_type, - ldc, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - } - - int num_tokens = bc->num_active_tokens(); - int parallelism = m->kProjSize * num_tokens * m->num_q_heads; - size_t q_array_size = m->qProjSize * num_tokens * m->num_q_heads; - - // Step 2: apply bias for QKV, or scale the query - if (*m->qkv_bias) { - apply_proj_bias_qkv<<>>(output_ptr, - bias_ptr, - shard_id, - num_tokens, - m->qProjSize, - m->kProjSize, - m->vProjSize, - 
m->global_num_q_heads, - m->num_q_heads, - *m->scaling_query, - m->scaling_factor, - m->hidden_size); - } else if (m->scaling_query) { - scaling_query_kernel<<>>(output_ptr, - num_tokens, - m->num_q_heads, - m->qProjSize, - m->scaling_factor, - m->hidden_size); - } - - // Step 3: apply rotary embedding if needed - if (*m->apply_rotary_embedding) { - /*q&k*/ - parallelism = num_tokens * m->hidden_size; - apply_rotary_embedding_hf<<>>(output_ptr, - m->complex_input, - m->token_infos, - m->qProjSize, - m->kProjSize, - num_tokens, - q_array_size, - m->hidden_size); - } -} - -template -void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - cudaStream_t stream) { - int num_tokens = bc->num_active_infr_tokens(); - if (num_tokens > 0) { - int parallelism = m->hidden_size * num_tokens; - store_kv_cache<<>>(static_cast
<DT *>(m->devQKVProjArray),
-                               static_cast<DT *>(m->keyCache),
-                               static_cast<DT *>
(m->valueCache), - m->token_infos, - num_tokens, - BatchConfig::max_sequence_length(), - m->hidden_size); - } -} - -template -void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - DT *output_ptr, - DT const *weight_ptr, - DT const *bias_ptr, - int num_tokens, - cudaStream_t stream) { - cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); - cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); - assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if CUDA_VERSION >= 11000 - // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; -#else - cudaDataType_t compute_type = cublas_data_type; -#endif - // Project to output, save result directly on output tensor - { - DT alpha = 1.0f, beta = 0.0f; - // after transpositions - int m_ = m->oProjSize; - int k = m->vProjSize * m->num_q_heads; - int n = num_tokens; - // before transpositions - int lda = k, ldb = k, ldc = m_; - // matrix A: output projection weight - // matrix A's layout: [vProjSize * num_heads, oProjSize] - DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_q_heads + - m->vProjSize * m->num_q_heads); - // matrix B: attn heads - // matrix B's layout: [vProjSize * num_heads, num_new_tokens] - DT const *B = static_cast
<DT *>(m->attn_heads);
-    // matrix C: output
-    // matrix C's layout: [oProjSize, num_new_tokens]
-    DT *C = static_cast<DT *>
(output_ptr); - - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - B, - cublas_data_type, - ldb, - &beta, - C, - cublas_data_type, - ldc, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - } - // Add final output bias - if (*m->final_bias && shard_id == 0) { - int parallelism = m->oProjSize * num_tokens; - int qkv_weight_size = m->qProjSize * m->global_num_q_heads + - m->kProjSize * m->global_num_q_heads + - m->vProjSize * m->global_num_q_heads; - apply_proj_bias_w<<>>( - output_ptr, bias_ptr, num_tokens, qkv_weight_size, m->oProjSize); - } -} - -#define LAUNCH_ATTENTION_SCORE_KERNEL( \ - DT, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE, THDS_PER_BLOCK, stream) \ - smem_sz = smem_size_in_bytes
(m->qProjSize, \ - BatchConfig::max_sequence_length(), \ - THREADS_PER_VALUE, \ - THDS_PER_BLOCK); \ - compute_attention_kernel_generation_kernel \ - <<>>( \ - static_cast
<DT *>(m->devQKVProjArray),                              \
-          static_cast<DT *>(m->keyCache),                                     \
-          static_cast<DT *>
(m->valueCache), \ - output_ptr, \ - scale, \ - BatchConfig::max_sequence_length(), \ - m->qProjSize, \ - m->hidden_size, \ - m->request_infos) - -template -void compute_attention_kernel_generation(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - DT *output_ptr, - cudaStream_t stream) { - dim3 grid(m->num_q_heads, bc->num_generation_tokens); - int const per_head_size = m->qProjSize; - float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; - size_t smem_sz; - if (per_head_size == 64) { - constexpr int THREADS_PER_VALUE_64 = threads_per_value_t::value; - LAUNCH_ATTENTION_SCORE_KERNEL( - DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream); - } else if (per_head_size == 128) { - constexpr int THREADS_PER_VALUE_128 = threads_per_value_t::value; - LAUNCH_ATTENTION_SCORE_KERNEL( - DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream); - } else { - assert(false && "a unsupported head size"); - } -} - -template -void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, - GenericTensorAccessorR const weight, - DataType data_type, - cudaStream_t stream) { - // additional processing for weight uploading - // Note that we update weight_ptr and bias_ptr when uploading weight and - // bias - if (m->quantization_type != DT_NONE) { - // copy weight_ptr to quantized_weight_ptr, do compression and store in - // m->weight_ptr - cudaMemcpyAsync(m->quantized_weight_ptr, - weight.get_byte_ptr(), - m->quantized_weightSize, - cudaMemcpyHostToDevice, - stream); - - if (m->quantization_type == DT_INT4) { - int parallelism = m->qProjSize * m->qSize * m->num_q_heads / 2; - decompress_int4_attention_weights<<>>( - m->quantized_weight_ptr, - static_cast
(m->weight_ptr), - m->qProjSize, - m->qSize, - m->num_q_heads); - } else { - assert(m->quantization_type == DT_INT8); - int parallelism = m->qProjSize * m->qSize * m->num_q_heads; - decompress_int8_attention_weights<<>>( - m->quantized_weight_ptr, - static_cast
(m->weight_ptr), - m->qProjSize, - m->qSize, - m->num_q_heads); - } - } else { - if (data_type == DT_FLOAT) { - cudaMemcpyAsync(m->weight_ptr, - weight.get_float_ptr(), - m->weightSize, - cudaMemcpyHostToDevice, - stream); - } else if (data_type == DT_HALF) { - cudaMemcpyAsync(m->weight_ptr, - weight.get_half_ptr(), - m->weightSize, - cudaMemcpyHostToDevice, - stream); - } else { - assert(false); - } - } -} - -template -void inference_kernel(IncMultiHeadSelfAttentionMeta *m, - BatchConfig const *bc, - int shard_id, - DT const *input_ptr, - DT const *weight_ptr, - DT *output_ptr, - DT const *bias_ptr, - cudaStream_t stream) { - - if (m->offload && m->biasSize > 0) { - cudaMemcpyAsync( - m->bias_ptr, bias_ptr, m->biasSize, cudaMemcpyHostToDevice, stream); - bias_ptr = static_cast
(m->bias_ptr); - } - - // phase 1: Implement kernel to compute KQV for input tokens - compute_qkv_kernel(m, - bc, - shard_id, - input_ptr, - weight_ptr, - static_cast
<DT *>(m->devQKVProjArray),
-                     bias_ptr,
-                     stream);
-  update_kv_cache_kernel<DT>(m, bc, stream);
-
-  if (bc->num_generation_tokens > 0) {
-    // phase 3: Compute attention score for generation tokens
-    compute_attention_kernel_generation<DT>(
-        m, bc, static_cast<DT *>
(m->attn_heads), stream); - } - - if (bc->num_tokens > bc->num_generation_tokens) { - // phase 4: Compute attention score for prompt tokens; - compute_attention_kernel_prompt( - m, bc, shard_id, bias_ptr, weight_ptr, stream); - } - - // compute output production and bias together for all tokens - int num_tokens = bc->num_active_tokens(); - compute_o_prod_bias( - m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); -} - -std::string get_peft_dbg_folder(IncMultiHeadSelfAttentionMeta const *m, - int shard_id) { - std::string op_name_without_uid = - IncMultiHeadSelfAttention::get_op_name_without_uid(m); - fs::path dst_filepath = get_dst_folder("bwd", m->bwd_step, shard_id); - if (m->layer_guid.model_id > 0) { - assert(false && "Model ID > 0 not supported yet"); - } - std::string layername = "layers." + - std::to_string(m->layer_guid.transformer_layer_id) + - "." + op_name_without_uid; - dst_filepath /= layername; - return dst_filepath.string(); -} - -template -void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - DT *input_grad_ptr, - DT const *weight_ptr, - DT const *output_grad_ptr, - DT const *bias_ptr, - cudaStream_t stream) { - assert(!m->offload); - checkCUDA(cublasSetStream(m->handle.blas, stream)); - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); - cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); - assert(data_type_size(m->output_type[0]) == sizeof(DT)); - cudaDataType_t compute_type = cublas_data_type; - // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - // cudaDataType_t compute_type = cublas_data_type; - // #else - // // For best performance, set the default cublas compute type to - // // CUBLAS_COMPUTE_16F for half precision and to - // // CUBLAS_COMPUTE_32F_FAST_16F for full precision - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - // if (m->output_type[0] == DT_FLOAT) { - // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - // } - // #endif - - for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } - if (!bc->requestsInfo[i].peft_bwd) { - continue; - } - int num_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int num_total_tokens = bc->requestsInfo[i].first_token_depth_in_request + - bc->requestsInfo[i].num_tokens_in_batch; - // Currently assume we are calculating gradients for all tokens - // of a request - assert(num_tokens == num_total_tokens); - int kt_block_size = m->kProjSize; - int kt_req_block_size = - kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); - int vt_block_size = m->vProjSize; - int vt_req_block_size = - vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); - assert(m->qProjSize == m->kProjSize && m->kProjSize == m->vProjSize); - // Step 1: compute gradients before final projection - { - int m_ = m->vProjSize * m->num_q_heads; - int n_ = num_tokens; - int k_ = m->oProjSize; - int lda = m_; - int ldb = k_; - int ldc = m_; - float alpha = 1.0f, beta = 0.0f; - // matrix A: output projection weight - // matrix A's layout: [vProjSize * num_heads, oProjSize] - DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_q_heads + - m->vProjSize * m->num_q_heads); - // matrix B: output gradients - // matrix B's layout: [oProjSize, num_new_tokens] - DT const *B = - output_grad_ptr + - bc->requestsInfo[i].first_token_offset_in_batch * m->oProjSize; - // matrix 
C: attn_heads gradients - // matrix C's layout: [vProjSize * num_heads, num_new_tokens] - DT *C = static_cast
(m->handle.workSpace); - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_N, - m_, - n_, - k_, - &alpha, - A, - cublas_data_type, - lda, - B, - cublas_data_type, - ldb, - &beta, - C, - cublas_data_type, - ldc, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - if (m->inference_debugging) { - // save result to file for checking - std::string filename = - get_peft_dbg_folder(m, shard_id) + ".o_proj.input_gradient_0"; - save_tensor(C, m_ * n_, filename.c_str()); - } - } - // Step 2: compute gradients w.r.t. value - { - float alpha = 1.0f, beta = 0.0f; - // matrix A: qk_prods_softmax - // matrix A's layout: [num_new_tokens, total_tokens, num_heads] - DT const *A = static_cast
<DT *>(m->qk_prods_softmax);
-      // matrix B: attn_heads gradients
-      // matrix B's layout: [vProjSize * num_heads, num_new_tokens]
-      DT const *B = static_cast<DT *>(m->handle.workSpace);
-      // matrix C: gradients for value (saved as part of m->devQKVProjArray)
-      // matrix C's layout: [num_tokens, qProjsize * num_heads, 3]
-      DT *C = static_cast<DT *>
(m->devQKVProjArray) + - 2 * num_tokens * - (m->qProjSize * m->num_q_heads); // skip over regions reserved - // for Q and K gradients - // after transpositions - int m_ = num_tokens; // total_tokens - int n_ = m->vProjSize; // num_new_tokens - int k_ = num_tokens; // num_new_tokens - // before transpositions - int lda = num_tokens; // num_new_tokens - int ldb = m->vProjSize * m->num_q_heads; - int ldc = num_tokens; // total_tokens - // N.B. strides are applied before transpose operations - int strideA = num_tokens * num_tokens; // num_new_tokens * total_tokens - int strideB = m->vProjSize; - int strideC = num_tokens * m->vProjSize; - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_T, - m_, - n_, - k_, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // save result to file for checking - if (m->inference_debugging) { - std::string filename = - get_peft_dbg_folder(m, shard_id) + ".v_proj.input_gradient_0"; - save_tensor(C, m_ * n_ * m->num_q_heads, filename.c_str()); - std::string filename2 = - get_peft_dbg_folder(m, shard_id) + ".qk_prods.softmax"; - save_tensor(A, m_ * k_ * m->num_q_heads, filename2.c_str()); - } - } - // Step 3: compute gradients w.r.t. the qk_prods_softmax tensor - { - float alpha = 1.0f, beta = 0.0f; - // matrix A: attn_heads gradients - // matrix A's layout: [vProjSize * num_heads, num_new_tokens] - DT const *A = static_cast
<DT *>(m->handle.workSpace);
-      // matrix B: value cache
-      // matrix B's layout: [vProjSize * num_heads, max_num_tokens, num_req]
-      DT const *B = static_cast<DT *>
(m->valueCache) + i * vt_req_block_size; - // matrix C: qk_prods_softmax gradients + // matrix A: devQKVProjArray + // matrix A's layout: [qProjSize, num_heads, 3, num_new_tokens] + // To get query projection, skip over Q entries from previous requests + DT const *A = static_cast
<DT *>(m->devQKVProjArray) +
+                    bc->requestsInfo[i].first_token_offset_in_batch *
+                        m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM;
+      // matrix B: key cache
+      // matrix B's layout: [kProjSize * num_heads, total_tokens]
+      // To get B, skip over K entries from previous requests (all heads +
+      // padding)
+      DT const *B = static_cast<DT *>(m->keyCache) + i * kt_req_block_size;
+      // matrix C: qk_prods
       // matrix C's layout: [num_new_tokens, total_tokens, num_heads]
-      DT *C = static_cast<DT *>
(m->qk_prods_softmax); - // after transposition & striding - int m_ = num_tokens; // num_new_tokens - int n_ = num_tokens; - int k_ = m->vProjSize; - // before transposition and striding - int lda = m->vProjSize * m->num_q_heads; - int ldb = m->vProjSize * m->num_q_heads; - int ldc = num_tokens; // num_new_tokens - int strideA = m->vProjSize; - int strideB = m->vProjSize; - int strideC = num_tokens * num_tokens; // num_new_tokens * total_tokens - + // To get C, skip over QK.T products from previous requests + DT *C = static_cast
(m->qk_prods); checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, m_, - n_, - k_, + n, + k, &alpha, A, cublas_data_type, @@ -1117,23 +205,57 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - if (m->inference_debugging) { - std::string filename = - get_peft_dbg_folder(m, shard_id) + ".qk_prods.softmax_grad"; - save_tensor( - C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); - std::string filename2 = get_peft_dbg_folder(m, shard_id) + ".vcache"; - save_tensor( - B, m->vProjSize * m->num_q_heads * num_tokens, filename2.c_str()); - } } - // Step 4: softmax backpropagation + // Step 2: Add alibi position bias to qk production + // matrix C: qk_prods + // matrix C's layout: [num_new_tokens, total_tokens, num_heads] + // To get C, skip over QK.T products from previous requests + DT *C = static_cast
(m->qk_prods); + if (*m->position_bias) { + size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; + apply_position_bias_qkprd<<>>(C, + num_new_tokens, + total_tokens, + m->num_q_heads, + m->global_num_q_heads, + shard_id); + } + + // Step 3: Apply causal mask. Fill all elements above diagonal in qk prods + // with -inf to force causal attention. + assert(num_new_tokens <= total_tokens); + size_t entries_above_diagonal = num_new_tokens * (num_new_tokens - 1) / 2; + if (entries_above_diagonal > 0) { + size_t parallelism = m->num_q_heads * entries_above_diagonal; + fill_entries_above_diagonal<<>>(C, + num_new_tokens, + total_tokens, + m->num_q_heads, + entries_above_diagonal, + static_cast
(-INFINITY)); + } + + // Step 4: Compute Softmax(QK.T/sqrt(d_k)) { - float alpha = 1.0f, beta = 0.0f; + // Before modifying the parameters below, make sure to read the following + // description of the CUDNN_TENSOR_NCHW tensor layout, from + // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: + // This tensor format specifies that the data is laid out in the following + // order: batch size, feature maps, rows, columns. The strides are + // implicitly defined in such a way that the data are contiguous in memory + // with no padding between images, feature maps, rows, and columns; the + // columns are the inner dimension and the images are the outermost + // dimension. int n_param = m->num_q_heads; - int c_param = num_tokens; + int c_param = total_tokens; int h_param = 1; - int w_param = num_tokens; + int w_param = num_new_tokens; checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, CUDNN_TENSOR_NCHW, cudnn_data_type, @@ -1141,85 +263,79 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, c_param, h_param, w_param)); - checkCUDNN(cudnnSoftmaxBackward(m->handle.dnn, - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &alpha, - m->qk_tensor, - m->softmax_activation_buffer, - m->qk_tensor, - m->qk_prods_softmax, - &beta, - m->qk_tensor, - m->qk_prods)); - - if (m->inference_debugging) { - DT *C = static_cast
(m->qk_prods); - std::string filename = - get_peft_dbg_folder(m, shard_id) + ".qk_prods.softmax_grad_in"; - save_tensor( - C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); - } - - // TODO: fill all elements above diagonal to force causal attention - size_t entries_above_diagonal = num_tokens * (num_tokens - 1) / 2; - if (entries_above_diagonal > 0) { - size_t parallelism = m->num_q_heads * entries_above_diagonal; - fill_entries_above_diagonal<<>>(static_cast
<DT *>(m->qk_prods),
-                                     num_tokens,
-                                     num_tokens,
-                                     m->num_q_heads,
-                                     entries_above_diagonal,
-                                     DT(0.0f));
-    }
-    if (m->inference_debugging) {
-      DT *C = static_cast<DT *>
(m->qk_prods); - std::string filename = get_peft_dbg_folder(m, shard_id) + - ".qk_prods.softmax_grad_in.masked"; - save_tensor( - C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + float softmax_alpha = 1.0f, softmax_beta = 0.0f; + DT *C_softmax = static_cast
(m->qk_prods_softmax); + // The softmax operation below is executed according to the + // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The + // softmax operation is computed per spatial location (H,W) per image (N) + // across dimension C. + checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &softmax_alpha, + m->qk_tensor, + C, + &softmax_beta, + m->qk_tensor, + C_softmax)); + } + // Copy C_softmax to m->softmax_activation_buffer if we need to compute + // PEFT backward + if (bc->requestsInfo[i].peft_bwd) { + DT *C_softmax = static_cast
(m->qk_prods_softmax); + size_t activation_size_needed = + sizeof(DT) * max_peft_tokens * max_peft_tokens * m->num_q_heads; + if (activation_size_needed > m->allocated_peft_buffer_size2) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->softmax_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size2 = activation_size_needed; } + checkCUDA(cudaMemcpyAsync(m->softmax_activation_buffer, + C_softmax, + sizeof(DT) * total_tokens * num_new_tokens * + m->num_q_heads, + cudaMemcpyDeviceToDevice, + stream)); } - // Step 5: compute gradients w.r.t. key + // Step 5: Matmul softmax(QK.T/sqrt(d_k)) by V. Implemented as V @ + // softmax(QK.T/sqrt(d_k)).T { - float alpha = 1.0f, beta = 0.0f; - if (*m->qk_prod_scaling) { - alpha = 1.0f / sqrt(m->kProjSize); - } - // matrix A: gradients w.r.t. qk_prods - // matrix A's layout: [num_new_tokens, num_tokens, num_heads] - DT const *A = static_cast
<DT *>(m->qk_prods);
-      // matrix B: query activation (in query_activation_buffer)
-      // matrix B's layout: [m->qProjSize * num_heads, num_new_tokens]
-      DT const *B = static_cast<DT *>(m->query_activation_buffer);
-      // matrix C: gradients for key (saved as part of m->devQKVProjArray)
-      // matrix C's layout: [num_tokens, qProjsize * num_heads, 3]
-      DT *C =
-          static_cast<DT *>
(m->devQKVProjArray) + - num_tokens * - (m->qProjSize * - m->num_q_heads); // skip over regions reserved for Q gradients - // after transposition & striding - int m_ = num_tokens; - int n_ = m->kProjSize; - int k_ = num_tokens; // num_new_tokens - // before transposition and striding - int lda = num_tokens; // num_new_tokens - int ldb = m->kProjSize * m->num_q_heads; - int ldc = num_tokens; - int strideA = num_tokens * num_tokens; - int strideB = m->kProjSize; - int strideC = num_tokens * m->kProjSize; + DT alpha = 1.0f, beta = 0.0f; + // after transpositions + int m_ = m->vProjSize; + int n = num_new_tokens; + int k = total_tokens; + // before transpositions + int lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; + // N.B. strides are applied before transpose operations + int strideA = vt_block_size; + int strideB = num_new_tokens * total_tokens; + int strideC = m->vProjSize; + // matrix A: value cache + // matrix A's layout: [vProjSize, num_heads, total_tokens] + // To get A, skip over V.T entries from previous requests (all heads + + // padding) + DT *A = static_cast
<DT *>(m->valueCache) + i * vt_req_block_size;
+      // matrix B: qk_prods_softmax
+      // matrix B's layout: [num_new_tokens, total_tokens, num_heads]
+      // To get B, skip over softmax(QK.T/sqrt(d_k)) entries from previous
+      // requests (all heads)
+      DT *B = static_cast<DT *>(m->qk_prods_softmax);
+      // matrix C: attn heads
+      // matrix C's layout: [vProjSize, num_heads, num_new_tokens]
+      // To get C, skip over softmax(QK.T/sqrt(d_k))V products from previous
+      // requests
+      // store the result attn heads, also skip the generation tokens
+      DT *C = static_cast<DT *>
(m->attn_heads) + + (bc->requestsInfo[i].first_token_offset_in_batch) * + m->num_q_heads * m->vProjSize; checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, + CUBLAS_OP_N, CUBLAS_OP_T, m_, - n_, - k_, + n, + k, &alpha, A, cublas_data_type, @@ -1237,323 +353,797 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - if (m->inference_debugging) { - std::string filename = - get_peft_dbg_folder(m, shard_id) + ".query_activation"; - save_tensor( - B, m->qProjSize * m->num_q_heads * num_tokens, filename.c_str()); - std::string filename2 = - get_peft_dbg_folder(m, shard_id) + ".devkproj_pre"; - save_tensor( - C, num_tokens * (m->qProjSize * m->num_q_heads), filename2.c_str()); + } + tokens_previous_requests += num_new_tokens; + } + if (tokens_previous_requests != (num_tokens - bc->num_generation_tokens)) { + bc->print(); + printf("tokens_previous_requests: %i\n", tokens_previous_requests); + printf("num_tokens: %i\n", num_tokens); + printf("bc->num_generation_tokens: %i\n", bc->num_generation_tokens); + } + assert(tokens_previous_requests == (num_tokens - bc->num_generation_tokens)); +} + +// gridDim = num_heads +// blockDim = num_tokens/num_request * head_size +// QKV tensor layout: |QKV| * num_new_tokens. |Q=K=V=head_size * num_heads| +// one thread process one head_size +template +__global__ void compute_attention_kernel_generation_kernel( + DT const *query, + DT const *key_cache, + DT const *value_cache, + DT *output_ptr, + float const scale, + int max_seq_length, + int per_head_size, + int hidden_size, + BatchConfig::PerRequestInfo *request_infos) { + + // q, k + using Q_vec = typename VEC_K::Type; + using K_vec = typename VEC_K::Type; + using V_vec = typename VEC_V
::Type; + using Out_sum = typename Vec_fp32_::Type; + + constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; + + // eg. if head_size = 128, thread_per_key = 4, with float32 precision + // then K_VEC_SIZE = 1, QK_VEC_SIZE = 4 + // K_ELTS_PER_THREAD = 128 / 4 = 32 + // K_VECS_PER_THREAD = 32 / 1 = 32 + constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(DT); + // constexpr int QK_VEC_SIZE = 16 / sizeof(DT); + // // constexpr int QK_VEC_SIZE = sizeof(Qk_vec_k) / sizeof(DT); + constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY; + constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; + // constexpr int QK_ELTS_IN_16B = 16 / sizeof(DT); + + // thread id + int const tidx = threadIdx.x; + // head id + int const head_idx = blockIdx.x; + // request idx + int const request_idx = blockIdx.y; + + int const batch_config_request_id = + request_infos[request_idx].batch_config_request_id; + + int const first_step = 0; + + int const tlength = + request_infos[batch_config_request_id].first_token_depth_in_request + + request_infos[batch_config_request_id].num_tokens_in_batch; + + // shared memory objects + extern __shared__ char smem_[]; + + float *qk_smem = reinterpret_cast(smem_); + float *out_smem = reinterpret_cast(smem_); + + float qk_max = -FLT_MAX; + + // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum + __shared__ float red_smem[WARPS_PER_BLOCK * 2]; + + const DT *q_ptr = query + request_idx * hidden_size * QKV_WEIGHT_NUM + + head_idx * per_head_size; + __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; + // DT const *q_ptr = + // query + request_idx * Dh * QKV_WEIGHT_NUM + head_idx * per_head_size; + + // q tensor in this thread + // if THREADS_PER_KEY is 4, first thread load 0, 4, 8, 12..., total + // K_VECS_PER_THREAD elements + // QK_vec_k: 32->1, 64->2, 128->4... head_size + // K_vec_k: 4->1, 2->2, 1->4 threads_per_key + + // the start offset of the element eg. (0, 1, 2, 3) * K_VEC_SIZE + int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE; + int ki_o = tidx % THREADS_PER_KEY; + // the first key's offset for this thread + // ko = 0, 0, 0, 0, 1, 1, 1, 1, .... + int ko = tidx / THREADS_PER_KEY; + // load q tensor + Q_vec q_vec[K_VECS_PER_THREAD]; +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + q_vecs[ki_o][ii] = *reinterpret_cast( + q_ptr + ki + ii * THREADS_PER_KEY * K_VEC_SIZE); + } + __syncthreads(); + // first iter = 128 / 4 = 32 + // K_VECS_PER_THREAD = 32 + // K_PER_ITER how many keys in this loop + // The number of timesteps loaded per iteration. + constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY; + // // The number of keys per warp. + constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; + + DT const *k_cache_batch = + key_cache + batch_config_request_id * max_seq_length * hidden_size + ki; + + int ti_end = + div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; + // get k, perform qk proj + + for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { + K_vec k[K_VECS_PER_THREAD]; + int const ti_circ = ti % max_seq_length; +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; + if (ti < tlength) { + k[ii] = *reinterpret_cast(k_cache_batch + + ti_circ * hidden_size + + head_idx * per_head_size + jj); + } + // Compute dot product. + // This includes a reduction across the threads in the same thread group. 
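+      // Descriptive note (matching the example sizes given above, e.g.
+      // head_size = 128, THREADS_PER_KEY = 4, float keys so K_VEC_SIZE = 1
+      // and K_VECS_PER_THREAD = 32): each of the THREADS_PER_KEY cooperating
+      // threads has loaded a strided slice of this timestep's key, laid out
+      // the same way q_vecs was filled, so Qk_dot below combines the partial
+      // per-thread products into one q.k score per timestep.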
+ } + float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); + // // todo add positional embedding to the qk production + // // Store the product to shared memory. There's one qk value per + // timestep. + // // Update the max. + if (ti < tlength && tidx % THREADS_PER_KEY == 0) { + // todo add alobi here + bool const mask = ti_circ >= tlength; + if (mask) { + assert(false); } + qk_max = mask ? qk_max : fmaxf(qk_max, qk); + qk_smem[ti - first_step] = mask ? 0.f : qk; + } + } + + __syncthreads(); + +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { + qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); + } + + // Decompose the thread index into warp and lane. + int const warp = tidx / WARP_SIZE; + int const lane = tidx % WARP_SIZE; + + // The warp leader writes the max to shared memory. + if (lane == 0) { + red_smem[warp] = qk_max; + } + + // Make sure the products are in shared memory. + __syncthreads(); + + // The warps finalize the reduction. + qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; +#pragma unroll + for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { + qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); + } + + // Broadcast to all the threads in the warp. + qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); + + float exp_sum = 0.f; + for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { + float logit = __expf(qk_smem[ti - first_step] - qk_max); + exp_sum += logit; + qk_smem[ti - first_step] = logit; + } + + // Compute the sum. + exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); + + // softmax + float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); + for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { + qk_smem[ti - first_step] *= inv_sum; + } + + __syncthreads(); + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + // printf("softmax %.10f\n", qk_smem[0]); + // } + + // value projection + constexpr int V_VEC_SIZE = 16 / sizeof(DT); + // A vector of V elements for the current timestep. + // using V_vec_k = typename V_vec_k_::Type; + // using V_vec_acum = typename V_vec_acum_fp32_::Type; + + // The value computed by this thread. + int vo = tidx / THREADS_PER_VALUE; + // The hidden dimensions computed by this particular thread. + int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; + constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; + + Out_sum out; + zero(out); + + // The base pointer for the value in the cache buffer. + DT const *v_cache_batch = + value_cache + batch_config_request_id * max_seq_length * hidden_size + vi; + + if (Dh == Dh_MAX || vi < Dh) { + for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) { + // Load the values from the cache. + int const ti_circ = ti % max_seq_length; + + V_vec v = *reinterpret_cast( + v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); + float logit = qk_smem[ti - first_step]; + out = FlexFlow::fma(logit, cast_to_float(v), out); } - // Step 6: compute gradients w.r.t query - { - float alpha = 1.0f, beta = 0.0f; - if (*m->qk_prod_scaling) { - alpha = 1.0f / sqrt(m->kProjSize); + } + + // // Make sure we can start writing to shared memory. + __syncthreads(); + + // Run the final reduction amongst the different groups computing different + // partial outputs. + if (Dh == Dh_MAX || vi < Dh) { +#pragma unroll + for (int active_groups = V_PER_ITER; active_groups >= 2; + active_groups /= 2) { + + // The midpoint in the number of active groups. 
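+      // Each round, value groups at or above the midpoint stage their partial
+      // output vectors in out_smem and the groups below the midpoint add them
+      // in, so after log2(V_PER_ITER) halvings the vo == 0 group holds the
+      // complete softmax-weighted sum of the loaded values for this head.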
+ int midpoint = active_groups / 2; + + // The upper part of active threads store to shared memory. + if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) { + *reinterpret_cast(out_smem + (vo - midpoint) * Dh + vi) = + out; } - // matrix A: gradients w.r.t. qk_prods - // matrix A's layout: [num_new_tokens, num_tokens, num_heads] - DT const *A = static_cast
<DT *>(m->qk_prods);
-      // matrix B: key cache
-      // matrix B's layout: [vProjSize * num_heads, max_num_tokens, num_req]
-      DT const *B = static_cast<DT *>(m->keyCache) + i * kt_req_block_size;
-      // matrix C: gradients for query (saved as part of m->devQKVProjArray)
-      // matrix C's layout: [num_tokens, qProjsize * num_heads, 3]
-      DT *C = static_cast<DT *>
(m->devQKVProjArray); - // after transposition & striding - int m_ = num_tokens; // num_new_tokens - int n_ = m->qProjSize; - int k_ = num_tokens; - // before transposition and striding - int lda = num_tokens; // num_new_tokens - int ldb = m->qProjSize * m->num_q_heads; - int ldc = num_tokens; - int strideA = num_tokens * num_tokens; - int strideB = m->qProjSize; - int strideC = num_tokens * m->qProjSize; - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, - m_, - n_, - k_, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - if (m->inference_debugging) { - std::string filename = - get_peft_dbg_folder(m, shard_id) + ".devQKVPRojArray_pre"; - save_tensor(C, - num_tokens * m->qProjSize * m->num_q_heads * 3, - filename.c_str()); + __syncthreads(); + + // The bottom warps update their values. + if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) { + out = add(*reinterpret_cast(out_smem + vo * Dh + vi), + out); } + __syncthreads(); } + } - // Step 7: perform rotary position embeddings (RoPE) bwd - { - if (*m->apply_rotary_embedding) { - assert(m->hidden_size == m->qProjSize * m->num_q_heads); - assert(m->qProjSize == m->kProjSize); - /*q&k*/ - int parallelism = num_tokens * m->hidden_size; - DT *A = static_cast
(m->devQKVProjArray); - apply_rotary_embedding_bwd<<>>(A, - m->complex_input, - m->token_infos, - m->qProjSize, - num_tokens, - m->hidden_size); - DT *C = static_cast
(m->devQKVProjArray); - if (m->inference_debugging) { - std::string filename = - get_peft_dbg_folder(m, shard_id) + ".devQKVPRojArray"; - save_tensor(C, - num_tokens * m->qProjSize * m->num_q_heads * 3, - filename.c_str()); - } - } + // Output the final values. + if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { + convert_from_float( + *reinterpret_cast(output_ptr + request_idx * hidden_size + + head_idx * per_head_size + vi), + out); + } +} - // matrix C: gradients for key (saved as part of m->devQKVProjArray) - // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] - DT *C = - static_cast
(m->devQKVProjArray) + - num_tokens * - (m->qProjSize * - m->num_q_heads); // skip over regions reserved for Q gradients - if (m->inference_debugging) { - std::string filename = get_peft_dbg_folder(m, shard_id) + ".devkproj"; - save_tensor( - C, num_tokens * (m->qProjSize * m->num_q_heads), filename.c_str()); +// only used by MPT model. https://arxiv.org/abs/2108.12409 +template +__global__ void apply_position_bias_qkprd(DT *input_ptr, + int num_tokens, + int num_total_tokens, + int num_heads, + int global_num_q_heads, + int shard_id) { + CUDA_KERNEL_LOOP(i, num_tokens * num_total_tokens * num_heads) { + // get head_idx, + int head_idx = i / (num_tokens * num_total_tokens) + (num_heads * shard_id); + int position_idx = (i / num_tokens) % num_total_tokens; + position_idx = position_idx + 1 - num_total_tokens; + // 8 is alibi_bias_max in + // https://huggingface.co/mosaicml/mpt-30b/blob/main/config.json + float base = (float)(head_idx + 1) * 8 / global_num_q_heads; + float slopes = 1.0 / pow(2, base); + // if(i == 0){ + // printf("see position: %d, %f, %f, %f\n", position_idx, base, slopes, + // position_idx * slopes); + // } + input_ptr[i] += static_cast
(position_idx * slopes); + } +} + +template +__global__ void scaling_query_kernel(DT *input_ptr, + int qProjSize, + int num_tokens, + int num_q_heads, + float scaling_factor, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / hidden_size; + input_ptr[i % hidden_size + token_idx * hidden_size * QKV_WEIGHT_NUM] *= + scaling_factor; + } +} + +template +__global__ void + apply_rotary_embedding_hf(DT *input_ptr, + cuFloatComplex *complex_input, + BatchConfig::PerTokenInfo const *tokenInfos, + float rope_theta, + bool llama3_rope, + float factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, + int qProjSize, + int kProjSize, + int num_tokens, + size_t q_array_size, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + // create complex number + bool q_tensor = i < (q_array_size / 2); + int proj_size = q_tensor ? qProjSize : kProjSize; + int real_i = q_tensor ? i : i - q_array_size / 2; + + int token_idx = real_i / (hidden_size / 2); + int idx = real_i % (proj_size / 2); + int head_idx = (real_i - (token_idx * (hidden_size / 2))) / (proj_size / 2); + + int real_part_index = idx + head_idx * proj_size + + token_idx * hidden_size * QKV_WEIGHT_NUM + + hidden_size * (q_tensor ? 0 : 1); + int complex_part_index = real_part_index + (proj_size / 2); + + complex_input[i] = {input_ptr[real_part_index], + input_ptr[complex_part_index]}; + + // get the freq_cis: shape 1 * (qProjSize/2) = 1 * 64 + // apply a Cartesian coordinate transformation + // multiple with input & /copy back to q/k + + // get position of token + + // size_t pos = id_map[token_idx].token_position; + size_t pos = tokenInfos[token_idx].abs_depth_in_request; + + // float before_real = complex_input[i].x, before_complex = + int pos_i = real_i % (proj_size / 2); + + float freq = + pos * (1.0 / pow(rope_theta, (float)2 * pos_i / proj_size)); // θ_i + + if (llama3_rope) { + float pi = CUDART_PI_F; + float wavelen = 2 * pi / freq; + float low_freq_wavelen = + original_max_position_embeddings / low_freq_factor; + float high_freq_wavelen = + original_max_position_embeddings / high_freq_factor; + if (wavelen < high_freq_wavelen) { + } else if (wavelen > low_freq_wavelen) { + freq = freq / factor; + } else { + assert(low_freq_wavelen != high_freq_wavelen); + float smooth = + (original_max_position_embeddings / wavelen - low_freq_factor) / + (high_freq_factor - low_freq_factor); + freq = ((1 - smooth) * freq / factor + smooth * freq); } } - // Step 8: compute gradients w.r.t. input - { - float alpha = 1.0f, beta = 0.0f; - if (!m->reset_input_grads[0]) { - beta = 1.0f; - } - // matrix A: QKV projection weights - // matrix A's layout: [qSize, qProjSize * num_q_heads, 3] - DT const *A = weight_ptr; - // matrix B: gradients w.r.t. QKV (concatenated in devQKVArray) - // matrix B's layout: [num_tokens, qProjsize * num_heads, 3] - DT const *B = static_cast
(m->devQKVProjArray); - // matrix C: gradients w.r.t. input - // matrix C's layout: [m->qSize, num_tokens] - DT *C = input_grad_ptr + - bc->requestsInfo[i].first_token_offset_in_batch * m->qSize; - int m_ = m->qSize; - int n_ = num_tokens; - int k_ = m->num_q_heads * (m->qProjSize + m->kProjSize + m->vProjSize); - int lda = m_; - int ldb = n_; - int ldc = m_; - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, - m_, - n_, - k_, - &alpha, - A, - cublas_data_type, - lda, - B, - cublas_data_type, - ldb, - &beta, - C, - cublas_data_type, - ldc, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - if (m->inference_debugging) { - std::string filename = - get_peft_dbg_folder(m, shard_id) + ".self_attn.input_gradient_0"; - save_tensor(C, num_tokens * m->qSize, filename.c_str()); + cuFloatComplex complex_pos = {cos(freq), sin(freq)}; + + complex_input[i] = cuCmulf(complex_input[i], complex_pos); + input_ptr[real_part_index] = complex_input[i].x; + input_ptr[complex_part_index] = complex_input[i].y; + } +} + +template +__global__ void + apply_rotary_embedding_bwd(DT *input_ptr, + cuFloatComplex *complex_input, + BatchConfig::PerTokenInfo const *tokenInfos, + float rope_theta, + bool llama3_rope, + float factor, + float low_freq_factor, + float high_freq_factor, + int original_max_position_embeddings, + int proj_size, + int num_tokens, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + // compute indexes to visit first half proj_size of each of q/k tensor. + // devQKVProj has shape [num_tokens, qProjSize, num_heads, 3] in peft_bwd + bool q_tensor = i < (num_tokens * hidden_size / 2); + int real_i = q_tensor ? i : i - num_tokens * hidden_size / 2; + assert(hidden_size % proj_size == 0); + int num_heads = hidden_size / proj_size; + + int token_idx = real_i % num_tokens; + int idx = (real_i / num_tokens) % (proj_size / 2); + int head_idx = real_i / (num_tokens * proj_size / 2); + assert(head_idx < num_heads); + + int complex_part_index = (q_tensor ? 
0 : 1) * num_tokens * hidden_size + + head_idx * num_tokens * proj_size + + idx * num_tokens + token_idx; + int real_part_index = complex_part_index + (proj_size / 2) * num_tokens; + + complex_input[i] = {input_ptr[real_part_index], + input_ptr[complex_part_index]}; + + size_t pos = tokenInfos[token_idx].abs_depth_in_request; + + float freq = + pos * (1.0 / pow(rope_theta, (float)2 * idx / proj_size)); // θ_i + + if (llama3_rope) { + float pi = CUDART_PI_F; + float wavelen = 2 * pi / freq; + float low_freq_wavelen = + original_max_position_embeddings / low_freq_factor; + float high_freq_wavelen = + original_max_position_embeddings / high_freq_factor; + if (wavelen < high_freq_wavelen) { + } else if (wavelen > low_freq_wavelen) { + freq = freq / factor; + } else { + assert(low_freq_wavelen != high_freq_wavelen); + float smooth = + (original_max_position_embeddings / wavelen - low_freq_factor) / + (high_freq_factor - low_freq_factor); + freq = ((1 - smooth) * freq / factor + smooth * freq); } } + + cuFloatComplex complex_pos = {cos(freq), sin(freq)}; + + complex_input[i] = cuCmulf(complex_input[i], complex_pos); + input_ptr[real_part_index] = complex_input[i].x; + input_ptr[complex_part_index] = complex_input[i].y; } } -} // namespace IncMultiHeadAttention -} // namespace Kernels +template +void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT *output_ptr, + cudaStream_t stream) { -using namespace Kernels::IncMultiHeadAttention; + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + assert(m->qSize == m->vSize && m->qSize == m->kSize); + + int num_tokens = bc->num_active_tokens(); + int parallelism = m->kProjSize * num_tokens * m->num_q_heads; + size_t q_array_size = m->qProjSize * num_tokens * m->num_q_heads; + + if (m->scaling_query) { + scaling_query_kernel<<>>(output_ptr, + m->qProjSize, + num_tokens, + m->num_q_heads, + m->scaling_factor, + m->hidden_size); + } + + // Step 3: apply rotary embedding if needed + if (m->rotary_embedding_meta->apply_rotary_embedding) { + /*q&k*/ + parallelism = num_tokens * m->hidden_size; + apply_rotary_embedding_hf<<>>( + output_ptr, + m->complex_input, + m->token_infos, + m->rotary_embedding_meta->rope_theta, + (m->rotary_embedding_meta->rope_type == "llama3"), + m->rotary_embedding_meta->factor, + m->rotary_embedding_meta->low_freq_factor, + m->rotary_embedding_meta->high_freq_factor, + m->rotary_embedding_meta->original_max_position_embeddings, + m->qProjSize, + m->kProjSize, + num_tokens, + q_array_size, + m->hidden_size); + } +} template -__global__ void store_kv_cache(DT const *devQKVProjArray, - DT *kCache_ptr, - DT *vCache_ptr, - BatchConfig::PerTokenInfo const *tokenInfos, - int num_tokens, - int max_seq_len, - int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { - int token_idx = i / hidden_size; - int offset = i % hidden_size; +void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream) { + int num_tokens = bc->num_active_infr_tokens(); + if (num_tokens > 0) { + int parallelism = m->hidden_size * num_tokens; + store_kv_cache<<>>(static_cast
<DT *>(m->devQKVProjArray),
+                                static_cast<DT *>(m->keyCache),
+                                static_cast<DT *>
(m->valueCache), + m->token_infos, + num_tokens, + BatchConfig::max_sequence_length(), + m->hidden_size); + } +} - size_t val_idx = - token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; +#define LAUNCH_ATTENTION_SCORE_KERNEL( \ + DT, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE, THDS_PER_BLOCK, stream) \ + smem_sz = smem_size_in_bytes
(m->qProjSize, \ + BatchConfig::max_sequence_length(), \ + THREADS_PER_VALUE, \ + THDS_PER_BLOCK); \ + compute_attention_kernel_generation_kernel \ + <<>>( \ + static_cast
<DT *>(m->devQKVProjArray),                              \
+          static_cast<DT *>(m->keyCache),                                     \
+          static_cast<DT *>
(m->valueCache), \ + output_ptr, \ + scale, \ + BatchConfig::max_sequence_length(), \ + m->qProjSize, \ + m->hidden_size, \ + m->request_infos) - DT kVal = devQKVProjArray[val_idx]; - DT vVal = devQKVProjArray[val_idx + hidden_size]; - int const req_id = tokenInfos[token_idx].request_index; - int const tok_id = tokenInfos[token_idx].abs_depth_in_request; +template +void compute_attention_kernel_generation(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + DT *output_ptr, + cudaStream_t stream) { + dim3 grid(m->num_q_heads, bc->num_generation_tokens); + int const per_head_size = m->qProjSize; + float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; + size_t smem_sz; + if (per_head_size == 64) { + constexpr int THREADS_PER_VALUE_64 = threads_per_value_t::value; + LAUNCH_ATTENTION_SCORE_KERNEL( + DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream); + } else if (per_head_size == 128) { + constexpr int THREADS_PER_VALUE_128 = threads_per_value_t::value; + LAUNCH_ATTENTION_SCORE_KERNEL( + DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream); + } else { + assert(false && "a unsupported head size"); + } +} - // key cache - kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + - offset] = kVal; - vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + - offset] = vVal; +std::string get_fwd_dbg_folder(IncMultiHeadSelfAttentionMeta const *m, + int shard_id) { + std::string op_name_without_uid = + IncMultiHeadSelfAttention::get_op_name_without_uid(m); + fs::path dst_filepath = get_dst_folder("fwd", m->decoding_step, shard_id); + if (m->layer_guid.model_id > 0) { + assert(false && "Model ID > 0 not supported yet"); } + std::string layername = "layers." + + std::to_string(m->layer_guid.transformer_layer_id) + + "." + op_name_without_uid; + dst_filepath /= layername; + return dst_filepath.string(); } template -__global__ void store_query_cache(DT const *devQKVProjArray, - DT *qCache_ptr, - int num_tokens, - int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { - int token_idx = i / hidden_size; - int offset = i % hidden_size; +void inference_kernel(IncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + int shard_id, + DT const *qkv_ptr, + DT *output_ptr, + cudaStream_t stream) { - size_t val_idx = token_idx * QKV_WEIGHT_NUM * hidden_size + offset; + // phase 0: copy calculated qkv into devQKVProjArray + // [qProjSize, num_heads, 3, num_new_tokens] + size_t qkv_proj_size = + m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens(); - DT qVal = devQKVProjArray[val_idx]; + cudaMemcpyAsync(m->devQKVProjArray, + qkv_ptr, + qkv_proj_size * sizeof(DT), + cudaMemcpyDeviceToDevice, + stream); - // query cache - qCache_ptr[i] = qVal; + // phase 1: Implement kernel to apply rotary embedding and scaling + compute_qkv_kernel( + m, bc, shard_id, static_cast
<DT *>(m->devQKVProjArray), stream);
+  update_kv_cache_kernel<DT>(m, bc, stream);
+
+  if (bc->num_generation_tokens > 0) {
+    // phase 3: Compute attention score for generation tokens
+    compute_attention_kernel_generation<DT>(
+        m, bc, static_cast<DT *>(m->attn_heads), stream);
+  }
+
+  if (bc->num_tokens > bc->num_generation_tokens) {
+    // phase 4: Compute attention score for prompt tokens;
+    compute_attention_kernel_prompt<DT>
(m, bc, shard_id, stream); + } + + int num_tokens = bc->num_active_tokens(); + cudaMemcpyAsync(output_ptr, + m->attn_heads, + m->oProjSize * num_tokens * sizeof(DT), + cudaMemcpyDeviceToDevice, + stream); +} + +std::string get_peft_dbg_folder(IncMultiHeadSelfAttentionMeta const *m, + int shard_id) { + std::string op_name_without_uid = + IncMultiHeadSelfAttention::get_op_name_without_uid(m); + fs::path dst_filepath = get_dst_folder("bwd", m->bwd_step, shard_id); + if (m->layer_guid.model_id > 0) { + assert(false && "Model ID > 0 not supported yet"); + } + std::string layername = "layers." + + std::to_string(m->layer_guid.transformer_layer_id) + + "." + op_name_without_uid; + dst_filepath /= layername; + return dst_filepath.string(); +} + +__global__ void transposeAdd_half_kernel( + half *out, half const *in, int width, int height, half alpha, half beta) { + int t_id = blockIdx.x * blockDim.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + for (int i = t_id; i < width * height; i += num_threads) { + int row = i / width; + int col = i % width; + out[col * height + row] = + alpha * in[row * width + col] + beta * out[col * height + row]; + } +} + +__global__ void transposeAdd_float_kernel(float *out, + float const *in, + int width, + int height, + float alpha, + float beta) { + int t_id = blockIdx.x * blockDim.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + for (int i = t_id; i < width * height; i += num_threads) { + int row = i / width; + int col = i % width; + out[col * height + row] = + alpha * in[row * width + col] + beta * out[col * height + row]; } } template -void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, - BatchConfig const *bc, - int shard_id, - DT const *bias_ptr, - DT const *weight_ptr, - cudaStream_t stream) { +void transposeAdd(DT *out, + const DT *in, + int width, + int height, + float alpha, + float beta, + cudaStream_t stream) { + assert(false && "Unsupported data type"); +} + +template <> +void transposeAdd(float *out, + float const *in, + int width, + int height, + float alpha, + float beta, + cudaStream_t stream) { + transposeAdd_float_kernel<<<4, 1024, 0, stream>>>( + out, in, width, height, alpha, beta); +} + +template <> +void transposeAdd(half *out, + half const *in, + int width, + int height, + float alpha, + float beta, + cudaStream_t stream) { + transposeAdd_half_kernel<<<4, 1024, 0, stream>>>( + out, in, width, height, __float2half(alpha), __float2half(beta)); +} + +template +void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT *input_grad_ptr, + DT const *output_grad_ptr, + cudaStream_t stream) { + assert(!m->offload); checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); cudaDataType_t compute_type = cublas_data_type; - // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - // cudaDataType_t compute_type = cublas_data_type; - // #else - // // For best performance, set the default cublas compute type to - // // CUBLAS_COMPUTE_16F for half precision and to - // // CUBLAS_COMPUTE_32F_FAST_16F for full precision - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - // if (m->output_type[0] == DT_FLOAT) { - // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - // } - // #endif - // int num_requests = 
bc->num_active_requests(); - int num_tokens = bc->num_active_tokens(); - int tokens_previous_requests = 0; - int q_block_size = m->qProjSize; - int kt_block_size = m->kProjSize; - int kt_req_block_size = - kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); - int vt_block_size = m->vProjSize; - int vt_req_block_size = - vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); - assert(m->qProjSize == m->kProjSize); - for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i] || - (!bc->requestsInfo[i].prompt_phase && !bc->requestsInfo[i].peft_bwd)) { - continue; - } - int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + - bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; - // Copy query to m->query_activation_buffer if we need to compute - // PEFT backward - if (bc->requestsInfo[i].peft_bwd) { - size_t activation_size_needed = - sizeof(DT) * max_peft_tokens * m->num_q_heads * m->qProjSize; - if (activation_size_needed > m->allocated_peft_buffer_size1) { - MemoryAllocator *allocator = m->handle.peft_activation_allocator; - m->query_activation_buffer = - allocator->allocate_instance_untyped(activation_size_needed); - m->allocated_peft_buffer_size1 = activation_size_needed; - } - int parallelism = m->hidden_size * num_tokens; - store_query_cache<<>>( - static_cast
<DT *>(m->devQKVProjArray),
-          static_cast<DT *>
(m->query_activation_buffer), - num_tokens, - m->hidden_size); + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; } - // Step 1: compute query-key product QK.T/sqrt(d_k) + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + int num_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int num_total_tokens = bc->requestsInfo[i].first_token_depth_in_request + + bc->requestsInfo[i].num_tokens_in_batch; + // Currently assume we are calculating gradients for all tokens + // of a request + assert(num_tokens == num_total_tokens); + int kt_block_size = m->kProjSize; + int kt_req_block_size = + kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int vt_block_size = m->vProjSize; + int vt_req_block_size = + vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + assert(m->qProjSize == m->kProjSize && m->kProjSize == m->vProjSize); + // Step 1: copy gradient before final projection into workspace { - // Scale by sqrt(d_k) as per the original attention paper - DT alpha = 1.0f, beta = 0.0f; - if (*m->qk_prod_scaling) { - alpha = static_cast
(1.0f / sqrt(m->kProjSize)); + int m_ = m->vProjSize * m->num_q_heads; + int n_ = num_tokens; + DT *C = static_cast
(m->handle.workSpace); + cudaMemcpyAsync(C, + output_grad_ptr + + bc->requestsInfo[i].first_token_offset_in_batch * + m->oProjSize, + m_ * n_ * sizeof(DT), + cudaMemcpyDeviceToDevice, + stream); + if (m->inference_debugging) { + // save result to file for checking + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".o_proj.input_gradient_0"; + save_tensor(C, m_ * n_, filename.c_str()); } + } + // Step 2: compute gradients w.r.t. value + { + float alpha = 1.0f, beta = 0.0f; + // matrix A: qk_prods_softmax + // matrix A's layout: [num_new_tokens, total_tokens, num_heads] + DT const *A = static_cast
<DT *>(m->qk_prods_softmax);
+      // matrix B: attn_heads gradients
+      // matrix B's layout: [vProjSize * num_heads, num_new_tokens]
+      DT const *B = static_cast<DT *>
(m->handle.workSpace);
+      // matrix C: gradients for value (saved as part of m->devQKVProjArray)
+      // matrix C's layout: [num_tokens, qProjsize * num_heads, 3]
+      DT *C = static_cast<DT *>
(m->devQKVProjArray) + + 2 * num_tokens * + (m->qProjSize * m->num_q_heads); // skip over regions reserved + // for Q and K gradients // after transpositions - int m_ = num_new_tokens; - int n = total_tokens; - int k = m->qProjSize; + int m_ = num_tokens; // total_tokens + int n_ = m->vProjSize; // num_new_tokens + int k_ = num_tokens; // num_new_tokens // before transpositions - int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, - ldc = m_; + int lda = num_tokens; // num_new_tokens + int ldb = m->vProjSize * m->num_q_heads; + int ldc = num_tokens; // total_tokens // N.B. strides are applied before transpose operations - int strideA = q_block_size; - int strideB = kt_block_size; - int strideC = num_new_tokens * total_tokens; - - // matrix A: devQKVProjArray - // matrix A's layout: [qProjSize, num_heads, 3, num_new_tokens] - // To get query projection, skip over Q entries from previous requests - DT const *A = static_cast
(m->devQKVProjArray) + - bc->requestsInfo[i].first_token_offset_in_batch * - m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; - // matrix B: key cache - // matrix B's layout: [kProjSize * num_heads, total_tokens] - // To get B, skip over K entries from previous requests (all heads + - // padding) - DT const *B = static_cast
(m->keyCache) + i * kt_req_block_size; - // matrix C: qk_prods - // matrix C's layout: [num_new_tokens, total_tokens, num_heads] - // To get C, skip over QK.T products from previous requests - DT *C = static_cast
(m->qk_prods); + int strideA = num_tokens * num_tokens; // num_new_tokens * total_tokens + int strideB = m->vProjSize; + int strideC = num_tokens * m->vProjSize; checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_T, - CUBLAS_OP_N, + CUBLAS_OP_T, m_, - n, - k, + n_, + k_, &alpha, A, cublas_data_type, @@ -1571,57 +1161,80 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // save result to file for checking + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".v_proj.input_gradient_0"; + save_tensor(C, m_ * n_ * m->num_q_heads, filename.c_str()); + std::string filename2 = + get_peft_dbg_folder(m, shard_id) + ".qk_prods.softmax"; + save_tensor(A, m_ * k_ * m->num_q_heads, filename2.c_str()); + } } - // Step 2: Add alibi position bias to qk production - // matrix C: qk_prods - // matrix C's layout: [num_new_tokens, total_tokens, num_heads] - // To get C, skip over QK.T products from previous requests - DT *C = static_cast
(m->qk_prods); - if (*m->position_bias) { - size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; - apply_position_bias_qkprd<<>>(C, - num_new_tokens, - total_tokens, - m->num_q_heads, - m->global_num_q_heads, - shard_id); - } + // Step 3: compute gradients w.r.t. the qk_prods_softmax tensor + { + float alpha = 1.0f, beta = 0.0f; + // matrix A: attn_heads gradients + // matrix A's layout: [vProjSize * num_heads, num_new_tokens] + DT const *A = static_cast
<DT *>(m->handle.workSpace);
+      // matrix B: value cache
+      // matrix B's layout: [vProjSize * num_heads, max_num_tokens, num_req]
+      DT const *B = static_cast<DT *>
(m->valueCache) + i * vt_req_block_size;
+      // matrix C: qk_prods_softmax gradients
+      // matrix C's layout: [num_new_tokens, total_tokens, num_heads]
+      DT *C = static_cast<DT *>
(m->qk_prods_softmax); + // after transposition & striding + int m_ = num_tokens; // num_new_tokens + int n_ = num_tokens; + int k_ = m->vProjSize; + // before transposition and striding + int lda = m->vProjSize * m->num_q_heads; + int ldb = m->vProjSize * m->num_q_heads; + int ldc = num_tokens; // num_new_tokens + int strideA = m->vProjSize; + int strideB = m->vProjSize; + int strideC = num_tokens * num_tokens; // num_new_tokens * total_tokens - // Step 3: Apply causal mask. Fill all elements above diagonal in qk prods - // with -inf to force causal attention. - assert(num_new_tokens <= total_tokens); - size_t entries_above_diagonal = num_new_tokens * (num_new_tokens - 1) / 2; - if (entries_above_diagonal > 0) { - size_t parallelism = m->num_q_heads * entries_above_diagonal; - fill_entries_above_diagonal<<>>(C, - num_new_tokens, - total_tokens, - m->num_q_heads, - entries_above_diagonal, - static_cast
(-INFINITY)); + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".qk_prods.softmax_grad"; + save_tensor( + C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + std::string filename2 = get_peft_dbg_folder(m, shard_id) + ".vcache"; + save_tensor( + B, m->vProjSize * m->num_q_heads * num_tokens, filename2.c_str()); + } } - - // Step 4: Compute Softmax(QK.T/sqrt(d_k)) + // Step 4: softmax backpropagation { - // Before modifying the parameters below, make sure to read the following - // description of the CUDNN_TENSOR_NCHW tensor layout, from - // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: - // This tensor format specifies that the data is laid out in the following - // order: batch size, feature maps, rows, columns. The strides are - // implicitly defined in such a way that the data are contiguous in memory - // with no padding between images, feature maps, rows, and columns; the - // columns are the inner dimension and the images are the outermost - // dimension. + float alpha = 1.0f, beta = 0.0f; int n_param = m->num_q_heads; - int c_param = total_tokens; + int c_param = num_tokens; int h_param = 1; - int w_param = num_new_tokens; + int w_param = num_tokens; checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, CUDNN_TENSOR_NCHW, cudnn_data_type, @@ -1629,79 +1242,145 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, c_param, h_param, w_param)); - float softmax_alpha = 1.0f, softmax_beta = 0.0f; - DT *C_softmax = static_cast
(m->qk_prods_softmax); - // The softmax operation below is executed according to the - // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The - // softmax operation is computed per spatial location (H,W) per image (N) - // across dimension C. - checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &softmax_alpha, - m->qk_tensor, - C, - &softmax_beta, - m->qk_tensor, - C_softmax)); + checkCUDNN(cudnnSoftmaxBackward(m->handle.dnn, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + m->qk_tensor, + m->softmax_activation_buffer, + m->qk_tensor, + m->qk_prods_softmax, + &beta, + m->qk_tensor, + m->qk_prods)); + + if (m->inference_debugging) { + DT *C = static_cast
(m->qk_prods); + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".qk_prods.softmax_grad_in"; + save_tensor( + C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + } + + // TODO: fill all elements above diagonal to force causal attention + size_t entries_above_diagonal = num_tokens * (num_tokens - 1) / 2; + if (entries_above_diagonal > 0) { + size_t parallelism = m->num_q_heads * entries_above_diagonal; + fill_entries_above_diagonal<<>>(static_cast
(m->qk_prods), + num_tokens, + num_tokens, + m->num_q_heads, + entries_above_diagonal, + DT(0.0f)); + } + if (m->inference_debugging) { + DT *C = static_cast
(m->qk_prods); + std::string filename = get_peft_dbg_folder(m, shard_id) + + ".qk_prods.softmax_grad_in.masked"; + save_tensor( + C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + } } - // Copy C_softmax to m->softmax_activation_buffer if we need to compute - // PEFT backward - if (bc->requestsInfo[i].peft_bwd) { - DT *C_softmax = static_cast
(m->qk_prods_softmax); - size_t activation_size_needed = - sizeof(DT) * max_peft_tokens * max_peft_tokens * m->num_q_heads; - if (activation_size_needed > m->allocated_peft_buffer_size2) { - MemoryAllocator *allocator = m->handle.peft_activation_allocator; - m->softmax_activation_buffer = - allocator->allocate_instance_untyped(activation_size_needed); - m->allocated_peft_buffer_size2 = activation_size_needed; + // Step 5: compute gradients w.r.t. key + { + float alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = 1.0f / sqrt(m->kProjSize); + } + // matrix A: gradients w.r.t. qk_prods + // matrix A's layout: [num_new_tokens, num_tokens, num_heads] + DT const *A = static_cast
<DT *>(m->qk_prods);
+      // matrix B: query activation (in query_activation_buffer)
+      // matrix B's layout: [m->qProjSize * num_heads, num_new_tokens]
+      DT const *B = static_cast<DT *>
(m->query_activation_buffer);
+      // matrix C: gradients for key (saved as part of m->devQKVProjArray)
+      // matrix C's layout: [num_tokens, qProjsize * num_heads, 3]
+      DT *C =
+          static_cast<DT *>
(m->devQKVProjArray) + + num_tokens * + (m->qProjSize * + m->num_q_heads); // skip over regions reserved for Q gradients + // after transposition & striding + int m_ = num_tokens; + int n_ = m->kProjSize; + int k_ = num_tokens; // num_new_tokens + // before transposition and striding + int lda = num_tokens; // num_new_tokens + int ldb = m->kProjSize * m->num_q_heads; + int ldc = num_tokens; + int strideA = num_tokens * num_tokens; + int strideB = m->kProjSize; + int strideC = num_tokens * m->kProjSize; + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".query_activation"; + save_tensor( + B, m->qProjSize * m->num_q_heads * num_tokens, filename.c_str()); + std::string filename2 = + get_peft_dbg_folder(m, shard_id) + ".devkproj_pre"; + save_tensor( + C, num_tokens * (m->qProjSize * m->num_q_heads), filename2.c_str()); } - checkCUDA(cudaMemcpyAsync(m->softmax_activation_buffer, - C_softmax, - sizeof(DT) * total_tokens * num_new_tokens * - m->num_q_heads, - cudaMemcpyDeviceToDevice, - stream)); } - // Step 5: Matmul softmax(QK.T/sqrt(d_k)) by V. Implemented as V @ - // softmax(QK.T/sqrt(d_k)).T + // Step 6: compute gradients w.r.t query { - DT alpha = 1.0f, beta = 0.0f; - // after transpositions - int m_ = m->vProjSize; - int n = num_new_tokens; - int k = total_tokens; - // before transpositions - int lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; - // N.B. strides are applied before transpose operations - int strideA = vt_block_size; - int strideB = num_new_tokens * total_tokens; - int strideC = m->vProjSize; - // matrix A: value cache - // matrix A's layout: [vProjSize, num_heads, total_tokens] - // To get A, skip over V.T entries from previous requests (all heads + - // padding) - DT *A = static_cast
<DT *>(m->valueCache) + i * vt_req_block_size;
-      // matrix B: qk_prods_softmax
-      // matrix B's layout: [num_new_tokens, total_tokens, num_heads]
-      // To get B, skip over softmax(QK.T/sqrt(d_k)) entries from previous
-      // requests (all heads)
-      DT *B = static_cast<DT *>
(m->qk_prods_softmax);
-      // matrix C: attn heads
-      // matrix C's layout: [vProjSize, num_heads, num_new_tokens]
-      // To get C, skip over softmax(QK.T/sqrt(d_k))V products from previous
-      // requests
-      // store the result attn heads, also skip the genration tokens
-      DT *C = static_cast<DT *>
(m->attn_heads) + - (bc->requestsInfo[i].first_token_offset_in_batch) * - m->num_q_heads * m->vProjSize; + float alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = 1.0f / sqrt(m->kProjSize); + } + // matrix A: gradients w.r.t. qk_prods + // matrix A's layout: [num_new_tokens, num_tokens, num_heads] + DT const *A = static_cast
<DT *>(m->qk_prods);
+      // matrix B: key cache
+      // matrix B's layout: [vProjSize * num_heads, max_num_tokens, num_req]
+      DT const *B = static_cast<DT *>
(m->keyCache) + i * kt_req_block_size;
+      // matrix C: gradients for query (saved as part of m->devQKVProjArray)
+      // matrix C's layout: [num_tokens, qProjsize * num_heads, 3]
+      DT *C = static_cast<DT *>
(m->devQKVProjArray); + // after transposition & striding + int m_ = num_tokens; // num_new_tokens + int n_ = m->qProjSize; + int k_ = num_tokens; + // before transposition and striding + int lda = num_tokens; // num_new_tokens + int ldb = m->qProjSize * m->num_q_heads; + int ldc = num_tokens; + int strideA = num_tokens * num_tokens; + int strideB = m->qProjSize; + int strideC = num_tokens * m->qProjSize; checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_N, CUBLAS_OP_T, m_, - n, - k, + n_, + k_, &alpha, A, cublas_data_type, @@ -1719,30 +1398,109 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".devQKVPRojArray_pre"; + save_tensor(C, + num_tokens * m->qProjSize * m->num_q_heads * 3, + filename.c_str()); + } + } + + // Step 7: perform rotary position embeddings (RoPE) bwd + { + if (m->rotary_embedding_meta->apply_rotary_embedding) { + assert(m->hidden_size == m->qProjSize * m->num_q_heads); + assert(m->qProjSize == m->kProjSize); + /*q&k*/ + int parallelism = num_tokens * m->hidden_size; + DT *A = static_cast
(m->devQKVProjArray); + apply_rotary_embedding_bwd<<>>( + A, + m->complex_input, + m->token_infos, + m->rotary_embedding_meta->rope_theta, + (m->rotary_embedding_meta->rope_type == "llama3"), + m->rotary_embedding_meta->factor, + m->rotary_embedding_meta->low_freq_factor, + m->rotary_embedding_meta->high_freq_factor, + m->rotary_embedding_meta->original_max_position_embeddings, + m->qProjSize, + num_tokens, + m->hidden_size); + DT *C = static_cast
(m->devQKVProjArray); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".devQKVPRojArray"; + save_tensor(C, + num_tokens * m->qProjSize * m->num_q_heads * 3, + filename.c_str()); + } + } + + // matrix C: gradients for key (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = + static_cast
(m->devQKVProjArray) + + num_tokens * + (m->qProjSize * + m->num_q_heads); // skip over regions reserved for Q gradients + if (m->inference_debugging) { + std::string filename = get_peft_dbg_folder(m, shard_id) + ".devkproj"; + save_tensor( + C, num_tokens * (m->qProjSize * m->num_q_heads), filename.c_str()); + } + } + + // Step 8: compute gradients w.r.t. input + { + float alpha = 1.0f, beta = 0.0f; + if (!m->reset_input_grads[0]) { + beta = 1.0f; + } + // matrix B: gradients w.r.t. QKV (concatenated in devQKVArray) + // matrix B's layout: [num_tokens, qProjsize * num_heads, 3] + DT const *B = static_cast
(m->devQKVProjArray); + // matrix C: gradients w.r.t. input + // matrix C's layout: [m->qSize, num_tokens] + DT *C = input_grad_ptr + + bc->requestsInfo[i].first_token_offset_in_batch * m->qSize; + // int m_ = m->qSize; + int n_ = num_tokens; + int k_ = m->num_q_heads * (m->qProjSize + m->kProjSize + m->vProjSize); + + // The original version uses existing result and attention's projection to + // do further calculation in a way different than the usual dense layer, + // they are off by a transpose. So an explicit transpose is needed here. + // The add here is just for gradient accumulation. + transposeAdd(C, B, n_, k_, alpha, beta, stream); + + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".self_attn.input_gradient_0"; + save_tensor(C, num_tokens * m->qSize, filename.c_str()); + } } - tokens_previous_requests += num_new_tokens; - } - if (tokens_previous_requests != (num_tokens - bc->num_generation_tokens)) { - bc->print(); - printf("tokens_previous_requests: %i\n", tokens_previous_requests); - printf("num_tokens: %i\n", num_tokens); - printf("bc->num_generation_tokens: %i\n", bc->num_generation_tokens); } - assert(tokens_previous_requests == (num_tokens - bc->num_generation_tokens)); } +} // namespace IncMultiHeadAttention +} // namespace Kernels + +using namespace Kernels::IncMultiHeadAttention; + /*static*/ void IncMultiHeadSelfAttention::inference_kernel_wrapper( IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, - GenericTensorAccessorR const &weight, - GenericTensorAccessorW const &output, - GenericTensorAccessorR const &bias) { + GenericTensorAccessorW const &output) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - bool use_bias = *m->qkv_bias || *m->final_bias; cudaEvent_t t_start, t_end; if (m->profiling) { @@ -1751,43 +1509,14 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( cudaEventRecord(t_start, stream); } - // assert(input.data_type == weight.data_type); assert(input.data_type == output.data_type); - if (use_bias) { - assert(input.data_type == bias.data_type); - } if (input.data_type == DT_HALF) { - if (m->offload) { - pre_build_weight_kernel(m, weight, input.data_type, stream); - } - half const *bias_ptr = - use_bias ? bias.get_half_ptr() : static_cast(nullptr); Kernels::IncMultiHeadAttention::inference_kernel( - m, - bc, - shard_id, - input.get_half_ptr(), - m->offload ? static_cast(m->weight_ptr) : weight.get_half_ptr(), - output.get_half_ptr(), - bias_ptr, - stream); + m, bc, shard_id, input.get_half_ptr(), output.get_half_ptr(), stream); } else if (input.data_type == DT_FLOAT) { - if (m->offload) { - pre_build_weight_kernel(m, weight, input.data_type, stream); - } - float const *bias_ptr = - use_bias ? bias.get_float_ptr() : static_cast(nullptr); Kernels::IncMultiHeadAttention::inference_kernel( - m, - bc, - shard_id, - input.get_float_ptr(), - m->offload ? 
static_cast(m->weight_ptr) - : weight.get_float_ptr(), - output.get_float_ptr(), - bias_ptr, - stream); + m, bc, shard_id, input.get_float_ptr(), output.get_float_ptr(), stream); } else { assert(false && "Unspported data type"); } @@ -1809,12 +1538,9 @@ void IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( BatchConfig const *bc, int shard_id, GenericTensorAccessorW const &input_grad, - GenericTensorAccessorR const &weight, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorR const &bias) { + GenericTensorAccessorR const &output_grad) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - bool use_bias = *m->qkv_bias || *m->final_bias; cudaEvent_t t_start, t_end; if (m->profiling) { @@ -1823,35 +1549,23 @@ void IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( cudaEventRecord(t_start, stream); } - // assert(input.data_type == weight.data_type); assert(input_grad.data_type == output_grad.data_type); - if (use_bias) { - assert(input_grad.data_type == bias.data_type); - } if (input_grad.data_type == DT_HALF) { assert(!m->offload); - half const *bias_ptr = - use_bias ? bias.get_half_ptr() : static_cast(nullptr); Kernels::IncMultiHeadAttention::peft_bwd_kernel(m, bc, shard_id, input_grad.get_half_ptr(), - weight.get_half_ptr(), output_grad.get_half_ptr(), - bias_ptr, stream); } else if (input_grad.data_type == DT_FLOAT) { assert(!m->offload); - float const *bias_ptr = - use_bias ? bias.get_float_ptr() : static_cast(nullptr); Kernels::IncMultiHeadAttention::peft_bwd_kernel(m, bc, shard_id, input_grad.get_float_ptr(), - weight.get_float_ptr(), output_grad.get_float_ptr(), - bias_ptr, stream); } else { assert(false && "Unspported data type"); @@ -1870,7 +1584,6 @@ void IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( FFHandler handler, IncMultiHeadSelfAttention const *attn, - GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, int _num_q_heads, @@ -1885,14 +1598,11 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( attn->kProjSize, attn->vProjSize, attn->oProjSize, - attn->apply_rotary_embedding, - attn->qkv_bias, + attn->rotary_embedding_meta, attn->scaling_query, attn->qk_prod_scaling, attn->position_bias, - attn->final_bias, attn->scaling_factor, - weight, gpu_mem_allocator, num_samples, attn->num_q_heads, @@ -1913,14 +1623,11 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( int _kProjSize, int _vProjSize, int _oProjSize, - bool _apply_rotary_embedding, - bool _qkv_bias, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, bool _qk_prod_scaling, bool _position_bias, - bool _final_bias, float _scaling_factor, - GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, int _global_num_q_heads, @@ -1929,7 +1636,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( int _num_kv_heads, DataType _quantization_type, bool _offload) - : OpMeta(handler, attn), weight_ptr(nullptr), bias_ptr(nullptr) { + : OpMeta(handler, attn) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(cudnnSetStream(handler.dnn, stream)); @@ -1955,29 +1662,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( num_kv_heads = _num_kv_heads; hidden_size = num_q_heads * qProjSize; - weightSize = - ((qSize * qProjSize + oProjSize * (vProjSize > 0 ? 
vProjSize : vSize)) * - num_q_heads + - (kSize * kProjSize + vSize * vProjSize) * num_q_heads) * - size_of_dt; - if (quantization_type != DT_NONE) { - quantized_weightSize = get_quantization_to_byte_size( - attn->data_type, quantization_type, weightSize); - } - // biasSize = _bias ? oProjSize * size_of_dt * 4 : 0; - - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; - int final_bias_size = oProjSize; - biasSize = - (_qkv_bias ? qkv_bias_size : 0) + (final_bias ? final_bias_size : 0); - - // has_load_weights = (bool *)calloc(1, sizeof(bool)); - //*has_load_weights = false; - apply_rotary_embedding = (bool *)calloc(1, sizeof(bool)); - *apply_rotary_embedding = _apply_rotary_embedding; - qkv_bias = (bool *)calloc(1, sizeof(bool)); - *qkv_bias = _qkv_bias; + rotary_embedding_meta = + (RotaryEmbeddingMeta *)calloc(1, sizeof(RotaryEmbeddingMeta)); + *rotary_embedding_meta = _rotary_embedding_meta; scaling_query = (bool *)calloc(1, sizeof(bool)); *scaling_query = _scaling_query; scaling_factor = _scaling_factor; @@ -1985,14 +1672,6 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( *qk_prod_scaling = _qk_prod_scaling; position_bias = (bool *)calloc(1, sizeof(bool)); *position_bias = _position_bias; - final_bias = (bool *)calloc(1, sizeof(bool)); - *final_bias = _final_bias; - - // allocate weight and bias in the reserve space for cpu offloading - if (offload) { - weight_ptr = gpu_mem_allocator.allocate_reserved_untyped(weightSize); - bias_ptr = gpu_mem_allocator.allocate_reserved_untyped(biasSize); - } // allocate memory for the seqArray and reserve space { @@ -2058,9 +1737,6 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( ? key_cache_size + value_cache_size + qkv_max_proj_size : key_cache_size + value_cache_size); - if (quantization_type != DT_NONE) { - totalSharedSize += quantized_weightSize; - } assert(gpu_mem_allocator.reserved_total_size - gpu_mem_allocator.reserved_allocated_size >= totalSharedSize); @@ -2091,29 +1767,15 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( handler.batch_config_metadata->requestsInfo); if (offload) { - // token_infos = - // gpu_mem_allocator.allocate_reserved( - // tokeninfo_size); - // offset += sizeof(BatchConfig::PerTokenInfo) * tokeninfo_size; qk_prods = gpu_mem_allocator.allocate_reserved_untyped(qk_prod_size * size_of_dt); - // offset += qk_prod_size * size_of_dt; qk_prods_softmax = gpu_mem_allocator.allocate_reserved_untyped( qk_prod_size * size_of_dt); - // offset += qk_prod_size * size_of_dt; attn_heads = gpu_mem_allocator.allocate_reserved_untyped(attn_heads_size * size_of_dt); - // offset += attn_heads_size * size_of_dt; complex_input = gpu_mem_allocator.allocate_reserved(complex_size); - // offset += complex_size * sizeof(cuFloatComplex); - // request_infos = - // gpu_mem_allocator.allocate_reserved( - // requestinfo_size); } else { - // token_infos = - // gpu_mem_allocator.allocate_instance( - // tokeninfo_size); qk_prods = gpu_mem_allocator.allocate_instance_untyped(qk_prod_size * size_of_dt); qk_prods_softmax = gpu_mem_allocator.allocate_instance_untyped( @@ -2122,16 +1784,11 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( size_of_dt); complex_input = gpu_mem_allocator.allocate_instance(complex_size); - // request_infos = - // gpu_mem_allocator.allocate_instance( - // requestinfo_size); } // allocate more size for quantization data if (quantization_type != DT_NONE) { assert(offload); - quantized_weight_ptr = - 
gpu_mem_allocator.allocate_reserved(quantized_weightSize); } if (!offload) { assert(gpu_mem_allocator.reserved_total_size == @@ -2149,49 +1806,32 @@ IncMultiHeadSelfAttentionMeta::~IncMultiHeadSelfAttentionMeta(void) { } } -template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( - IncMultiHeadSelfAttentionMeta const *m, - GenericTensorAccessorR const weight, - DataType data_type, - cudaStream_t stream); +template void + Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + float *output_ptr, + cudaStream_t stream); -template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( - IncMultiHeadSelfAttentionMeta const *m, - GenericTensorAccessorR const weight, - DataType data_type, - cudaStream_t stream); +template void + Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + half *output_ptr, + cudaStream_t stream); -template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( +template void Kernels::IncMultiHeadAttention::compute_qkv_kernel( IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, int shard_id, float *output_ptr, - float const *weight_ptr, - float const *bias_ptr, - int num_tokens, cudaStream_t stream); -template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( +template void Kernels::IncMultiHeadAttention::compute_qkv_kernel( IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, int shard_id, half *output_ptr, - half const *weight_ptr, - half const *bias_ptr, - int num_tokens, cudaStream_t stream); -template void - Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( - IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - float *output_ptr, - cudaStream_t stream); - -template void - Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( - IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - half *output_ptr, - cudaStream_t stream); }; // namespace FlexFlow diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index d4f930db6c..3835d258e0 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -511,6 +511,7 @@ void forward_kernel(LinearMeta const *m, out_dim, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // use_bias = True if (bias_ptr != NULL) { // fuse bias and relu diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 20ad762b62..09170d3c28 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -668,11 +668,11 @@ void Linear::inference_task(Task const *task, } Linear::save_inference_tensors_to_file( m, shard_id, bc, {input}, weights_accessors, {output}); - printf("\tin=[%i,%i].T @ w=[%i,%i] -> out=[%i,%i]\n", - in_dim, - bc->num_tokens, + printf("\tw=[%i,%i].T @ in=[%i,%i] -> out=[%i,%i]\n", in_dim, out_dim, + in_dim, + bc->num_tokens, out_dim, bc->num_tokens); } diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index 2a30d12d6d..ce4150f9d6 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -988,9 +988,20 @@ void ResidualLayerNorm::inference_task( return; } - assert(regions.size() == - 3 + m->use_two_residuals + - (m->elementwise_affine ? (m->use_bias ? 
2 : 1) : 0)); + int expected_num_regions = 4; // input, residual1, added_output, output + if (m->use_two_residuals) { + expected_num_regions++; // residual2 + } + if (m->inplace_residual) { + expected_num_regions--; // added_output = input + } + if (m->elementwise_affine) { + expected_num_regions += 1; // gamma + if (m->use_bias) { + expected_num_regions += 1; // beta + } + } + assert(regions.size() == expected_num_regions); int region_idx = 0, task_region_idx = 0; GenericTensorAccessorR input = diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc index 52da51fb26..aa74ecc6f5 100644 --- a/src/ops/spec_inc_multihead_self_attention.cc +++ b/src/ops/spec_inc_multihead_self_attention.cc @@ -52,24 +52,22 @@ bool SpecIncMultiHeadSelfAttentionParams::is_valid( return is_valid; } -Tensor - FFModel::spec_inc_multihead_self_attention(Tensor const input, - int embed_dim, - int num_heads, - int kdim, - int vdim, - float dropout, - bool qkv_bias, - bool final_bias, - bool add_zero_attn, - DataType data_type, - Initializer *kernel_initializer, - bool apply_rotary_embedding, - bool scaling_query, - float scaling_factor, - bool qk_prod_scaling, - bool position_bias, - char const *name) { +Tensor FFModel::spec_inc_multihead_self_attention( + Tensor const input, + int embed_dim, + int num_heads, + int kdim, + int vdim, + float dropout, + bool add_zero_attn, + DataType data_type, + Initializer *kernel_initializer, + RotaryEmbeddingMeta rotary_embedding_meta, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, + bool position_bias, + char const *name) { return spec_inc_multiquery_self_attention(input, embed_dim, num_heads, @@ -77,12 +75,10 @@ Tensor kdim, vdim, dropout, - qkv_bias, - final_bias, add_zero_attn, data_type, kernel_initializer, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, @@ -90,30 +86,27 @@ Tensor name); } -Tensor - FFModel::spec_inc_multiquery_self_attention(Tensor const input, - int embed_dim, - int num_q_heads, - int num_kv_heads, - int kdim, - int vdim, - float dropout, - bool qkv_bias, - bool final_bias, - bool add_zero_attn, - DataType data_type, - Initializer *kernel_initializer, - bool apply_rotary_embedding, - bool scaling_query, - float scaling_factor, - bool qk_prod_scaling, - bool position_bias, - char const *name) { +Tensor FFModel::spec_inc_multiquery_self_attention( + Tensor const input, + int embed_dim, + int num_q_heads, + int num_kv_heads, + int kdim, + int vdim, + float dropout, + bool add_zero_attn, + DataType data_type, + Initializer *kernel_initializer, + RotaryEmbeddingMeta rotary_embedding_meta, + bool scaling_query, + float scaling_factor, + bool qk_prod_scaling, + bool position_bias, + char const *name) { if (data_type == DT_NONE) { data_type = input->data_type; } Layer *li = nullptr; - int weight_num = (qkv_bias || final_bias) ? 
2 : 1; if (data_type != input->data_type) { Tensor casted_input = cast(input, data_type, "type cast for IncMHA"); li = new Layer(this, @@ -121,7 +114,7 @@ Tensor data_type, name, 1 /*inputs*/, - weight_num /*weights*/, + 0 /*weights*/, 1 /*outputs*/, casted_input); } else { @@ -130,7 +123,7 @@ Tensor data_type, name, 1 /*inputs*/, - weight_num /*weights*/, + 0 /*weights*/, 1 /*outputs*/, input); } @@ -144,51 +137,26 @@ Tensor li->outputs[0] = create_tensor_legion_ordering( numdims, dims, data_type, li, 0, true /*create_grad*/); } - // Compute weight size - int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim, - oProjSize = embed_dim; - int qSize = input->dims[0], kSize = input->dims[0], vSize = input->dims[0]; - int qParas = qProjSize * qSize; - int kParas = kProjSize * kSize; - int vParas = vProjSize * vSize; - int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize); - int weight_size = qParas * num_q_heads + kParas * num_q_heads + - vParas * num_q_heads + oParas * num_q_heads; - { - int dims[1] = {weight_size}; - li->weights[0] = create_weight_legion_ordering(1, - dims, - data_type, - li, - true /*create_grad*/, - kernel_initializer, - CHOSEN_SYNC_TYPE); - } - if (qkv_bias || final_bias) { - // q, k, v, o - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; - int dims[1] = {(qkv_bias ? qkv_bias_size : 0) + - (final_bias ? oProjSize : 0)}; - li->weights[1] = create_weight_legion_ordering(1, - dims, - data_type, - li, - true /*create_grad*/, - kernel_initializer, - CHOSEN_SYNC_TYPE); - } + li->data_type = data_type; li->add_int_property("embed_dim", embed_dim); li->add_int_property("num_q_heads", num_q_heads); li->add_int_property("num_kv_heads", num_kv_heads); li->add_int_property("kdim", kdim); li->add_int_property("vdim", vdim); - li->add_int_property("qkv_bias", qkv_bias); - li->add_int_property("final_bias", final_bias); li->add_int_property("add_zero_attn", add_zero_attn); li->add_float_property("dropout", dropout); - li->add_int_property("apply_rotary_embedding", apply_rotary_embedding); + li->add_int_property("apply_rotary_embedding", + rotary_embedding_meta.apply_rotary_embedding); + li->add_float_property("rope_theta", rotary_embedding_meta.rope_theta); + li->add_string_property("rope_type", rotary_embedding_meta.rope_type); + li->add_float_property("factor", rotary_embedding_meta.factor); + li->add_float_property("low_freq_factor", + rotary_embedding_meta.low_freq_factor); + li->add_float_property("high_freq_factor", + rotary_embedding_meta.high_freq_factor); + li->add_int_property("original_max_position_embeddings", + rotary_embedding_meta.original_max_position_embeddings); li->add_int_property("scaling_query", scaling_query); li->add_float_property("scaling_factor", scaling_factor); li->add_int_property("qk_prod_scaling", qk_prod_scaling); @@ -216,14 +184,20 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer( int vdim = value; float dropout; layer->get_float_property("dropout", dropout); - layer->get_int_property("qkv_bias", value); - bool qkv_bias = (bool)value; - layer->get_int_property("final_bias", value); - bool final_bias = (bool)value; layer->get_int_property("add_zero_attn", value); bool add_zero_attn = (bool)value; + RotaryEmbeddingMeta rotary_embedding_meta; layer->get_int_property("apply_rotary_embedding", value); - bool apply_rotary_embedding = (bool)value; + rotary_embedding_meta.apply_rotary_embedding = (bool)value; + layer->get_float_property("rope_theta", rotary_embedding_meta.rope_theta); + 
layer->get_string_property("rope_type", rotary_embedding_meta.rope_type); + layer->get_float_property("factor", rotary_embedding_meta.factor); + layer->get_float_property("low_freq_factor", + rotary_embedding_meta.low_freq_factor); + layer->get_float_property("high_freq_factor", + rotary_embedding_meta.high_freq_factor); + layer->get_int_property("original_max_position_embeddings", value); + rotary_embedding_meta.original_max_position_embeddings = (int)value; layer->get_int_property("scaling_query", value); bool scaling_query = (bool)value; float scaling_factor; @@ -242,15 +216,12 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer( kdim, vdim, dropout, - qkv_bias, - final_bias, add_zero_attn, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, position_bias, - false /*allocate_weights*/, layer->name); } @@ -264,29 +235,24 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( int _kdim, int _vdim, float _dropout, - bool _qkv_bias, - bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, bool _position_bias, - bool allocate_weights, char const *name) - // Initializer* _bias_initializer) : Op(model, OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION, _input->data_type, name, 1 /*inputs*/, - (_qkv_bias || _final_bias ? 2 : 1) /*weights*/, + 0, 1 /*outputs*/, _input), num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), - qkv_bias(_qkv_bias), final_bias(_final_bias), add_zero_attn(_add_zero_attn), - apply_rotary_embedding(_apply_rotary_embedding), + rotary_embedding_meta(_rotary_embedding_meta), qSize(_input->dims[0].size), kSize(_input->dims[0].size), vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), vProjSize(_vdim), oProjSize(_embed_dim), @@ -305,99 +271,44 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( dims[0].size = _embed_dim; // Currently require no parallelism along this dim assert(dims[0].degree == 1); - if (allocate_weights) { - // Create weight tensor - int num_dims = inputs[0]->num_dims; - // Compute weight size - int qParas = this->qProjSize * this->qSize; - int kParas = this->kProjSize * this->kSize; - int vParas = this->vProjSize * this->vSize; - int oParas = - this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize); - ParallelDim dims[2]; - dims[0] = inputs[0]->dims[num_dims - 2]; - dims[0].size = dims[0].degree; - dims[1] = inputs[0]->dims[num_dims - 1]; - dims[1].size = this->num_q_heads * (qParas + oParas) + - this->num_q_heads * (kParas + vParas); - dims[1].is_replica_dim = false; - int seed = std::rand(); - Initializer *initializer = new GlorotUniform(seed); - weights[0] = model.create_parallel_weight<2>(dims, - this->data_type, - NULL /*owner_op*/, - true /*create_grad*/, - initializer, - CHOSEN_SYNC_TYPE); - if (qkv_bias || final_bias) { - ParallelTensorShape bias_shape = _input->get_shape(); - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; - bias_shape.dims[0].size = - (qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
oProjSize : 0); - bias_shape.dims[1].size = bias_shape.dims[2].size = 1; - weights[1] = - model.create_parallel_weight_legion_ordering(bias_shape.num_dims, - bias_shape.dims, - this->data_type, - nullptr /*owner_op*/, - true /*create_grad*/, - initializer, - CHOSEN_SYNC_TYPE); - } - } outputs[0] = model.create_parallel_tensor_legion_ordering( _input->num_dims, dims, this->data_type, this); - /* for (int i = 0; i < numdim; i++) { */ - /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ - /* } */ - /* // Check correctness */ - /* assert(check_output_input_weight_parallel_dims()); */ } SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( FFModel &model, ParallelTensor const _input, - ParallelTensor const _weight, int _embed_dim, int _num_q_heads, int _num_kv_heads, int _kdim, int _vdim, float _dropout, - bool _qkv_bias, - bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, bool _position_bias, - bool allocate_weights, char const *name) - // Initializer* _bias_initializer) : Op(model, OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION, _input->data_type, name, 1 /*inputs*/, - (_qkv_bias || _final_bias ? 2 : 1) /*weights*/, + 0 /*weights*/, 1 /*outputs*/, - _input, - _weight), + _input), num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), - qkv_bias(_qkv_bias), final_bias(_final_bias), add_zero_attn(_add_zero_attn), - apply_rotary_embedding(_apply_rotary_embedding), + rotary_embedding_meta(_rotary_embedding_meta), qSize(_input->dims[0].size), kSize(_input->dims[0].size), vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), vProjSize(_vdim), oProjSize(_embed_dim), qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query), scaling_factor(_scaling_factor), - qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias) -// bias_initializer(_bias_initializer) -{ + qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias) { numOutputs = 1; int numdim = _input->num_dims; ParallelDim dims[MAX_TENSOR_DIM]; @@ -407,66 +318,15 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( dims[0].size = _embed_dim; // Currently require no parallelism along this dim assert(dims[0].degree == 1); - if (allocate_weights) { - // Create weight tensor - int num_dims = inputs[0]->num_dims; - // Compute weight size - int qParas = this->qProjSize * this->qSize; - int kParas = this->kProjSize * this->kSize; - int vParas = this->vProjSize * this->vSize; - int oParas = - this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize); - ParallelDim dims[2]; - dims[0] = inputs[0]->dims[num_dims - 2]; - dims[0].size = dims[0].degree; - dims[1] = inputs[0]->dims[num_dims - 1]; - dims[1].size = this->num_q_heads * (qParas + oParas) + - this->num_q_heads * (kParas + vParas); - dims[1].is_replica_dim = false; - // dims[2].size = qParas + kParas + vParas + oParas; - int seed = std::rand(); - Initializer *initializer = new GlorotUniform(seed); - weights[0] = model.create_parallel_weight<2>(dims, - this->data_type, - NULL /*owner_op*/, - true /*create_grad*/, - initializer, - CHOSEN_SYNC_TYPE); - if (qkv_bias || final_bias) { - ParallelTensorShape bias_shape = _input->get_shape(); - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; - bias_shape.dims[0].size = - (qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
oProjSize : 0); - bias_shape.dims[1].size = bias_shape.dims[2].size = 1; - weights[1] = - model.create_parallel_weight_legion_ordering(bias_shape.num_dims, - bias_shape.dims, - this->data_type, - nullptr /*owner_op*/, - true /*create_grad*/, - initializer, - CHOSEN_SYNC_TYPE); - } - } outputs[0] = model.create_parallel_tensor_legion_ordering( _input->num_dims, dims, this->data_type, this); - - /* for (int i = 0; i < numdim; i++) { */ - /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ - /* } */ - /* register_output_weight_parallel_dims(outputs[0], numdim-1, _weight, 1); */ - /* register_output_weight_parallel_dims(outputs[0], numdim-2, _weight, 2); */ - // Check correctness - /* assert(check_output_input_weight_parallel_dims()); */ } SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( FFModel &model, SpecIncMultiHeadSelfAttention const &other, - ParallelTensor const input, - bool allocate_weights) + ParallelTensor const input) : SpecIncMultiHeadSelfAttention(model, other.layer_guid, input, @@ -476,22 +336,18 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( other.qProjSize, other.vProjSize, other.dropout, - other.qkv_bias, - other.final_bias, other.add_zero_attn, - other.apply_rotary_embedding, + other.rotary_embedding_meta, other.scaling_query, other.scaling_factor, other.qk_prod_scaling, other.position_bias, - allocate_weights, other.name) {} SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( FFModel &model, SpecIncMultiHeadSelfAttentionParams const ¶ms, ParallelTensor const &input, - bool allocate_weights, char const *name) : SpecIncMultiHeadSelfAttention(model, params.layer_guid, @@ -502,15 +358,12 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( params.kdim, params.vdim, params.dropout, - params.qkv_bias, - params.final_bias, params.add_zero_attn, - params.apply_rotary_embedding, + params.rotary_embedding_meta, params.scaling_query, params.scaling_factor, params.qk_prod_scaling, params.position_bias, - allocate_weights, params.name) {} void SpecIncMultiHeadSelfAttention::init_inference( @@ -541,18 +394,12 @@ void SpecIncMultiHeadSelfAttention::init_inference( EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); - launcher.add_field(1, FID_DATA); launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[0]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(1, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); @@ -580,18 +427,12 @@ void SpecIncMultiHeadSelfAttention::init(FFModel const &ff) { EXCLUSIVE, inputs[0]->region)); launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); - launcher.add_field(1, FID_DATA); launcher.add_region_requirement(RegionRequirement(outputs[0]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, outputs[0]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(1, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap(ff, fm); @@ -599,8 +440,7 @@ void SpecIncMultiHeadSelfAttention::init(FFModel const &ff) { /* regions[0](I): input - 
regions[1](I): weight - regions[2](O): output + regions[1](O): output */ OpMeta *SpecIncMultiHeadSelfAttention::init_task( Task const *task, @@ -618,17 +458,10 @@ OpMeta *SpecIncMultiHeadSelfAttention::init_task( FID_DATA, ctx, runtime); - GenericTensorAccessorR weight = - helperGetGenericTensorAccessorRO(attn->weights[0]->data_type, - regions[1], - task->regions[1], - FID_DATA, - ctx, - runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(attn->outputs[0]->data_type, - regions[2], - task->regions[2], + regions[1], + task->regions[1], FID_DATA, ctx, runtime); @@ -643,14 +476,8 @@ OpMeta *SpecIncMultiHeadSelfAttention::init_task( Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); // We don't do offloading for SSMs (small speculative models) - SpecIncMultiHeadSelfAttentionMeta *m = - new SpecIncMultiHeadSelfAttentionMeta(handle, - attn, - weight, - gpu_mem_allocator, - num_samples, - num_q_heads, - num_kv_heads); + SpecIncMultiHeadSelfAttentionMeta *m = new SpecIncMultiHeadSelfAttentionMeta( + handle, attn, gpu_mem_allocator, num_samples, num_q_heads, num_kv_heads); // assert that we didn't over allocate memory assert(gpu_mem_allocator.instance_allocated_size == gpu_mem_allocator.instance_total_size); @@ -658,8 +485,6 @@ OpMeta *SpecIncMultiHeadSelfAttention::init_task( m->inference_debugging = attn->inference_debugging; std::strcpy(m->op_name, attn->name); m->layer_guid = attn->layer_guid; - assert(weight.domain.get_volume() * data_type_size(weight.data_type) == - m->weightSize); return m; } @@ -697,12 +522,6 @@ FutureMap SpecIncMultiHeadSelfAttention::inference( EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(idx++, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); - launcher.add_field(idx++, FID_DATA); launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, 0 /*projection id*/, WRITE_ONLY, @@ -710,21 +529,12 @@ FutureMap SpecIncMultiHeadSelfAttention::inference( batch_outputs[0]->region)); launcher.add_field(idx++, FID_DATA); - if (qkv_bias || final_bias) { - launcher.add_region_requirement(RegionRequirement(weights[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[1]->region)); - launcher.add_field(idx++, FID_DATA); - } return runtime->execute_index_space(ctx, launcher); } /* regions[0](I): input - regions[3](I): weight - regions[4](O): output + regions[1](O): output */ void SpecIncMultiHeadSelfAttention::inference_task( Task const *task, @@ -741,51 +551,29 @@ void SpecIncMultiHeadSelfAttention::inference_task( SpecIncMultiHeadSelfAttentionMeta *m = *((SpecIncMultiHeadSelfAttentionMeta **)task->local_args); - assert(((*m->qkv_bias || *m->final_bias) ? 
regions.size() == 4 - : regions.size() == 3)); + assert(regions.size() == 2); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - biases = helperGetGenericTensorAccessorRO(m->weight_type[1], - regions[3], - task->regions[3], - FID_DATA, - ctx, - runtime); - Domain bias_domain = runtime->get_index_space_domain( - ctx, task->regions[3].region.get_index_space()); - assert(bias_domain.get_dim() == 4); - } + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - Domain weight_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); Domain output_domain = runtime->get_index_space_domain( - ctx, task->regions[2].region.get_index_space()); + ctx, task->regions[1].region.get_index_space()); assert(input_domain.get_dim() == 4); - assert(weight_domain.get_dim() == 2); assert(output_domain.get_dim() == 4); assert(task->index_point.get_dim() == 1); SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( - m, &bc, task->index_point.point_data[0], input, weight, output, biases); + m, &bc, task->index_point.point_data[0], input, output); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; - std::vector weights_accessors; - weights_accessors.push_back(weight); - if (*m->qkv_bias || *m->final_bias) { - weights_accessors.push_back(biases); - } SpecIncMultiHeadSelfAttention::save_inference_tensors_to_file( - m, shard_id, &bc, {input}, weights_accessors, {output}); + m, shard_id, &bc, {input}, {}, {output}); } } @@ -809,8 +597,7 @@ Op *SpecIncMultiHeadSelfAttention::materialize(FFModel &ff, ParallelTensor inputs[], int num_inputs) const { SpecIncMultiHeadSelfAttentionParams params = get_params(); - return new SpecIncMultiHeadSelfAttention( - ff, params, inputs[0], true, this->name); + return new SpecIncMultiHeadSelfAttention(ff, params, inputs[0], this->name); } bool SpecIncMultiHeadSelfAttention::measure_operator_cost( @@ -823,9 +610,20 @@ bool operator==(SpecIncMultiHeadSelfAttentionParams const &lhs, return lhs.layer_guid == rhs.layer_guid && lhs.embed_dim == rhs.embed_dim && lhs.num_q_heads == rhs.num_q_heads && lhs.kdim == rhs.kdim && lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout && - lhs.qkv_bias == rhs.qkv_bias && lhs.final_bias == rhs.final_bias && lhs.add_zero_attn == rhs.add_zero_attn && - lhs.apply_rotary_embedding == rhs.apply_rotary_embedding && + lhs.rotary_embedding_meta.apply_rotary_embedding == + rhs.rotary_embedding_meta.apply_rotary_embedding && + lhs.rotary_embedding_meta.rope_theta == + rhs.rotary_embedding_meta.rope_theta && + lhs.rotary_embedding_meta.rope_type == + rhs.rotary_embedding_meta.rope_type && + lhs.rotary_embedding_meta.factor == rhs.rotary_embedding_meta.factor && + lhs.rotary_embedding_meta.low_freq_factor == + rhs.rotary_embedding_meta.low_freq_factor && + lhs.rotary_embedding_meta.high_freq_factor == + rhs.rotary_embedding_meta.high_freq_factor && + 
lhs.rotary_embedding_meta.original_max_position_embeddings == + rhs.rotary_embedding_meta.original_max_position_embeddings && lhs.scaling_query == rhs.scaling_query && lhs.scaling_factor == rhs.scaling_factor && lhs.qk_prod_scaling == rhs.qk_prod_scaling && @@ -842,10 +640,8 @@ SpecIncMultiHeadSelfAttentionParams params.kdim = this->kProjSize; params.vdim = this->vProjSize; params.dropout = this->dropout; - params.qkv_bias = this->qkv_bias; - params.final_bias = this->final_bias; params.add_zero_attn = this->add_zero_attn; - params.apply_rotary_embedding = this->apply_rotary_embedding; + params.rotary_embedding_meta = this->rotary_embedding_meta; params.scaling_query = this->scaling_query; params.scaling_factor = this->scaling_factor; params.qk_prod_scaling = this->qk_prod_scaling; @@ -870,10 +666,15 @@ size_t hash::operator()( hash_combine(key, params.kdim); hash_combine(key, params.vdim); hash_combine(key, params.dropout); - hash_combine(key, params.qkv_bias); - hash_combine(key, params.final_bias); hash_combine(key, params.add_zero_attn); - hash_combine(key, params.apply_rotary_embedding); + hash_combine(key, params.rotary_embedding_meta.apply_rotary_embedding); + hash_combine(key, params.rotary_embedding_meta.rope_theta); + hash_combine(key, params.rotary_embedding_meta.rope_type); + hash_combine(key, params.rotary_embedding_meta.factor); + hash_combine(key, params.rotary_embedding_meta.low_freq_factor); + hash_combine(key, params.rotary_embedding_meta.high_freq_factor); + hash_combine(key, + params.rotary_embedding_meta.original_max_position_embeddings); hash_combine(key, params.scaling_query); hash_combine(key, params.scaling_factor); hash_combine(key, params.qk_prod_scaling); diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index aebd5e8892..b2f4e35d5e 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -16,6 +16,7 @@ #include "flexflow/ops/spec_inc_multihead_self_attention.h" #include "flexflow/ffconst_utils.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" +#include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" #include "flexflow/utils/hip_helper.h" #include #include @@ -26,13 +27,310 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Memory; +#define WARP_SIZE 32 + using namespace Kernels::IncMultiHeadAttention; namespace Kernels { -namespace SpecIncMultiHeadAttention { +namespace SpecIncMultiHeadSelfAttention { + +template +__device__ __forceinline__ T + WARP_SHFL(unsigned mask, T var, int srcLane, int width = warpSize) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_sync(mask, var, srcLane, width); +#else + return __shfl(var, srcLane, width); +#endif +} + +template +__device__ __forceinline__ T + WARP_SHFL_XOR(unsigned mask, T var, int laneMask, int width = warpSize) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_xor_sync(mask, var, laneMask, width); +#else + return __shfl_xor(var, laneMask, width); +#endif +} + +template +__global__ void compute_spec_inc_attention_kernel_generation_kernel( + DT const *query, + DT const *key_cache, + DT const *value_cache, + DT *output_ptr, + float const scale, + int const max_seq_length, + int per_head_size, + int hidden_size, + BatchConfig::PerRequestInfo *request_infos, + BeamSearchBatchConfig::BeamSearchPerRequestInfo *beam_request_infos, + BatchConfig::BitMask *causalMask, + bool *request_completed) { + + // q, k + using Q_vec = typename VEC_K::Type; + using K_vec = 
typename VEC_K<DT, THREADS_PER_KEY>::Type;
+  using V_vec = typename VEC_V<DT>
::Type; + using Out_sum = typename Vec_fp32_::Type; + + constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; + + constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(DT); + constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY; + constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; + // constexpr int QK_ELTS_IN_16B = 16 / sizeof(DT); + + // thread id + int const tidx = threadIdx.x; + // head id + int const head_idx = blockIdx.x; + // nth request idx + int const request_idx = blockIdx.y; + + // request id in batch config + int const batch_config_request_id = + request_infos[request_idx].batch_config_request_id; + + // request_idx = re + + BatchConfig::BitMask bitmask = causalMask[batch_config_request_id]; + + int const first_step = 0; + + // int const tlength = + // request_infos[batch_config_request_id].first_token_depth_in_request + + // request_infos[batch_config_request_id].num_tokens_in_batch; + + int const totalCacheSize = + bitmask.non_tree_cache_size + bitmask.tree_size + bitmask.prompt_size - 1; + + int first_token_idx = 0; + for (int r = 0; r < batch_config_request_id; r++) { + first_token_idx += request_completed[r] ? 0 : causalMask[r].this_layer_size; + } + + int const tree_branch_num = + beam_request_infos[batch_config_request_id].sub_request_num; + + // shared memory objects + extern __shared__ char smem_[]; + + float *qk_smem = reinterpret_cast(smem_); + float *out_smem = reinterpret_cast(smem_); + + float qk_max = -FLT_MAX; + + // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum + __shared__ float red_smem[WARPS_PER_BLOCK * 2]; + + const DT *q_ptr = query + first_token_idx * hidden_size * QKV_WEIGHT_NUM + + head_idx * per_head_size; + __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; + + // the start offset of the element eg. (0, 1, 2, 3) * K_VEC_SIZE + int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE; + int ki_o = tidx % THREADS_PER_KEY; + // the first key's offset for this thread + // ko = 0, 0, 0, 0, 1, 1, 1, 1, .... + int ko = tidx / THREADS_PER_KEY; + // load q tensor + Q_vec q_vec[K_VECS_PER_THREAD]; + + constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY; + // The number of keys per warp. 
+ constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; + + DT const *k_cache_batch = + key_cache + batch_config_request_id * max_seq_length * hidden_size + ki; + + int ti_end = + div_up(totalCacheSize - first_step, K_PER_WARP) * K_PER_WARP + first_step; + + for (int qi = 0; qi < tree_branch_num; qi += 1) { +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + q_vecs[ki_o][ii] = *reinterpret_cast( + q_ptr + (hidden_size * QKV_WEIGHT_NUM * qi) + ki + + ii * THREADS_PER_KEY * K_VEC_SIZE); + } + + int const query_token = + bitmask.prompt_size + bitmask.tree_size - 1 - tree_branch_num + qi; + + __syncthreads(); + for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { + K_vec k[K_VECS_PER_THREAD]; + int const ti_circ = ti % max_seq_length; + + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; + if (ti < totalCacheSize) { + + k[ii] = *reinterpret_cast( + k_cache_batch + ti_circ * hidden_size + head_idx * per_head_size + + jj); + } + } + float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); + + if (ti < totalCacheSize && tidx % THREADS_PER_KEY == 0) { + // todo add alobi here + // bool const mask = ti_circ >= totalCacheSize; + bool const mask = (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << query_token)))); + + // if (head_idx == 0 && ti == 0 && request_idx == 15 && !mask) { + // printf("spec inc attn qkqkqk request id %d, %.10f, %d\n", + // batch_config_request_id, + // ti, + // qk, + // qi); + // } + qk_max = mask ? qk_max : fmaxf(qk_max, qk); + qk_smem[ti - first_step] = mask ? 0.f : qk; + } + } + + __syncthreads(); + +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { + qk_max = fmaxf(qk_max, WARP_SHFL_XOR(uint32_t(-1), qk_max, mask)); + } + + // Decompose the thread index into warp and lane. + int const warp = tidx / WARP_SIZE; + int const lane = tidx % WARP_SIZE; + + // The warp leader writes the max to shared memory. + if (lane == 0) { + red_smem[warp] = qk_max; + } + + // Make sure the products are in shared memory. + __syncthreads(); + + // The warps finalize the reduction. + qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; +#pragma unroll + for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { + qk_max = fmaxf(qk_max, WARP_SHFL_XOR(uint32_t(-1), qk_max, mask)); + } + + // Broadcast to all the threads in the warp. + qk_max = WARP_SHFL(uint32_t(-1), qk_max, 0); + + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + // printf("spec inc attn first token qk_max %.10f\n", qk_max); + // } + + float exp_sum = 0.f; + for (int ti = first_step + tidx; ti < totalCacheSize; + ti += THREADS_PER_BLOCK) { + bool const mask = (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << query_token)))); + float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max); + exp_sum += logit; + qk_smem[ti - first_step] = mask ? 0.0f : logit; + } + + // Compute the sum. + exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); + + // softmax + float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); + for (int ti = first_step + tidx; ti < totalCacheSize; + ti += THREADS_PER_BLOCK) { + qk_smem[ti - first_step] *= inv_sum; + } + + __syncthreads(); + + // value projection + constexpr int V_VEC_SIZE = 16 / sizeof(DT); + // A vector of V elements for the current timestep. 
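// The qk_max / exp_sum reductions above follow the usual two-level block
// reduction: lanes first combine partial values with warp shuffles, each
// warp leader parks its result in red_smem, and the first WARPS_PER_BLOCK
// lanes then fold those per-warp values together before broadcasting.  A
// minimal standalone sketch of the same pattern (simplified and not part of
// this patch; it reduces across the whole warp instead of stopping at
// THREADS_PER_KEY, and assumes WARPS is a power of two no larger than 32):
//
//   template <int WARPS>
//   __device__ float block_max(float v, float *red_smem) {
//     for (int m = WARP_SIZE / 2; m >= 1; m /= 2)
//       v = fmaxf(v, WARP_SHFL_XOR(uint32_t(-1), v, m));
//     if (threadIdx.x % WARP_SIZE == 0) red_smem[threadIdx.x / WARP_SIZE] = v;
//     __syncthreads();
//     v = (threadIdx.x < WARPS) ? red_smem[threadIdx.x] : -FLT_MAX;
//     for (int m = WARPS / 2; m >= 1; m /= 2)
//       v = fmaxf(v, WARP_SHFL_XOR(uint32_t(-1), v, m));
//     return WARP_SHFL(uint32_t(-1), v, 0);
//   }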
+ // using V_vec_k = typename V_vec_k_::Type; + // using V_vec_acum = typename V_vec_acum_fp32_::Type; + + // The value computed by this thread. + int vo = tidx / THREADS_PER_VALUE; + // The hidden dimensions computed by this particular thread. + int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; + constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; + + Out_sum out; + zero(out); + + // The base pointer for the value in the cache buffer. + DT const *v_cache_batch = + value_cache + batch_config_request_id * max_seq_length * hidden_size + + vi; + + if (Dh == Dh_MAX || vi < Dh) { + for (int ti = first_step + vo; ti < totalCacheSize; ti += V_PER_ITER) { + // Load the values from the cache. + int const ti_circ = ti % max_seq_length; + V_vec v = *reinterpret_cast( + v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); + + bool const mask = (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << query_token)))); + float logit = mask ? 0.0f : qk_smem[ti - first_step]; + out = FlexFlow::fma(logit, cast_to_float(v), out); + } + } + + // // Make sure we can start writing to shared memory. + __syncthreads(); + + // Run the final reduction amongst the different groups computing different + // partial outputs. + if (Dh == Dh_MAX || vi < Dh) { +#pragma unroll + for (int active_groups = V_PER_ITER; active_groups >= 2; + active_groups /= 2) { + + // The midpoint in the number of active groups. + int midpoint = active_groups / 2; + + // The upper part of active threads store to shared memory. + if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) { + *reinterpret_cast(out_smem + (vo - midpoint) * Dh + vi) = + out; + } + __syncthreads(); + + // The bottom warps update their values. + if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) { + out = add(*reinterpret_cast(out_smem + vo * Dh + vi), + out); + } + __syncthreads(); + } + } + + // Output the final values. 
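// The halving loop above merges the V_PER_ITER partial output accumulators:
// in each round the upper half of the active groups spills its Out_sum to
// out_smem and the lower half adds it in, so after log2(V_PER_ITER) rounds
// group 0 holds the full softmax-weighted sum of values.  For example, with
// THREADS_PER_BLOCK = 128 and THREADS_PER_VALUE = 32 (illustrative values),
// V_PER_ITER = 4 and the merge takes two rounds: (0 += 2, 1 += 3), then
// (0 += 1), before the result is written out below.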
+ if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { + convert_from_float(*reinterpret_cast( + output_ptr + (first_token_idx + qi) * hidden_size + + head_idx * per_head_size + vi), + out); + } + } +} template -__global__ void spec_store_kv_cache( +__global__ void spec_inc_store_kv_cache( DT const *devQKVProjArray, DT *kCache_ptr, DT *vCache_ptr, @@ -40,16 +338,16 @@ __global__ void spec_store_kv_cache( BatchConfig::PerRequestInfo *requestInfo, BeamSearchBatchConfig::BeamSearchPerTokenInfo *beamTokenInfos, BeamSearchBatchConfig::BeamSearchPerRequestInfo *beamRequestInfos, + BatchConfig::BitMask *causalMask, int qProjSize, int kProjSize, int vProjSize, int num_tokens, int max_seq_len, - int max_beam_width, bool is_root, int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens * hidden_size * 2) { - int token_idx = i / (hidden_size * KV_WEIGHT_NUM); + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / (hidden_size); int offset = i % hidden_size; size_t val_idx = @@ -58,82 +356,25 @@ __global__ void spec_store_kv_cache( DT kVal = devQKVProjArray[val_idx]; DT vVal = devQKVProjArray[val_idx + hidden_size]; - // above no need to be changed - // int const req_id = id_map[token_idx].request_index; - // int const tok_id = id_map[token_idx].token_position; - // int const sub_req_id = id_map[token_idx].sub_request_index; - // int const parent_id = id_map[token_idx].parent_id; - // int const beam_depth = id_map[token_idx].beam_depth; - // int const beam_width = id_map[token_idx].beam_width; - int const req_id = tokenInfos[token_idx].request_index; - int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - int const sub_req_id = beamTokenInfos[token_idx].sub_request_index; - int const parent_id = beamRequestInfos[req_id].parent_id[sub_req_id]; - int const beam_depth = beamRequestInfos[req_id].current_depth; - int const beam_width = beamRequestInfos[req_id].beam_size; - - // new token - kCache_ptr[(req_id * max_beam_width + sub_req_id) * - (hidden_size * max_seq_len) + - tok_id * hidden_size + offset] = kVal; - vCache_ptr[(req_id * max_beam_width + sub_req_id) * - (hidden_size * max_seq_len) + - tok_id * hidden_size + offset] = vVal; - - // replica in the root iteration - if (beam_depth == 1) { - for (int i = 1; i < beam_width; i++) { - kCache_ptr[(req_id * max_beam_width + i) * (hidden_size * max_seq_len) + - tok_id * hidden_size + offset] = kVal; - vCache_ptr[(req_id * max_beam_width + i) * (hidden_size * max_seq_len) + - tok_id * hidden_size + offset] = vVal; - } - } + // int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - // naive cache stealing - if (sub_req_id != parent_id) { - if (offset == 0 && tok_id == 0) { - printf("cache stealing!, depth %d req_id %d sub_req_id %d, parentid " - "%d, tok_id %d\n", - beam_depth, - req_id, - sub_req_id, - parent_id, - tok_id); - } + int const request_token_offset = + requestInfo[req_id].first_token_offset_in_batch; - for (int depth = 0; depth < beam_depth; depth++) { - int steal_token_idx = tok_id - beam_depth + depth; - int steal_from_idx = (req_id * max_beam_width + parent_id) * - (hidden_size * max_seq_len) + - steal_token_idx * hidden_size + offset; - int steal_to_idx = (req_id * max_beam_width + sub_req_id) * - (hidden_size * max_seq_len) + - steal_token_idx * hidden_size + offset; - kCache_ptr[steal_to_idx] = kCache_ptr[steal_from_idx]; - vCache_ptr[steal_to_idx] = vCache_ptr[steal_from_idx]; - - // if(data_idx == 0 && head_idx == 0 && k_cache && req_id == 1){ - // printf("cache stealing kernel!, steal_token_idx %d\n", - // 
steal_token_idx); - // } - } - } + BatchConfig::BitMask bitmask = causalMask[req_id]; - // parallel cache stealing not yet implemented - // logic shld be - // launch spec_store_kv_cache with parallelism * current depth - // from the i here, get depth index - // if depth index not the current one, check if we need to steal - // steal if needed - - // cache stealing theory - // identify which sub request does this token come from - // for initial token, 0 - // for other, may 0,0,1/ 0,1,2/ 1,1,1 to get which cache to be reuse and - // which to be delete copy beam_size bunch of blocks when sub_req_id == - // parent_id : like 0 -> 0, 1->1, 2->2, do nothing, just append the new k/v + // if prompt token -> token id + // if tree token: + + int const cache_idx = bitmask.prompt_size + bitmask.non_tree_cache_size + + bitmask.tree_size - 1 - bitmask.this_layer_size + + token_idx - request_token_offset; + + kCache_ptr[req_id * (hidden_size * max_seq_len) + (cache_idx)*hidden_size + + offset] = kVal; + vCache_ptr[req_id * (hidden_size * max_seq_len) + (cache_idx)*hidden_size + + offset] = vVal; } } @@ -143,11 +384,9 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, hipStream_t stream) { int num_tokens = bc->num_active_infr_tokens(); int curr_depth = bc->beamRequestsInfo[0].current_depth; - // printf("curr depth: %d\n", curr_depth); - // assert(curr_depth < 3); if (num_tokens > 0) { int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens; - hipLaunchKernelGGL(HIP_KERNEL_NAME(spec_store_kv_cache
<DT>), + hipLaunchKernelGGL(HIP_KERNEL_NAME(spec_inc_store_kv_cache<DT>
), GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), 0, @@ -159,17 +398,71 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, m->request_infos, m->beam_token_infos, m->beam_request_infos, + m->causalMask, m->qProjSize, m->kProjSize, m->vProjSize, num_tokens, - BatchConfig::max_sequence_length(), - BeamSearchBatchConfig::MAX_BEAM_WIDTH, + BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num(), /*root*/ curr_depth == 0, m->hidden_size); } } +#define LAUNCH_SPEC_INC_ATTENTION_SCORE_KERNEL( \ + DT, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE, THDS_PER_BLOCK, stream) \ + smem_sz = smem_size_in_bytes
(m->qProjSize, \ + BatchConfig::max_sequence_length() + \ + BatchConfig::max_spec_tree_token_num(), \ + THREADS_PER_VALUE, \ + THDS_PER_BLOCK); \ + compute_spec_inc_attention_kernel_generation_kernel \ + <<>>( \ + static_cast
<DT *>(m->devQKVProjArray), \ + static_cast<DT *>
(m->keyCache), \ + static_cast<DT *>
(m->valueCache), \ + output_ptr, \ + scale, \ + BatchConfig::max_sequence_length() + \ + BatchConfig::max_spec_tree_token_num(), \ + m->qProjSize, \ + m->hidden_size, \ + m->request_infos, \ + m->beam_request_infos, \ + m->causalMask, \ + m->request_completed) + +template +void compute_spec_inc_attention_kernel_generation( + SpecIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + DT *output_ptr, + hipStream_t stream) { + // one block == one head per request + // how many generation requests + dim3 grid(m->num_q_heads, bc->get_speculative_request_num()); + int const per_head_size = m->qProjSize; + float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; + size_t smem_sz; + if (per_head_size == 64) { + constexpr int THREADS_PER_VALUE_64 = threads_per_value_t::value; + LAUNCH_SPEC_INC_ATTENTION_SCORE_KERNEL( + DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream); + } else if (per_head_size == 128) { + constexpr int THREADS_PER_VALUE_128 = threads_per_value_t::value; + LAUNCH_SPEC_INC_ATTENTION_SCORE_KERNEL( + DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream); + } else { + assert(false && "a unsupported head size"); + } +} + template __global__ void spec_fill_entries_above_diagonal(DT *matrix, size_t new_tokens, @@ -188,331 +481,268 @@ __global__ void spec_fill_entries_above_diagonal(DT *matrix, } template -void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, - BeamSearchBatchConfig const *bc, - int shard_id, - DT *output_ptr, - DT const *bias_ptr, - DT const *weight_ptr, - hipStream_t stream) { +void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, + BeamSearchBatchConfig const *bc, + int shard_id, + DT *output_ptr, + hipStream_t stream) { checkCUDA(hipblasSetStream(m->handle.blas, stream)); checkCUDNN(miopenSetStream(m->handle.dnn, stream)); hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); hipblasDatatype_t compute_type = hipblas_data_type; - // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - // hipblasDatatype_t compute_type = hipblas_data_type; - // #else - // // TODO: currently use the hipblas_data_type - // // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - // hipblasDatatype_t compute_type = hipblas_data_type; - // #endif - // int num_requests = bc->num_active_requests(); - int num_tokens = bc->num_active_infr_tokens(); + + int num_tokens = bc->num_active_tokens(); int tokens_previous_requests = 0; int tokens_prev_requests_squares = 0; - // int qkv_block_size = - // (m->qProjSize + m->kProjSize + m->vProjSize) * num_tokens; int q_block_size = m->qProjSize; + int kt_block_size = m->kProjSize; - int kt_req_block_size = - kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int kt_req_block_size = kt_block_size * m->num_q_heads * + (BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num()); int vt_block_size = m->vProjSize; - int vt_req_block_size = - vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int vt_req_block_size = vt_block_size * m->num_q_heads * + (BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num()); assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { + if (bc->request_completed[i] || (!bc->requestsInfo[i].prompt_phase) || + 
(bc->requestsInfo[i].num_tokens_in_batch == 0)) { + continue; + } else if (tokens_previous_requests < bc->num_generation_tokens) { + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; continue; } - for (int sub_req_id = 0; sub_req_id < bc->sub_requests[i]; sub_req_id++) { - - // int num_new_tokens = bc->num_processing_tokens[i]; - // int total_tokens = bc->token_last_available_idx[i] + 1; - - int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + - bc->requestsInfo[i].num_tokens_in_batch; - // Compute (QK^T/sqrt(d_k)) - int m_ = num_new_tokens; - int n = total_tokens; - int k = m->qProjSize; - int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, - ldc = m_; - int strideA = q_block_size; - int strideB = kt_block_size; - int strideC = num_new_tokens * total_tokens; - - // a flag of using this scaling alpha - DT alpha = 1.0f, beta = 0.0f; - if (*m->qk_prod_scaling) { - alpha = static_cast
(1.0f / sqrt(m->kProjSize)); - } - // To get A, skip over Q entries from previous requests (same head) - DT const *A = static_cast
(m->devQKVProjArray) + - tokens_previous_requests * m->qProjSize * m->num_q_heads * - QKV_WEIGHT_NUM; - // To get B, skip over K entries from previous requests (all heads + - // padding) - DT const *B = static_cast
(m->keyCache) + - (i * bc->MAX_BEAM_WIDTH + sub_req_id) * kt_req_block_size; - - // if (i == 0 && sub_req_id == 0 && - // bc->beam_slots.at(0).current_depth == 1) { - // int offset = (float *)B - m->keyCache; - // printf("key cache offset %d\n", kt_req_block_size); - // } - // To get C, skip over QK^T products from previous requests - DT *C = static_cast
(m->qk_prods) + - m->num_q_heads * tokens_prev_requests_squares; - - checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - strideA, - B, - hipblas_data_type, - ldb, - strideB, - &beta, - C, - hipblas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - - if (*m->position_bias) { - size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_position_bias_qkprd
), - GET_BLOCKS(parallelism), - min((size_t)CUDA_NUM_THREADS, parallelism), - 0, - stream, - C, - num_new_tokens, - total_tokens, - m->num_q_heads, - m->global_num_q_heads, - shard_id); - } - // Fill all elements above diagonal in qk prods with -inf to force - // causal attention. - assert(num_new_tokens <= total_tokens); - if (num_new_tokens > 1) { - size_t parallelism = m->num_q_heads * num_new_tokens * total_tokens; - hipLaunchKernelGGL( - HIP_KERNEL_NAME(spec_fill_entries_above_diagonal
), - GET_BLOCKS(parallelism), - min((size_t)CUDA_NUM_THREADS, parallelism), - 0, - stream, - C, - num_new_tokens, - total_tokens, - m->num_q_heads, - static_cast
(-INFINITY)); - } - // Compute Softmax(QK^T/sqrt(d_k)) - // Before modifying the parameters below, make sure to read the following - // description of the CUDNN_TENSOR_NCHW tensor layout, from - // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: - // This tensor format specifies that the data is laid out in the following - // order: batch size, feature maps, rows, columns. The strides are - // implicitly defined in such a way that the data are contiguous in memory - // with no padding between images, feature maps, rows, and columns; the - // columns are the inner dimension and the images are the outermost - // dimension. - int n_param = m->num_q_heads; - int c_param = total_tokens; - int h_param = 1; - int w_param = num_new_tokens; - checkCUDNN(miopenSet4dTensorDescriptor( - m->qk_tensor, miopen_data_type, n_param, c_param, h_param, w_param)); - float softmax_alpha = 1.0f, softmax_beta = 0.0f; - DT *C_softmax = static_cast
(m->qk_prods_softmax) + - m->num_q_heads * tokens_prev_requests_squares; - // The softmax operation below is executed according to the - // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The - // softmax operation is computed per spatial location (H,W) per image (N) - // across dimension C. - checkCUDNN(miopenSoftmaxForward_V2(m->handle.dnn, - &softmax_alpha, - m->qk_tensor, - C, - &softmax_beta, - m->qk_tensor, - C_softmax, - MIOPEN_SOFTMAX_ACCURATE, - MIOPEN_SOFTMAX_MODE_CHANNEL)); - // Matmul softmax(QK^T/sqrt(d_k)) by V - alpha = 1.0f, beta = 0.0f; - m_ = num_new_tokens; - n = m->vProjSize; - k = total_tokens; - lda = m_, ldb = n * m->num_q_heads, ldc = m_; - strideA = num_new_tokens * total_tokens; - strideB = vt_block_size; - strideC = num_new_tokens * m->vProjSize; - // To get A, skip over softmax(QK^T/sqrt(d_k)) entries from previous - // requests (all heads) - A = C_softmax; - // To get B, skip over V^T entries from previous requests (all heads + - // padding) - B = static_cast
(m->valueCache) + - (i * bc->MAX_BEAM_WIDTH + sub_req_id) * vt_req_block_size; - // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous - // requests - C = static_cast
(m->attn_heads) + - tokens_previous_requests * m->num_q_heads * m->vProjSize; - - checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, - HIPBLAS_OP_N, - HIPBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - strideA, - B, - hipblas_data_type, - ldb, - strideB, - &beta, - C, - hipblas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - - // Project to output, save result directly on output tensor - alpha = 1.0f, beta = 0.0f; - m_ = m->oProjSize; - k = m->vProjSize * m->num_q_heads; - n = num_new_tokens; - lda = k, ldb = n, ldc = m_; - A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_q_heads + - m->vProjSize * m->num_q_heads); - B = C; - C = static_cast
(output_ptr) + - tokens_previous_requests * m->oProjSize; - - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - B, - hipblas_data_type, - ldb, - &beta, - C, - hipblas_data_type, - ldc, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - tokens_previous_requests += num_new_tokens; - tokens_prev_requests_squares += num_new_tokens * total_tokens; + // all requests in prompt phase should only have one sub requests; + assert(bc->sub_requests[i] == 1); + // int num_new_tokens = bc->num_processing_tokens[i]; + // int total_tokens = bc->token_last_available_idx[i] + 1; + + int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + + bc->requestsInfo[i].num_tokens_in_batch; + + if (num_new_tokens <= 0) { + continue; } - } - if (*m->final_bias && shard_id == 0) { - int parallelism = m->oProjSize * num_tokens; - int qkv_weight_size = m->qProjSize * m->global_num_q_heads + - m->kProjSize * m->global_num_q_heads + - m->vProjSize * m->global_num_q_heads; - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_w
), - GET_BLOCKS(parallelism), - min(CUDA_NUM_THREADS, parallelism), - 0, - stream, - output_ptr, - bias_ptr, - num_tokens, - qkv_weight_size, - m->oProjSize); + + // Compute (QK^T/sqrt(d_k)) + int m_ = num_new_tokens; + int n = total_tokens; + int k = m->qProjSize; + int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, + ldc = m_; + int strideA = q_block_size; + int strideB = kt_block_size; + int strideC = num_new_tokens * total_tokens; + + // a flag of using this scaling alpha + DT alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = static_cast
(1.0f / sqrt(m->kProjSize)); + } + // To get A, skip over Q entries from previous requests (same head) + DT const *A = static_cast
(m->devQKVProjArray) + + bc->requestsInfo[i].first_token_offset_in_batch * + m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; + DT const *B = static_cast
(m->keyCache) + i * kt_req_block_size; + DT *C = static_cast
(m->qk_prods); + + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + hipblas_data_type, + lda, + strideA, + B, + hipblas_data_type, + ldb, + strideB, + &beta, + C, + hipblas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + + if (*m->position_bias) { + size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_position_bias_qkprd
), + GET_BLOCKS(parallelism), + min((size_t)CUDA_NUM_THREADS, parallelism), + 0, + stream, + C, + num_new_tokens, + total_tokens, + m->num_q_heads, + m->global_num_q_heads, + shard_id); + } + // Fill all elements above diagonal in qk prods with -inf to force + // causal attention. + assert(num_new_tokens <= total_tokens); + if (num_new_tokens > 1) { + size_t parallelism = m->num_q_heads * num_new_tokens * total_tokens; + hipLaunchKernelGGL(HIP_KERNEL_NAME(spec_fill_entries_above_diagonal
), + GET_BLOCKS(parallelism), + min((size_t)CUDA_NUM_THREADS, parallelism), + 0, + stream, + C, + num_new_tokens, + total_tokens, + m->num_q_heads, + static_cast
(-INFINITY)); + } + // Compute Softmax(QK^T/sqrt(d_k)) + // Before modifying the parameters below, make sure to read the following + // description of the CUDNN_TENSOR_NCHW tensor layout, from + // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: + // This tensor format specifies that the data is laid out in the following + // order: batch size, feature maps, rows, columns. The strides are + // implicitly defined in such a way that the data are contiguous in memory + // with no padding between images, feature maps, rows, and columns; the + // columns are the inner dimension and the images are the outermost + // dimension. + int n_param = m->num_q_heads; + int c_param = total_tokens; + int h_param = 1; + int w_param = num_new_tokens; + checkCUDNN(miopenSet4dTensorDescriptor( + m->qk_tensor, miopen_data_type, n_param, c_param, h_param, w_param)); + float softmax_alpha = 1.0f, softmax_beta = 0.0f; + DT *C_softmax = static_cast
(m->qk_prods_softmax) + + m->num_q_heads * tokens_prev_requests_squares; + // The softmax operation below is executed according to the + // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The + // softmax operation is computed per spatial location (H,W) per image (N) + // across dimension C. + checkCUDNN(miopenSoftmaxForward_V2(m->handle.dnn, + &softmax_alpha, + m->qk_tensor, + C, + &softmax_beta, + m->qk_tensor, + C_softmax, + MIOPEN_SOFTMAX_ACCURATE, + MIOPEN_SOFTMAX_MODE_CHANNEL)); + // Matmul softmax(QK^T/sqrt(d_k)) by V + alpha = 1.0f, beta = 0.0f; + m_ = m->vProjSize; + n = num_new_tokens; + k = total_tokens; + lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; + strideA = vt_block_size; + strideB = num_new_tokens * total_tokens; + strideC = m->vProjSize; + // To get A, skip over V^T entries from previous requests (all heads + + // padding) + A = static_cast
(m->valueCache) + i * vt_req_block_size; + // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous + // requests (all heads) + B = C_softmax; + // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous + // requests + + int token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + + C = static_cast
(m->attn_heads) + + (token_offset)*m->num_q_heads * m->vProjSize; + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + hipblas_data_type, + lda, + strideA, + B, + hipblas_data_type, + ldb, + strideB, + &beta, + C, + hipblas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + + tokens_previous_requests += num_new_tokens; + tokens_prev_requests_squares += num_new_tokens * total_tokens; } - assert(tokens_previous_requests == num_tokens); + if (tokens_previous_requests != (num_tokens - bc->num_generation_tokens)) { + bc->print(); + printf("tokens_previous_requests: %i\n", tokens_previous_requests); + printf("num_tokens: %i\n", num_tokens); + printf("bc->num_generation_tokens: %i\n", bc->num_generation_tokens); + } + assert(tokens_previous_requests == (num_tokens - bc->num_generation_tokens)); } template void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, BeamSearchBatchConfig const *bc, int shard_id, - DT const *input_ptr, - DT const *weight_ptr, + DT const *qkv_ptr, DT *output_ptr, - DT const *bias_ptr, hipStream_t stream) { - // here because we need postion info in infernece 1 - int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); - checkCUDA( - hipMemcpyAsync(m->token_infos, - &(bc->tokensInfo), - max_tokens_per_batch * sizeof(BatchConfig::PerTokenInfo), - hipMemcpyHostToDevice, - stream)); - checkCUDA(hipMemcpyAsync(m->request_infos, - &(bc->requestsInfo), - bc->max_requests_per_batch() * - sizeof(BatchConfig::PerRequestInfo), - hipMemcpyHostToDevice, - stream)); - checkCUDA( - hipMemcpyAsync(m->beam_token_infos, - &(bc->beamTokenInfo), - max_tokens_per_batch * bc->MAX_BEAM_WIDTH * - sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo), - hipMemcpyHostToDevice, - stream)); - checkCUDA(hipMemcpyAsync( - m->beam_request_infos, - &(bc->beamRequestsInfo), - bc->max_requests_per_batch() * - sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo), - hipMemcpyHostToDevice, - stream)); + + // phase 0: copy calculated qkv into devQKVProjArray + // [qProjSize, num_heads, 3, num_new_tokens] + size_t qkv_proj_size = + m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens(); + + hipMemcpyAsync(m->devQKVProjArray, + qkv_ptr, + qkv_proj_size * + sizeof(DT), // is this right, do we need layers etc here + hipMemcpyDeviceToDevice, + stream); // phase 1: Implement kernel to compute KQV for input tokens - compute_qkv_kernel(m, - bc, - shard_id, - input_ptr, - weight_ptr, - static_cast
(m->devQKVProjArray), - bias_ptr, - stream); + // TODO WARNING: this is commented out only because we are fixing the inc_attn + // first + compute_qkv_kernel( + m, bc, shard_id, static_cast
(m->devQKVProjArray), stream); // phase 2: Update key/val cache update_kv_cache_kernel
<DT>(m, bc, stream); - + if (bc->num_generation_tokens > 0) { + compute_spec_inc_attention_kernel_generation<DT>
( + m, bc, static_cast<DT *>
(m->attn_heads), stream); + } // phase 3: Compute attention score // 3 kernels for pahse 3: matmul1 - softmax - matmal2 - compute_attention_kernel( - m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); + if (bc->num_tokens > bc->num_generation_tokens) { + compute_attention_kernel_prompt(m, bc, shard_id, output_ptr, stream); + } + + int num_tokens = bc->num_active_tokens(); + + hipMemcpyAsync(output_ptr, + m->attn_heads, + m->oProjSize * num_tokens * sizeof(DT), + hipMemcpyDeviceToDevice, + stream); } -} // namespace SpecIncMultiHeadAttention +} // namespace SpecIncMultiHeadSelfAttention } // namespace Kernels /*static*/ @@ -521,12 +751,9 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( BeamSearchBatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, - GenericTensorAccessorR const &weight, - GenericTensorAccessorW const &output, - GenericTensorAccessorR const &bias) { + GenericTensorAccessorW const &output) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - bool use_bias = *m->qkv_bias || *m->final_bias; hipEvent_t t_start, t_end; if (m->profiling) { @@ -535,34 +762,14 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( checkCUDA(hipEventRecord(t_start, stream)); } - assert(input.data_type == weight.data_type); assert(input.data_type == output.data_type); - if (use_bias) { - assert(input.data_type == bias.data_type); - } if (input.data_type == DT_HALF) { - half const *bias_ptr = - use_bias ? bias.get_half_ptr() : static_cast(nullptr); - Kernels::SpecIncMultiHeadAttention::inference_kernel(m, - bc, - shard_id, - input.get_half_ptr(), - weight.get_half_ptr(), - output.get_half_ptr(), - bias_ptr, - stream); + Kernels::SpecIncMultiHeadSelfAttention::inference_kernel( + m, bc, shard_id, input.get_half_ptr(), output.get_half_ptr(), stream); } else if (input.data_type == DT_FLOAT) { - float const *bias_ptr = - use_bias ? 
bias.get_float_ptr() : static_cast(nullptr); - Kernels::SpecIncMultiHeadAttention::inference_kernel(m, - bc, - shard_id, - input.get_float_ptr(), - weight.get_float_ptr(), - output.get_float_ptr(), - bias_ptr, - stream); + Kernels::SpecIncMultiHeadSelfAttention::inference_kernel( + m, bc, shard_id, input.get_float_ptr(), output.get_float_ptr(), stream); } else { assert(false && "Unspported data type"); } @@ -581,7 +788,6 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( FFHandler handler, SpecIncMultiHeadSelfAttention const *attn, - GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, int _num_q_heads, @@ -596,14 +802,11 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( attn->kProjSize, attn->vProjSize, attn->oProjSize, - attn->apply_rotary_embedding, - attn->qkv_bias, + attn->rotary_embedding_meta, attn->scaling_query, attn->qk_prod_scaling, attn->position_bias, - attn->final_bias, attn->scaling_factor, - weight, gpu_mem_allocator, num_samples, attn->num_q_heads, @@ -618,43 +821,16 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); - size_t beam_tokeninfo_size = - max_tokens_per_batch * BeamSearchBatchConfig::MAX_BEAM_WIDTH; - size_t requestinfo_size = BeamSearchBatchConfig::max_requests_per_batch(); - size_t beam_requestinfo_size = - BeamSearchBatchConfig::max_requests_per_batch(); - size_t total_size = - requestinfo_size * sizeof(BatchConfig::PerRequestInfo) + - beam_tokeninfo_size * - sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo) + - beam_requestinfo_size * - sizeof(BeamSearchBatchConfig:: - BeamSearchPerRequestInfo); // more components will - // be added here later - - // We always directly allocate memory for small speculative models - gpu_mem_allocator.create_legion_instance(beam_search_reserve_inst, - total_size); beam_token_infos = - gpu_mem_allocator - .allocate_instance( - beam_tokeninfo_size); - // offset += beam_tokeninfo_size * - // sizeof(BeamSearchBatchConfig::BeamSearchPerTokenInfo); - request_infos = - gpu_mem_allocator.allocate_instance( - requestinfo_size); - // offset += requestinfo_size * sizeof(BatchConfig::PerRequestInfo); + static_cast( + handler.batch_config_metadata->beamTokenInfo); beam_request_infos = - gpu_mem_allocator - .allocate_instance( - beam_requestinfo_size); - // offset += beam_requestinfo_size * - // sizeof(BeamSearchBatchConfig::BeamSearchPerRequestInfo); - // assert(offset == total_size); - assert(gpu_mem_allocator.instance_total_size == - gpu_mem_allocator.instance_allocated_size); + static_cast( + handler.batch_config_metadata->beamRequestsInfo); + causalMask = static_cast( + handler.batch_config_metadata->causalMask); + request_completed = + static_cast(handler.batch_config_metadata->request_completed); } checkCUDA(hipStreamSynchronize(stream)); diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index 4688a8233c..d8a2008388 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -463,8 +463,6 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, BeamSearchBatchConfig const *bc, int shard_id, DT *output_ptr, - DT const *bias_ptr, - DT const *weight_ptr, cudaStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, 
stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); @@ -472,23 +470,10 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); cudaDataType_t compute_type = cublas_data_type; - // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - // cudaDataType_t compute_type = cublas_data_type; - // #else - // // For best performance, set the default cublas compute type to - // // CUBLAS_COMPUTE_16F for half precision and to - // // CUBLAS_COMPUTE_32F_FAST_16F for full precision - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - // if (m->output_type[0] == DT_FLOAT) { - // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - // } - // #endif - // int num_requests = bc->num_active_requests(); + int num_tokens = bc->num_active_tokens(); int tokens_previous_requests = 0; int tokens_prev_requests_squares = 0; - // int qkv_block_size = - // (m->qProjSize + m->kProjSize + m->vProjSize) * num_tokens; int q_block_size = m->qProjSize; int kt_block_size = m->kProjSize; @@ -568,8 +553,7 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // print_tensor((float*)C, 32, "C"); - // add alibi position bias to qk production + // add alibi position bias to qk production if (*m->position_bias) { size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; @@ -698,21 +682,26 @@ template void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, BeamSearchBatchConfig const *bc, int shard_id, - DT const *input_ptr, - DT const *weight_ptr, + DT const *qkv_ptr, DT *output_ptr, - DT const *bias_ptr, cudaStream_t stream) { - // phase 1: Implement kernel to compute KQV for input tokens - compute_qkv_kernel(m, - bc, - shard_id, - input_ptr, - weight_ptr, - static_cast
(m->devQKVProjArray), - bias_ptr, - stream); + // phase 0: copy calculated qkv into devQKVProjArray + // [qProjSize, num_heads, 3, num_new_tokens] + size_t qkv_proj_size = + m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens(); + + cudaMemcpyAsync(m->devQKVProjArray, + qkv_ptr, + qkv_proj_size * + sizeof(DT), // is this right, do we need layers etc here + cudaMemcpyDeviceToDevice, + stream); + // phase 1: Implement kernel to compute KQV for input tokens + // TODO WARNING: this is commented out only because we are fixing the inc_attn + // first + compute_qkv_kernel( + m, bc, shard_id, static_cast
(m->devQKVProjArray), stream); // phase 2: Update key/val cache update_kv_cache_kernel
(m, bc, stream); if (bc->num_generation_tokens > 0) { @@ -722,14 +711,16 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, // phase 3: Compute attention score // 3 kernels for pahse 3: matmul1 - softmax - matmal2 if (bc->num_tokens > bc->num_generation_tokens) { - compute_attention_kernel_prompt( - m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); + compute_attention_kernel_prompt(m, bc, shard_id, output_ptr, stream); } - // compute output production and bias together for all tokens + int num_tokens = bc->num_active_tokens(); - compute_o_prod_bias( - m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); + cudaMemcpyAsync(output_ptr, + m->attn_heads, + m->oProjSize * num_tokens * sizeof(DT), + cudaMemcpyDeviceToDevice, + stream); } } // namespace SpecIncMultiHeadSelfAttention @@ -741,12 +732,9 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( BeamSearchBatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, - GenericTensorAccessorR const &weight, - GenericTensorAccessorW const &output, - GenericTensorAccessorR const &bias) { + GenericTensorAccessorW const &output) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - bool use_bias = *m->qkv_bias || *m->final_bias; cudaEvent_t t_start, t_end; if (m->profiling) { @@ -755,36 +743,14 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( cudaEventRecord(t_start, stream); } - assert(input.data_type == weight.data_type); assert(input.data_type == output.data_type); - if (use_bias) { - assert(input.data_type == bias.data_type); - } if (input.data_type == DT_HALF) { - half const *bias_ptr = - use_bias ? bias.get_half_ptr() : static_cast(nullptr); Kernels::SpecIncMultiHeadSelfAttention::inference_kernel( - m, - bc, - shard_id, - input.get_half_ptr(), - weight.get_half_ptr(), - output.get_half_ptr(), - bias_ptr, - stream); + m, bc, shard_id, input.get_half_ptr(), output.get_half_ptr(), stream); } else if (input.data_type == DT_FLOAT) { - float const *bias_ptr = - use_bias ? 
bias.get_float_ptr() : static_cast(nullptr); Kernels::SpecIncMultiHeadSelfAttention::inference_kernel( - m, - bc, - shard_id, - input.get_float_ptr(), - weight.get_float_ptr(), - output.get_float_ptr(), - bias_ptr, - stream); + m, bc, shard_id, input.get_float_ptr(), output.get_float_ptr(), stream); } else { assert(false && "Unspported data type"); } @@ -797,16 +763,12 @@ void SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( cudaEventDestroy(t_start); cudaEventDestroy(t_end); printf("SpecIncMultiHeadSelfAttention forward time = %.2fms\n", elapsed); - // print_tensor<3, float>(acc_query.ptr, acc_query.rect, - // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, - // acc_output.rect, "[Attention:forward:output]"); } } SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( FFHandler handler, SpecIncMultiHeadSelfAttention const *attn, - GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, int _num_q_heads, @@ -821,14 +783,11 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( attn->kProjSize, attn->vProjSize, attn->oProjSize, - attn->apply_rotary_embedding, - attn->qkv_bias, + attn->rotary_embedding_meta, attn->scaling_query, attn->qk_prod_scaling, attn->position_bias, - attn->final_bias, attn->scaling_factor, - weight, gpu_mem_allocator, num_samples, attn->num_q_heads, diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index 132a48be40..ae0795ac1e 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -61,12 +61,10 @@ Tensor FFModel::inc_multihead_self_attention_verify( int kdim, int vdim, float dropout, - bool qkv_bias, - bool final_bias, bool add_zero_attn, DataType data_type, Initializer *kernel_initializer, - bool apply_rotary_embedding, + RotaryEmbeddingMeta rotary_embedding_meta, bool scaling_query, float scaling_factor, bool qk_prod_scaling, @@ -79,12 +77,10 @@ Tensor FFModel::inc_multihead_self_attention_verify( kdim, vdim, dropout, - qkv_bias, - final_bias, add_zero_attn, data_type, kernel_initializer, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, @@ -100,12 +96,10 @@ Tensor FFModel::inc_multiquery_self_attention_verify( int kdim, int vdim, float dropout, - bool qkv_bias, - bool final_bias, bool add_zero_attn, DataType data_type, Initializer *kernel_initializer, - bool apply_rotary_embedding, + RotaryEmbeddingMeta rotary_embedding_meta, bool scaling_query, float scaling_factor, bool qk_prod_scaling, @@ -117,7 +111,6 @@ Tensor FFModel::inc_multiquery_self_attention_verify( DataType quantization_type = cpu_offload ? config.quantization_type : DT_NONE; bool offload = cpu_offload; Layer *li = nullptr; - int weight_num = (qkv_bias || final_bias) ? 
2 : 1; if (data_type != input->data_type) { Tensor casted_input = cast(input, data_type, "type cast for IncMHA"); li = new Layer(this, @@ -125,7 +118,7 @@ Tensor FFModel::inc_multiquery_self_attention_verify( data_type, name, 1 /*inputs*/, - weight_num /*weights*/, + 0, 1 /*outputs*/, casted_input); } else { @@ -134,7 +127,7 @@ Tensor FFModel::inc_multiquery_self_attention_verify( data_type, name, 1 /*inputs*/, - weight_num /*weights*/, + 0, 1 /*outputs*/, input); } @@ -148,62 +141,28 @@ Tensor FFModel::inc_multiquery_self_attention_verify( li->outputs[0] = create_tensor_legion_ordering( numdims, dims, data_type, li, 0, true /*create_grad*/); } - // Compute weight size - int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim, - oProjSize = embed_dim; - int qSize = input->dims[0], kSize = input->dims[0], vSize = input->dims[0]; - int qParas = qProjSize * qSize; - int kParas = kProjSize * kSize; - int vParas = vProjSize * vSize; - int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize); - int one_head_size = qParas + kParas + vParas + oParas; - int weight_size = qParas * num_q_heads + kParas * num_q_heads + - vParas * num_q_heads + oParas * num_q_heads; - { - // compress the weight size if quantization. - if (quantization_type != DT_NONE) { - one_head_size = get_quantization_to_byte_size( - data_type, quantization_type, one_head_size); - } - int dims[1] = {weight_size}; - li->weights[0] = create_weight_legion_ordering( - 1, - dims, - quantization_type == DT_NONE ? data_type : quantization_type, - li, - true /*create_grad*/, - kernel_initializer, - CHOSEN_SYNC_TYPE); - } - if (qkv_bias || final_bias) { - // q, k, v, o - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; - int dims[1] = {(qkv_bias ? qkv_bias_size : 0) + - (final_bias ? 
oProjSize : 0)}; - li->weights[1] = create_weight_legion_ordering(1, - dims, - data_type, - li, - true /*create_grad*/, - kernel_initializer, - CHOSEN_SYNC_TYPE); - } li->data_type = data_type; li->add_int_property("embed_dim", embed_dim); li->add_int_property("num_q_heads", num_q_heads); li->add_int_property("num_kv_heads", num_kv_heads); li->add_int_property("kdim", kdim); li->add_int_property("vdim", vdim); - li->add_int_property("qkv_bias", qkv_bias); - li->add_int_property("final_bias", final_bias); li->add_int_property("add_zero_attn", add_zero_attn); li->add_float_property("dropout", dropout); - li->add_int_property("apply_rotary_embedding", apply_rotary_embedding); + li->add_int_property("apply_rotary_embedding", + rotary_embedding_meta.apply_rotary_embedding); + li->add_float_property("rope_theta", rotary_embedding_meta.rope_theta); + li->add_string_property("rope_type", rotary_embedding_meta.rope_type); + li->add_float_property("factor", rotary_embedding_meta.factor); + li->add_float_property("low_freq_factor", + rotary_embedding_meta.low_freq_factor); + li->add_float_property("high_freq_factor", + rotary_embedding_meta.high_freq_factor); + li->add_int_property("original_max_position_embeddings", + rotary_embedding_meta.original_max_position_embeddings); li->add_int_property("scaling_query", scaling_query); li->add_float_property("scaling_factor", scaling_factor); - li->add_int_property("qk_prod_scaling", qk_prod_scaling); li->add_int_property("position_bias", position_bias); li->add_int_property("quantization_type", quantization_type); li->add_int_property("offload", offload); @@ -230,15 +189,20 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer( int vdim = value; float dropout; layer->get_float_property("dropout", dropout); - layer->get_int_property("qkv_bias", value); - bool qkv_bias = (bool)value; - layer->get_int_property("final_bias", value); - bool final_bias = (bool)value; layer->get_int_property("add_zero_attn", value); bool add_zero_attn = (bool)value; + RotaryEmbeddingMeta rotary_embedding_meta; layer->get_int_property("apply_rotary_embedding", value); - bool apply_rotary_embedding = (bool)value; - layer->get_int_property("scaling_query", value); + rotary_embedding_meta.apply_rotary_embedding = (bool)value; + layer->get_float_property("rope_theta", rotary_embedding_meta.rope_theta); + layer->get_string_property("rope_type", rotary_embedding_meta.rope_type); + layer->get_float_property("factor", rotary_embedding_meta.factor); + layer->get_float_property("low_freq_factor", + rotary_embedding_meta.low_freq_factor); + layer->get_float_property("high_freq_factor", + rotary_embedding_meta.high_freq_factor); + layer->get_int_property("original_max_position_embeddings", value); + rotary_embedding_meta.original_max_position_embeddings = (int)value; bool scaling_query = (bool)value; float scaling_factor; layer->get_float_property("scaling_factor", scaling_factor); @@ -261,15 +225,12 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer( kdim, vdim, dropout, - qkv_bias, - final_bias, add_zero_attn, - apply_rotary_embedding, + rotary_embedding_meta, scaling_query, scaling_factor, qk_prod_scaling, position_bias, - false /*allocate_weights*/, quantization_type, offload, tensor_parallelism_degree, @@ -286,32 +247,27 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( int _kdim, int _vdim, float _dropout, - bool _qkv_bias, - bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool 
_scaling_query, float _scaling_factor, bool _qk_prod_scaling, bool _position_bias, - bool allocate_weights, DataType _quantization_type, bool _offload, int _tensor_parallelism_degree, char const *name) - // Initializer* _bias_initializer) : Op(model, OP_TREE_INC_MULTIHEAD_SELF_ATTENTION, _input->data_type, name, 1 /*inputs*/, - (_qkv_bias || _final_bias ? 2 : 1) /*weights*/, + 0, 1 /*outputs*/, _input), num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), - qkv_bias(_qkv_bias), final_bias(_final_bias), add_zero_attn(_add_zero_attn), - apply_rotary_embedding(_apply_rotary_embedding), + rotary_embedding_meta(_rotary_embedding_meta), qSize(_input->dims[0].size), kSize(_input->dims[0].size), vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), vProjSize(_vdim), oProjSize(_embed_dim), @@ -330,63 +286,12 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( dims[i] = _input->dims[i]; } dims[0].size = _embed_dim; - // Currently require no parallelism along this dim - assert(dims[0].degree == 1); - if (allocate_weights) { - // Create weight tensor - int num_dims = inputs[0]->num_dims; - // Compute weight size - int qParas = this->qProjSize * this->qSize; - int kParas = this->kProjSize * this->kSize; - int vParas = this->vProjSize * this->vSize; - int oParas = - this->oProjSize * (this->vProjSize > 0 ? this->vProjSize : this->vSize); - ParallelDim dims[2]; - dims[0] = inputs[0]->dims[num_dims - 2]; - dims[0].size = dims[0].degree; - dims[1] = inputs[0]->dims[num_dims - 1]; - dims[1].size = this->num_q_heads * (qParas + oParas) + - this->num_q_heads * (kParas + vParas); - dims[1].is_replica_dim = false; - // dims[2].size = qParas + kParas + vParas + oParas; - if (quantization_type != DT_NONE) { - dims[1].size = get_quantization_to_byte_size( - data_type, quantization_type, dims[1].size); - } - // dims[2].degree = 1; - // dims[2].parallel_idx = -1; - int seed = std::rand(); - Initializer *initializer = new GlorotUniform(seed); - weights[0] = model.create_parallel_weight<2>( - dims, - quantization_type == DT_NONE ? this->data_type : quantization_type, - NULL /*owner_op*/, - true /*create_grad*/, - initializer, - CHOSEN_SYNC_TYPE); - if (qkv_bias || final_bias) { - ParallelTensorShape bias_shape = _input->get_shape(); - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; - bias_shape.dims[0].size = - (qkv_bias ? qkv_bias_size : 0) + (final_bias ? 
oProjSize : 0); - bias_shape.dims[1].size = bias_shape.dims[2].size = 1; - weights[1] = - model.create_parallel_weight_legion_ordering(bias_shape.num_dims, - bias_shape.dims, - this->data_type, - nullptr /*owner_op*/, - true /*create_grad*/, - initializer, - CHOSEN_SYNC_TYPE); - } - } + // No longer require no parallelism along this dim + // assert(dims[0].degree == 1); outputs[0] = model.create_parallel_tensor_legion_ordering( _input->num_dims, dims, this->data_type, this); - /* for (int i = 0; i < numdim; i++) { */ - /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ - /* } */ + /* // Check correctness */ /* assert(check_output_input_weight_parallel_dims()); */ } @@ -394,40 +299,33 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( FFModel &model, const ParallelTensor _input, - const ParallelTensor _weight, int _embed_dim, int _num_q_heads, int _num_kv_heads, int _kdim, int _vdim, float _dropout, - bool _qkv_bias, - bool _final_bias, bool _add_zero_attn, - bool _apply_rotary_embedding, + RotaryEmbeddingMeta _rotary_embedding_meta, bool _scaling_query, float _scaling_factor, bool _qk_prod_scaling, bool _position_bias, - bool allocate_weights, DataType _quantization_type, bool _offload, int _tensor_parallelism_degree, char const *name) - // Initializer* _bias_initializer) : Op(model, OP_TREE_INC_MULTIHEAD_SELF_ATTENTION, _input->data_type, name, 1 /*inputs*/, - (_qkv_bias || _final_bias ? 2 : 1) /*weights*/, + 0, 1 /*outputs*/, - _input, - _weight), + _input), num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), - qkv_bias(_qkv_bias), final_bias(_final_bias), add_zero_attn(_add_zero_attn), - apply_rotary_embedding(_apply_rotary_embedding), + rotary_embedding_meta(_rotary_embedding_meta), qSize(_input->dims[0].size), kSize(_input->dims[0].size), vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), vProjSize(_vdim), oProjSize(_embed_dim), @@ -435,9 +333,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( scaling_query(_scaling_query), scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias), quantization_type(_quantization_type), offload(_offload), - tensor_parallelism_degree(_tensor_parallelism_degree) -// bias_initializer(_bias_initializer) -{ + tensor_parallelism_degree(_tensor_parallelism_degree) { numOutputs = 1; int numdim = _input->num_dims; ParallelDim dims[MAX_TENSOR_DIM]; @@ -445,64 +341,13 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( dims[i] = _input->dims[i]; } dims[0].size = _embed_dim; - // Currently require no parallelism along this dim + // Currently require no parallelism along this dim, is this aligned with the + // previous removal of assert? assert(dims[0].degree == 1); - if (allocate_weights) { - // Create weight tensor - int num_dims = inputs[0]->num_dims; - // Compute weight size - int qParas = this->qProjSize * this->qSize; - int kParas = this->kProjSize * this->kSize; - int vParas = this->vProjSize * this->vSize; - int oParas = - this->oProjSize * (this->vProjSize > 0 ? 
this->vProjSize : this->vSize); - ParallelDim dims[2]; - dims[0] = inputs[0]->dims[num_dims - 2]; - dims[0].size = dims[0].degree; - dims[1] = inputs[0]->dims[num_dims - 1]; - dims[1].size = this->num_q_heads * (qParas + oParas) + - this->num_q_heads * (kParas + vParas); - dims[1].is_replica_dim = false; - // dims[2].size = qParas + kParas + vParas + oParas; - if (quantization_type != DT_NONE) { - dims[1].size = get_quantization_to_byte_size( - data_type, quantization_type, dims[1].size); - } - int seed = std::rand(); - Initializer *initializer = new GlorotUniform(seed); - weights[0] = model.create_parallel_weight<2>( - dims, - quantization_type == DT_NONE ? this->data_type : quantization_type, - NULL /*owner_op*/, - true /*create_grad*/, - initializer, - CHOSEN_SYNC_TYPE); - if (qkv_bias || final_bias) { - ParallelTensorShape bias_shape = _input->get_shape(); - int qkv_bias_size = - qProjSize * num_q_heads + (kProjSize + vProjSize) * num_q_heads; - bias_shape.dims[0].size = - (qkv_bias ? qkv_bias_size : 0) + (final_bias ? oProjSize : 0); - bias_shape.dims[1].size = bias_shape.dims[2].size = 1; - weights[1] = - model.create_parallel_weight_legion_ordering(bias_shape.num_dims, - bias_shape.dims, - this->data_type, - nullptr /*owner_op*/, - true /*create_grad*/, - initializer, - CHOSEN_SYNC_TYPE); - } - } outputs[0] = model.create_parallel_tensor_legion_ordering( _input->num_dims, dims, this->data_type, this); - /* for (int i = 0; i < numdim; i++) { */ - /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ - /* } */ - /* register_output_weight_parallel_dims(outputs[0], numdim-1, _weight, 1); */ - /* register_output_weight_parallel_dims(outputs[0], numdim-2, _weight, 2); */ // Check correctness /* assert(check_output_input_weight_parallel_dims()); */ } @@ -510,8 +355,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( FFModel &model, TreeIncMultiHeadSelfAttention const &other, - const ParallelTensor input, - bool allocate_weights) + const ParallelTensor input) : TreeIncMultiHeadSelfAttention(model, other.layer_guid, input, @@ -521,15 +365,12 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( other.qProjSize, other.vProjSize, other.dropout, - other.qkv_bias, - other.final_bias, other.add_zero_attn, - other.apply_rotary_embedding, + other.rotary_embedding_meta, other.scaling_query, other.scaling_factor, other.qk_prod_scaling, other.position_bias, - allocate_weights, other.quantization_type, other.offload, other.tensor_parallelism_degree, @@ -539,7 +380,6 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( FFModel &model, TreeIncMultiHeadSelfAttentionParams const ¶ms, ParallelTensor const &input, - bool allocate_weights, char const *name) : TreeIncMultiHeadSelfAttention(model, params.layer_guid, @@ -550,15 +390,12 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( params.kdim, params.vdim, params.dropout, - params.qkv_bias, - params.final_bias, params.add_zero_attn, - params.apply_rotary_embedding, + params.rotary_embedding_meta, params.scaling_query, params.scaling_factor, params.qk_prod_scaling, params.position_bias, - allocate_weights, params.quantization_type, params.offload, params.tensor_parallelism_degree, @@ -592,20 +429,12 @@ void TreeIncMultiHeadSelfAttention::init_inference( EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(0, FID_DATA); - launcher.add_region_requirement( - RegionRequirement(weights[0]->part, - 0 /*projection id*/, - 
READ_ONLY, - EXCLUSIVE, - weights[0]->region, - ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); - launcher.add_field(1, FID_DATA); launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[0]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(1, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); @@ -633,18 +462,12 @@ void TreeIncMultiHeadSelfAttention::init(FFModel const &ff) { EXCLUSIVE, inputs[0]->region)); launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); - launcher.add_field(1, FID_DATA); launcher.add_region_requirement(RegionRequirement(outputs[0]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, outputs[0]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(1, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap(ff, fm); @@ -652,8 +475,7 @@ void TreeIncMultiHeadSelfAttention::init(FFModel const &ff) { /* regions[0](I): input - regions[1](I): weight - regions[2](O): output + regions[1](O): output */ OpMeta *TreeIncMultiHeadSelfAttention::init_task( Task const *task, @@ -671,17 +493,10 @@ OpMeta *TreeIncMultiHeadSelfAttention::init_task( FID_DATA, ctx, runtime); - GenericTensorAccessorR weight = - helperGetGenericTensorAccessorRO(attn->weights[0]->data_type, - regions[1], - task->regions[1], - FID_DATA, - ctx, - runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(attn->outputs[0]->data_type, - regions[2], - task->regions[2], + regions[1], + task->regions[1], FID_DATA, ctx, runtime); @@ -689,14 +504,12 @@ OpMeta *TreeIncMultiHeadSelfAttention::init_task( int num_samples = input.domain.hi()[2] - input.domain.lo()[2] + 1; assert(attn->qoSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); assert(attn->kvSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); - // int num_q_heads = weight.domain.hi()[1] - weight.domain.lo()[1] + 1; + int num_q_heads = attn->num_q_heads / attn->tensor_parallelism_degree; int num_kv_heads = attn->num_kv_heads / attn->tensor_parallelism_degree + (attn->num_kv_heads % attn->tensor_parallelism_degree != 0); - assert(attn->oProjSize == output.domain.hi()[0] - output.domain.lo()[0] + 1); - Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); MemoryAllocator gpu_mem_allocator(gpu_mem); if (attn->offload) { @@ -705,14 +518,8 @@ OpMeta *TreeIncMultiHeadSelfAttention::init_task( gpu_mem_allocator.register_reserved_work_space( handle.offload_reserve_space, handle.offload_reserve_space_size); } - TreeIncMultiHeadSelfAttentionMeta *m = - new TreeIncMultiHeadSelfAttentionMeta(handle, - attn, - weight, - gpu_mem_allocator, - num_samples, - num_q_heads, - num_kv_heads); + TreeIncMultiHeadSelfAttentionMeta *m = new TreeIncMultiHeadSelfAttentionMeta( + handle, attn, gpu_mem_allocator, num_samples, num_q_heads, num_kv_heads); if (!attn->offload) { // assert that we didn't over allocate memory assert(gpu_mem_allocator.reserved_allocated_size == @@ -723,10 +530,6 @@ OpMeta *TreeIncMultiHeadSelfAttention::init_task( std::strcpy(m->op_name, attn->name); m->layer_guid = attn->layer_guid; - if (attn->quantization_type == DT_NONE) { - assert(weight.domain.get_volume() * data_type_size(weight.data_type) == - m->weightSize); - } 
return m; } @@ -764,37 +567,18 @@ FutureMap TreeIncMultiHeadSelfAttention::inference( EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(idx++, FID_DATA); - launcher.add_region_requirement( - RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region, - ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); - launcher.add_field(idx++, FID_DATA); launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[0]->region)); launcher.add_field(idx++, FID_DATA); - if (qkv_bias || final_bias) { - launcher.add_region_requirement( - RegionRequirement(weights[1]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[1]->region, - ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); - launcher.add_field(idx++, FID_DATA); - } return runtime->execute_index_space(ctx, launcher); } /* regions[0](I): input - regions[3](I): weight - regions[4](O): output + regions[1](O): output */ void TreeIncMultiHeadSelfAttention::inference_task( Task const *task, @@ -815,37 +599,19 @@ void TreeIncMultiHeadSelfAttention::inference_task( TreeIncMultiHeadSelfAttentionMeta *m = *((TreeIncMultiHeadSelfAttentionMeta **)task->local_args); - assert(((*m->qkv_bias || *m->final_bias) ? regions.size() == 4 - : regions.size() == 3)); + assert(regions.size() == 2); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - biases = helperGetGenericTensorAccessorRO(m->weight_type[1], - regions[3], - task->regions[3], - FID_DATA, - ctx, - runtime); - Domain bias_domain = runtime->get_index_space_domain( - ctx, task->regions[3].region.get_index_space()); - assert(bias_domain.get_dim() == 4); - } + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - Domain weight_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); Domain output_domain = runtime->get_index_space_domain( - ctx, task->regions[2].region.get_index_space()); + ctx, task->regions[1].region.get_index_space()); assert(input_domain.get_dim() == 4); - assert(weight_domain.get_dim() == 2); assert(output_domain.get_dim() == 4); /* print_tensor(input.get_float_ptr(), @@ -855,18 +621,13 @@ void TreeIncMultiHeadSelfAttention::inference_task( assert(task->index_point.get_dim() == 1); TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( - m, &bc, task->index_point.point_data[0], input, weight, output, biases); + m, &bc, task->index_point.point_data[0], input, output); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; - std::vector weights_accessors; - weights_accessors.push_back(weight); - if (*m->qkv_bias || *m->final_bias) { - weights_accessors.push_back(biases); - } TreeIncMultiHeadSelfAttention::save_inference_tensors_to_file( - m, shard_id, &bc, {input}, weights_accessors, {output}); + m, shard_id, &bc, {input}, {}, {output}); } } @@ -896,9 +657,20 @@ bool operator==(TreeIncMultiHeadSelfAttentionParams 
const &lhs, return lhs.layer_guid == rhs.layer_guid && lhs.embed_dim == rhs.embed_dim && lhs.num_q_heads == rhs.num_q_heads && lhs.kdim == rhs.kdim && lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout && - lhs.qkv_bias == rhs.qkv_bias && lhs.final_bias == rhs.final_bias && lhs.add_zero_attn == rhs.add_zero_attn && - lhs.apply_rotary_embedding == rhs.apply_rotary_embedding && + lhs.rotary_embedding_meta.apply_rotary_embedding == + rhs.rotary_embedding_meta.apply_rotary_embedding && + lhs.rotary_embedding_meta.rope_theta == + rhs.rotary_embedding_meta.rope_theta && + lhs.rotary_embedding_meta.rope_type == + rhs.rotary_embedding_meta.rope_type && + lhs.rotary_embedding_meta.factor == rhs.rotary_embedding_meta.factor && + lhs.rotary_embedding_meta.low_freq_factor == + rhs.rotary_embedding_meta.low_freq_factor && + lhs.rotary_embedding_meta.high_freq_factor == + rhs.rotary_embedding_meta.high_freq_factor && + lhs.rotary_embedding_meta.original_max_position_embeddings == + rhs.rotary_embedding_meta.original_max_position_embeddings && lhs.scaling_query == rhs.scaling_query && lhs.scaling_factor == rhs.scaling_factor && lhs.qk_prod_scaling == rhs.qk_prod_scaling && @@ -915,10 +687,8 @@ TreeIncMultiHeadSelfAttentionParams params.kdim = this->kProjSize; params.vdim = this->vProjSize; params.dropout = this->dropout; - params.qkv_bias = this->qkv_bias; - params.final_bias = this->final_bias; params.add_zero_attn = this->add_zero_attn; - params.apply_rotary_embedding = this->apply_rotary_embedding; + params.rotary_embedding_meta = this->rotary_embedding_meta; params.scaling_query = this->scaling_query; params.scaling_factor = this->scaling_factor; params.qk_prod_scaling = this->qk_prod_scaling; @@ -943,10 +713,15 @@ size_t hash::operator()( hash_combine(key, params.kdim); hash_combine(key, params.vdim); hash_combine(key, params.dropout); - hash_combine(key, params.qkv_bias); - hash_combine(key, params.final_bias); hash_combine(key, params.add_zero_attn); - hash_combine(key, params.apply_rotary_embedding); + hash_combine(key, params.rotary_embedding_meta.apply_rotary_embedding); + hash_combine(key, params.rotary_embedding_meta.rope_theta); + hash_combine(key, params.rotary_embedding_meta.rope_type); + hash_combine(key, params.rotary_embedding_meta.factor); + hash_combine(key, params.rotary_embedding_meta.low_freq_factor); + hash_combine(key, params.rotary_embedding_meta.high_freq_factor); + hash_combine(key, + params.rotary_embedding_meta.original_max_position_embeddings); hash_combine(key, params.scaling_query); hash_combine(key, params.scaling_factor); hash_combine(key, params.qk_prod_scaling); diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 890d32bc87..50e2311ca8 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -17,7 +17,6 @@ #include "flexflow/ffconst_utils.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" -#include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/utils/hip_helper.h" #include #include @@ -519,300 +518,6 @@ __global__ void tree_fill_entries_above_diagonal(DT *matrix, } } -template -void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, - TreeVerifyBatchConfig const *bc, - int shard_id, - DT *output_ptr, - DT const *bias_ptr, - DT const *weight_ptr, - hipStream_t stream) { - checkCUDA(hipblasSetStream(m->handle.blas, 
stream)); - checkCUDNN(miopenSetStream(m->handle.dnn, stream)); - hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); - miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); - assert(data_type_size(m->output_type[0]) == sizeof(DT)); - hipblasDatatype_t compute_type = hipblas_data_type; - // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - // hipblasDatatype_t compute_type = hipblas_data_type; - // #else - // // TODO: currently use the hipblas_data_type - // // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - // hipblasDatatype_t compute_type = hipblas_data_type; - // #endif - // int num_requests = bc->num_active_requests(); - int processed_tokens_in_batch = 0; - // int qkv_block_size = - // (m->qProjSize + m->kProjSize + m->vProjSize) * bc->num_active_tokens(); - int q_block_size = m->qProjSize; - int kt_block_size = m->kProjSize; - int kt_req_block_size = - kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length() + - BatchConfig::max_spec_tree_token_num(); - int vt_block_size = m->vProjSize; - int vt_req_block_size = - vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length() + - BatchConfig::max_spec_tree_token_num(); - assert(m->qProjSize == m->kProjSize); - - for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } - assert(processed_tokens_in_batch == - bc->requestsInfo[i].first_token_offset_in_batch); - int last_token_idx_of_the_request = - processed_tokens_in_batch + bc->requestsInfo[i].num_tokens_in_batch - 1; - while (processed_tokens_in_batch <= last_token_idx_of_the_request) { - int num_new_tokens = 1; - int j = processed_tokens_in_batch; - while ((j + 1 <= last_token_idx_of_the_request) && - (bc->tokensInfo[j].abs_depth_in_request + 1 == - bc->tokensInfo[j + 1].abs_depth_in_request)) { - j++; - num_new_tokens++; - } - - int total_tokens_in_request = bc->tokensInfo[j].abs_depth_in_request + 1; - assert(num_new_tokens >= 1 && total_tokens_in_request >= num_new_tokens); - { - // update K-V cache - int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_new_tokens; - hipLaunchKernelGGL( - HIP_KERNEL_NAME(update_tree_branch_kv_cache
<DT>), - GET_BLOCKS(parallelism), - min(CUDA_NUM_THREADS, parallelism), - 0, - stream, - static_cast<DT *>(m->devQKVProjArray),
- static_cast<DT *>(m->keyCache),
- static_cast<DT *>
(m->valueCache), - m->token_infos, - m->qProjSize, - m->kProjSize, - m->vProjSize, - num_new_tokens, // num_tokens_in_branch - processed_tokens_in_batch, // num_processed_tokens_in_batch - m->num_active_infr_tokens, // total_tokens_in_batch - BatchConfig::max_sequence_length(), - m->hidden_size); - } - - // bc->token_last_available_idx[i] + 1; - // Compute (QK^T/sqrt(d_k)) - int m_ = num_new_tokens; - int n = total_tokens_in_request; - int k = m->qProjSize; - int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, - ldc = m_; - int strideA = q_block_size; - int strideB = kt_block_size; - int strideC = num_new_tokens * total_tokens_in_request; - - // a flag of using this scaling alpha - DT alpha = 1.0f, beta = 0.0f; - if (*m->qk_prod_scaling) { - alpha = static_cast
(1.0f / sqrt(m->kProjSize)); - } - // To get A, skip over Q entries from previous requests (same head) - DT const *A = static_cast
(m->devQKVProjArray) + - processed_tokens_in_batch * m->qProjSize * m->num_q_heads * - QKV_WEIGHT_NUM; - // To get B, skip over K entries from previous requests (all heads + - // padding) - DT const *B = static_cast
(m->keyCache) + i * kt_req_block_size; - // To get C, skip over QK^T products from previous requests - DT *C = static_cast
(m->qk_prods); - - checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - strideA, - B, - hipblas_data_type, - ldb, - strideB, - &beta, - C, - hipblas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - - if (*m->position_bias) { - size_t parallelism = - m->num_q_heads * total_tokens_in_request * num_new_tokens; - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_position_bias_qkprd
), - GET_BLOCKS(parallelism), - min((size_t)CUDA_NUM_THREADS, parallelism), - 0, - stream, - C, - num_new_tokens, - total_tokens_in_request, - m->num_q_heads, - m->global_num_q_heads, - shard_id); - } - - // Fill all elements above diagonal in qk prods with -inf to force - // causal attention. - assert(num_new_tokens <= total_tokens_in_request); - if (num_new_tokens > 1) { - size_t parallelism = - m->num_q_heads * num_new_tokens * total_tokens_in_request; - hipLaunchKernelGGL( - HIP_KERNEL_NAME(tree_fill_entries_above_diagonal
<DT>), - GET_BLOCKS(parallelism), - min((size_t)CUDA_NUM_THREADS, parallelism), - 0, - stream, - C, - num_new_tokens, - total_tokens_in_request, - m->num_q_heads, - static_cast<DT>
(-INFINITY)); - } - // Compute Softmax(QK^T/sqrt(d_k)) - // Before modifying the parameters below, make sure to read the following - // description of the CUDNN_TENSOR_NCHW tensor layout, from - // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: - // This tensor format specifies that the data is laid out in the following - // order: batch size, feature maps, rows, columns. The strides are - // implicitly defined in such a way that the data are contiguous in memory - // with no padding between images, feature maps, rows, and columns; the - // columns are the inner dimension and the images are the outermost - // dimension. - int n_param = m->num_q_heads; - int c_param = total_tokens_in_request; - int h_param = 1; - int w_param = num_new_tokens; - checkCUDNN(miopenSet4dTensorDescriptor( - m->qk_tensor, miopen_data_type, n_param, c_param, h_param, w_param)); - float softmax_alpha = 1.0f, softmax_beta = 0.0f; - DT *C_softmax = static_cast
(m->qk_prods_softmax); - // The softmax operation below is executed according to the - // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The - // softmax operation is computed per spatial location (H,W) per image (N) - // across dimension C. - checkCUDNN(miopenSoftmaxForward_V2(m->handle.dnn, - &softmax_alpha, - m->qk_tensor, - C, - &softmax_beta, - m->qk_tensor, - C_softmax, - MIOPEN_SOFTMAX_ACCURATE, - MIOPEN_SOFTMAX_MODE_CHANNEL)); - // Matmul softmax(QK^T/sqrt(d_k)) by V - alpha = 1.0f, beta = 0.0f; - m_ = m->vProjSize; - n = num_new_tokens; - k = total_tokens_in_request; - lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; - strideA = vt_block_size; - strideB = num_new_tokens * total_tokens_in_request; - strideC = m->vProjSize; - // To get A, skip over V^T entries from previous requests (all heads + - // padding) - A = static_cast
(m->valueCache) + i * vt_req_block_size; - // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous - // requests (all heads) - B = C_softmax; - // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous - // requests - C = static_cast
(m->attn_heads) + - processed_tokens_in_batch * m->num_q_heads * m->vProjSize; - checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, - HIPBLAS_OP_N, - HIPBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - strideA, - B, - hipblas_data_type, - ldb, - strideB, - &beta, - C, - hipblas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - processed_tokens_in_batch += num_new_tokens; - } - // Before moving to the next request - // check that we have finished all tokens of the request - assert(last_token_idx_of_the_request + 1 == processed_tokens_in_batch); - } - // Project to output, save result directly on output tensor - DT alpha = 1.0f, beta = 0.0f; - int m_ = m->oProjSize; - int k = m->vProjSize * m->num_q_heads; - int n = processed_tokens_in_batch; - int lda = k, ldb = k, ldc = m_; - DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_q_heads + - m->vProjSize * m->num_q_heads); - DT const *B = static_cast
(m->attn_heads); - DT *C = static_cast
(output_ptr); - - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - B, - hipblas_data_type, - ldb, - &beta, - C, - hipblas_data_type, - ldc, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - - if (*m->final_bias && shard_id == 0) { - int parallelism = m->oProjSize * processed_tokens_in_batch; - int qkv_weight_size = m->qProjSize * m->global_num_q_heads + - m->kProjSize * m->global_num_q_heads + - m->vProjSize * m->global_num_q_heads; - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_w
), - GET_BLOCKS(parallelism), - min(CUDA_NUM_THREADS, parallelism), - 0, - stream, - output_ptr, - bias_ptr, - processed_tokens_in_batch, - qkv_weight_size, - m->oProjSize); - } - - assert(processed_tokens_in_batch == bc->num_active_infr_tokens()); -} - #define LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL( \ DT, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE, THDS_PER_BLOCK, stream) \ smem_size_in_bytes_tree
(m->qProjSize, \ @@ -895,27 +600,10 @@ template void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, TreeVerifyBatchConfig const *bc, int shard_id, - DT const *input_ptr, - DT const *weight_ptr, + DT const *qkv_ptr, DT *output_ptr, - DT const *bias_ptr, hipStream_t stream) { - // additional processing for weight uploading - if (m->handle.offload_reserve_space != nullptr) { - // Note that we update weight_ptr and bias_ptr when uploading weight and - // bias - checkCUDA(hipMemcpyAsync(m->weight_ptr, - weight_ptr, - m->weightSize, - hipMemcpyHostToDevice, - stream)); - weight_ptr = static_cast
(m->weight_ptr); - if (m->biasSize > 0) { - checkCUDA(hipMemcpyAsync( - m->bias_ptr, bias_ptr, m->biasSize, hipMemcpyHostToDevice, stream)); - bias_ptr = static_cast
(m->bias_ptr); - } - } + // copy committed tokens info to GPU for the commit_tokens kernel // Note that m->num_active_infr_tokens stores the number of active // tokens in the previous batch, which is needed for committing @@ -929,39 +617,36 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, // tokens for the current batch m->num_active_infr_tokens = bc->num_active_infr_tokens(); - // here because we need postion info in infernece 1 - if (m->offload && m->biasSize > 0) { - checkCUDA(hipMemcpyAsync( - m->bias_ptr, bias_ptr, m->biasSize, hipMemcpyHostToDevice, stream)); - bias_ptr = static_cast
(m->bias_ptr); - } + // phase 0: copy calculated qkv into devQKVProjArray + // [qProjSize, num_heads, 3, num_new_tokens] + size_t qkv_proj_size = + m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens(); + + hipMemcpyAsync(m->devQKVProjArray, + qkv_ptr, + qkv_proj_size * + sizeof(DT), // is this right, do we need layers etc here + hipMemcpyDeviceToDevice, + stream); + // phase 1: Implement kernel to compute KQV for input tokens - compute_qkv_kernel(m, - bc, - shard_id, - input_ptr, - weight_ptr, - static_cast
<DT *>(m->devQKVProjArray), - bias_ptr, - stream); + // TODO WARNING: this is commented out only because we are fixing the inc_attn + // first
+ compute_qkv_kernel( + m, bc, shard_id, static_cast<DT *>(m->devQKVProjArray), stream); // phase 2: No need to update key/val cache - // IncMultiHeadSelfAttention::update_kv_cache_kernel( - // m, bc, stream); - // use the new kernel compute_attention_kernel_fused
<DT>( m, bc, static_cast<DT *>
(m->attn_heads), stream); int processed_tokens_in_batch = bc->num_active_tokens(); - compute_o_prod_bias(m, - bc, - shard_id, - output_ptr, - weight_ptr, - bias_ptr, - processed_tokens_in_batch, - stream); + int num_tokens = bc->num_active_tokens(); + hipMemcpyAsync(output_ptr, + m->attn_heads, + m->oProjSize * num_tokens * sizeof(DT), + hipMemcpyDeviceToDevice, + stream); } } // namespace TreeIncMultiHeadAttention @@ -973,12 +658,9 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( TreeVerifyBatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, - GenericTensorAccessorR const &weight, - GenericTensorAccessorW const &output, - GenericTensorAccessorR const &bias) { + GenericTensorAccessorW const &output) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - bool use_bias = *m->qkv_bias || *m->final_bias; hipEvent_t t_start, t_end; if (m->profiling) { @@ -987,44 +669,14 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( checkCUDA(hipEventRecord(t_start, stream)); } - // assert(input.data_type == weight.data_type); assert(input.data_type == output.data_type); - if (use_bias) { - assert(input.data_type == bias.data_type); - } if (input.data_type == DT_HALF) { - if (m->offload) { - pre_build_weight_kernel(m, weight, input.data_type, stream); - } - - half const *bias_ptr = - use_bias ? bias.get_half_ptr() : static_cast(nullptr); Kernels::TreeIncMultiHeadAttention::inference_kernel( - m, - bc, - shard_id, - input.get_half_ptr(), - m->offload ? static_cast(m->weight_ptr) : weight.get_half_ptr(), - output.get_half_ptr(), - bias_ptr, - stream); + m, bc, shard_id, input.get_half_ptr(), output.get_half_ptr(), stream); } else if (input.data_type == DT_FLOAT) { - if (m->offload) { - pre_build_weight_kernel(m, weight, input.data_type, stream); - } - float const *bias_ptr = - use_bias ? bias.get_float_ptr() : static_cast(nullptr); Kernels::TreeIncMultiHeadAttention::inference_kernel( - m, - bc, - shard_id, - input.get_float_ptr(), - m->offload ? 
static_cast(m->weight_ptr) - : weight.get_float_ptr(), - output.get_float_ptr(), - bias_ptr, - stream); + m, bc, shard_id, input.get_float_ptr(), output.get_float_ptr(), stream); } else { assert(false && "Unspported data type"); } @@ -1037,16 +689,12 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( checkCUDA(hipEventDestroy(t_start)); checkCUDA(hipEventDestroy(t_end)); printf("TreeIncMultiHeadSelfAttention forward time = %.2fms\n", elapsed); - // print_tensor<3, float>(acc_query.ptr, acc_query.rect, - // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, - // acc_output.rect, "[Attention:forward:output]"); } } TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( FFHandler handler, TreeIncMultiHeadSelfAttention const *attn, - GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, int _num_q_heads, @@ -1061,14 +709,11 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( attn->kProjSize, attn->vProjSize, attn->oProjSize, - attn->apply_rotary_embedding, - attn->qkv_bias, + attn->rotary_embedding_meta, attn->scaling_query, attn->qk_prod_scaling, attn->position_bias, - attn->final_bias, attn->scaling_factor, - weight, gpu_mem_allocator, num_samples, attn->num_q_heads, diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 86c53d7ea1..8c643b1964 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -494,303 +494,6 @@ __global__ void tree_fill_entries_above_diagonal(DT *matrix, } } -template -void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, - TreeVerifyBatchConfig const *bc, - int shard_id, - DT *output_ptr, - DT const *bias_ptr, - DT const *weight_ptr, - cudaStream_t stream) { - checkCUDA(cublasSetStream(m->handle.blas, stream)); - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); - cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); - assert(data_type_size(m->output_type[0]) == sizeof(DT)); - cudaDataType_t compute_type = cublas_data_type; - // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - // cudaDataType_t compute_type = cublas_data_type; - // #else - // // For best performance, set the default cublas compute type to - // // CUBLAS_COMPUTE_16F for half precision and to - // // CUBLAS_COMPUTE_32F_FAST_16F for full precision - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - // if (m->output_type[0] == DT_FLOAT) { - // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - // } - // #endif - // int num_requests = bc->num_active_requests(); - int processed_tokens_in_batch = 0; - // int qkv_block_size = - // (m->qProjSize + m->kProjSize + m->vProjSize) * bc->num_active_tokens(); - int q_block_size = m->qProjSize; - int kt_block_size = m->kProjSize; - int kt_req_block_size = - kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length() + - BatchConfig::max_spec_tree_token_num(); - int vt_block_size = m->vProjSize; - int vt_req_block_size = - vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length() + - BatchConfig::max_spec_tree_token_num(); - assert(m->qProjSize == m->kProjSize); - - for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } - assert(processed_tokens_in_batch == - bc->requestsInfo[i].first_token_offset_in_batch); - int last_token_idx_of_the_request = - 
processed_tokens_in_batch + bc->requestsInfo[i].num_tokens_in_batch - 1; - while (processed_tokens_in_batch <= last_token_idx_of_the_request) { - int num_new_tokens = 1; - int j = processed_tokens_in_batch; - while ((j + 1 <= last_token_idx_of_the_request) && - (bc->tokensInfo[j].abs_depth_in_request + 1 == - bc->tokensInfo[j + 1].abs_depth_in_request)) { - j++; - num_new_tokens++; - } - - int total_tokens_in_request = bc->tokensInfo[j].abs_depth_in_request + 1; - assert(num_new_tokens >= 1 && total_tokens_in_request >= num_new_tokens); - { - // update K-V cache - int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_new_tokens; - update_tree_branch_kv_cache<<>>( - static_cast
<DT *>(m->devQKVProjArray),
- static_cast<DT *>(m->keyCache),
- static_cast<DT *>
(m->valueCache), - m->token_infos, - m->qProjSize, - m->kProjSize, - m->vProjSize, - num_new_tokens, // num_tokens_in_branch - processed_tokens_in_batch, // num_processed_tokens_in_batch - m->num_active_infr_tokens, // total_tokens_in_batch - BatchConfig::max_sequence_length(), - m->hidden_size); - } - - // bc->token_last_available_idx[i] + 1; - // Compute (QK^T/sqrt(d_k)) - int m_ = num_new_tokens; - int n = total_tokens_in_request; - int k = m->qProjSize; - int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, - ldc = m_; - int strideA = q_block_size; - int strideB = kt_block_size; - int strideC = num_new_tokens * total_tokens_in_request; - - // a flag of using this scaling alpha - DT alpha = 1.0f, beta = 0.0f; - if (*m->qk_prod_scaling) { - alpha = static_cast
(1.0f / sqrt(m->kProjSize)); - } - // To get A, skip over Q entries from previous requests (same head) - DT const *A = static_cast
(m->devQKVProjArray) + - processed_tokens_in_batch * m->qProjSize * m->num_q_heads * - QKV_WEIGHT_NUM; - // To get B, skip over K entries from previous requests (all heads + - // padding) - DT const *B = static_cast
(m->keyCache) + i * kt_req_block_size; - // To get C, skip over QK^T products from previous requests - DT *C = static_cast
(m->qk_prods); - - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // add alibi position bias to qk production - // add alibi position bias to qk production - if (*m->position_bias) { - size_t parallelism = - m->num_q_heads * total_tokens_in_request * num_new_tokens; - apply_position_bias_qkprd<<>>(C, - num_new_tokens, - total_tokens_in_request, - m->num_q_heads, - m->global_num_q_heads, - shard_id); - } - - // Fill all elements above diagonal in qk prods with -inf to force - // causal attention. - assert(num_new_tokens <= total_tokens_in_request); - if (num_new_tokens > 1) { - size_t parallelism = - m->num_q_heads * num_new_tokens * total_tokens_in_request; - tree_fill_entries_above_diagonal<<>>( - C, - num_new_tokens, - total_tokens_in_request, - m->num_q_heads, - static_cast
(-INFINITY)); - } - // Compute Softmax(QK^T/sqrt(d_k)) - // Before modifying the parameters below, make sure to read the following - // description of the CUDNN_TENSOR_NCHW tensor layout, from - // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: - // This tensor format specifies that the data is laid out in the following - // order: batch size, feature maps, rows, columns. The strides are - // implicitly defined in such a way that the data are contiguous in memory - // with no padding between images, feature maps, rows, and columns; the - // columns are the inner dimension and the images are the outermost - // dimension. - int n_param = m->num_q_heads; - int c_param = total_tokens_in_request; - int h_param = 1; - int w_param = num_new_tokens; - checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, - CUDNN_TENSOR_NCHW, - cudnn_data_type, - n_param, - c_param, - h_param, - w_param)); - float softmax_alpha = 1.0f, softmax_beta = 0.0f; - DT *C_softmax = static_cast
(m->qk_prods_softmax); - // The softmax operation below is executed according to the - // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The - // softmax operation is computed per spatial location (H,W) per image (N) - // across dimension C. - checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &softmax_alpha, - m->qk_tensor, - C, - &softmax_beta, - m->qk_tensor, - C_softmax)); - // Matmul softmax(QK^T/sqrt(d_k)) by V - alpha = 1.0f, beta = 0.0f; - m_ = m->vProjSize; - n = num_new_tokens; - k = total_tokens_in_request; - lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; - strideA = vt_block_size; - strideB = num_new_tokens * total_tokens_in_request; - strideC = m->vProjSize; - // To get A, skip over V^T entries from previous requests (all heads + - // padding) - A = static_cast
(m->valueCache) + i * vt_req_block_size; - // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous - // requests (all heads) - B = C_softmax; - // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous - // requests - C = static_cast
(m->attn_heads) + - processed_tokens_in_batch * m->num_q_heads * m->vProjSize; - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - processed_tokens_in_batch += num_new_tokens; - } - // Before moving to the next request - // check that we have finished all tokens of the request - assert(last_token_idx_of_the_request + 1 == processed_tokens_in_batch); - } - // Project to output, save result directly on output tensor - DT alpha = 1.0f, beta = 0.0f; - int m_ = m->oProjSize; - int k = m->vProjSize * m->num_q_heads; - int n = processed_tokens_in_batch; - int lda = k, ldb = k, ldc = m_; - DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_q_heads + - m->vProjSize * m->num_q_heads); - DT const *B = static_cast
(m->attn_heads); - DT *C = static_cast
(output_ptr); - - checkCUDA(cublasGemmEx(m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - B, - cublas_data_type, - ldb, - &beta, - C, - cublas_data_type, - ldc, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - - if (*m->final_bias && shard_id == 0) { - int parallelism = m->oProjSize * processed_tokens_in_batch; - int qkv_weight_size = m->qProjSize * m->global_num_q_heads + - m->kProjSize * m->global_num_q_heads + - m->vProjSize * m->global_num_q_heads; - apply_proj_bias_w<<>>(output_ptr, - bias_ptr, - processed_tokens_in_batch, - qkv_weight_size, - m->oProjSize); - } - - assert(processed_tokens_in_batch == bc->num_active_infr_tokens()); -} - #define LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL( \ DT, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE, THDS_PER_BLOCK, stream) \ smem_size_in_bytes_tree
(m->qProjSize, \ @@ -873,27 +576,9 @@ template void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, TreeVerifyBatchConfig const *bc, int shard_id, - DT const *input_ptr, - DT const *weight_ptr, + DT const *qkv_ptr, DT *output_ptr, - DT const *bias_ptr, cudaStream_t stream) { - // additional processing for weight uploading - if (m->handle.offload_reserve_space != nullptr) { - // Note that we update weight_ptr and bias_ptr when uploading weight and - // bias - cudaMemcpyAsync(m->weight_ptr, - weight_ptr, - m->weightSize, - cudaMemcpyHostToDevice, - stream); - weight_ptr = static_cast
(m->weight_ptr); - if (m->biasSize > 0) { - cudaMemcpyAsync( - m->bias_ptr, bias_ptr, m->biasSize, cudaMemcpyHostToDevice, stream); - bias_ptr = static_cast
(m->bias_ptr); - } - } // copy committed tokens info to GPU for the commit_tokens kernel // Note that m->num_active_infr_tokens stores the number of active @@ -908,39 +593,36 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, // tokens for the current batch m->num_active_infr_tokens = bc->num_active_infr_tokens(); - // here because we need postion info in infernece 1 - if (m->offload && m->biasSize > 0) { - cudaMemcpyAsync( - m->bias_ptr, bias_ptr, m->biasSize, cudaMemcpyHostToDevice, stream); - bias_ptr = static_cast
(m->bias_ptr); - } + // phase 0: copy calculated qkv into devQKVProjArray + // [qProjSize, num_heads, 3, num_new_tokens] + size_t qkv_proj_size = + m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens(); + + cudaMemcpyAsync(m->devQKVProjArray, + qkv_ptr, + qkv_proj_size * + sizeof(DT), // is this right, do we need layers etc here + cudaMemcpyDeviceToDevice, + stream); + // phase 1: Implement kernel to compute KQV for input tokens - compute_qkv_kernel(m, - bc, - shard_id, - input_ptr, - weight_ptr, - static_cast
<DT *>(m->devQKVProjArray), - bias_ptr, - stream); + // TODO WARNING: this is commented out only because we are fixing the inc_attn + // first
+ compute_qkv_kernel( + m, bc, shard_id, static_cast<DT *>(m->devQKVProjArray), stream); // phase 2: No need to update key/val cache - // IncMultiHeadSelfAttention::update_kv_cache_kernel( - // m, bc, stream); - // use the new kernel compute_attention_kernel_fused
<DT>( m, bc, static_cast<DT *>
(m->attn_heads), stream); int processed_tokens_in_batch = bc->num_active_tokens(); - compute_o_prod_bias(m, - bc, - shard_id, - output_ptr, - weight_ptr, - bias_ptr, - processed_tokens_in_batch, - stream); + int num_tokens = bc->num_active_tokens(); + cudaMemcpyAsync(output_ptr, + m->attn_heads, + m->oProjSize * num_tokens * sizeof(DT), + cudaMemcpyDeviceToDevice, + stream); } } // namespace TreeIncMultiHeadAttention @@ -952,12 +634,9 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( TreeVerifyBatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, - GenericTensorAccessorR const &weight, - GenericTensorAccessorW const &output, - GenericTensorAccessorR const &bias) { + GenericTensorAccessorW const &output) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - bool use_bias = *m->qkv_bias || *m->final_bias; cudaEvent_t t_start, t_end; if (m->profiling) { @@ -966,44 +645,14 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( cudaEventRecord(t_start, stream); } - // assert(input.data_type == weight.data_type); assert(input.data_type == output.data_type); - if (use_bias) { - assert(input.data_type == bias.data_type); - } if (input.data_type == DT_HALF) { - if (m->offload) { - pre_build_weight_kernel(m, weight, input.data_type, stream); - } - - half const *bias_ptr = - use_bias ? bias.get_half_ptr() : static_cast(nullptr); Kernels::TreeIncMultiHeadAttention::inference_kernel( - m, - bc, - shard_id, - input.get_half_ptr(), - m->offload ? static_cast(m->weight_ptr) : weight.get_half_ptr(), - output.get_half_ptr(), - bias_ptr, - stream); + m, bc, shard_id, input.get_half_ptr(), output.get_half_ptr(), stream); } else if (input.data_type == DT_FLOAT) { - if (m->offload) { - pre_build_weight_kernel(m, weight, input.data_type, stream); - } - float const *bias_ptr = - use_bias ? bias.get_float_ptr() : static_cast(nullptr); Kernels::TreeIncMultiHeadAttention::inference_kernel( - m, - bc, - shard_id, - input.get_float_ptr(), - m->offload ? 
static_cast(m->weight_ptr) - : weight.get_float_ptr(), - output.get_float_ptr(), - bias_ptr, - stream); + m, bc, shard_id, input.get_float_ptr(), output.get_float_ptr(), stream); } else { assert(false && "Unspported data type"); } @@ -1021,7 +670,6 @@ void TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( FFHandler handler, TreeIncMultiHeadSelfAttention const *attn, - GenericTensorAccessorR const &weight, MemoryAllocator &gpu_mem_allocator, int num_samples, int _num_q_heads, @@ -1036,14 +684,11 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( attn->kProjSize, attn->vProjSize, attn->oProjSize, - attn->apply_rotary_embedding, - attn->qkv_bias, + attn->rotary_embedding_meta, attn->scaling_query, attn->qk_prod_scaling, attn->position_bias, - attn->final_bias, attn->scaling_factor, - weight, gpu_mem_allocator, num_samples, attn->num_q_heads, diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc index dc43d80133..a4443c4066 100644 --- a/src/parallel_ops/allreduce.cc +++ b/src/parallel_ops/allreduce.cc @@ -73,7 +73,7 @@ AllReduce::AllReduce(FFModel &model, for (int i = 0; i < numdim; i++) { dims[i] = _input->dims[i]; } - assert(dims[allreduce_dim].degree > 1); + // assert(dims[allreduce_dim].degree > 1); // ParallelTensorBase::update_parallel_ids(numdim, dims); outputs[0] = model.create_parallel_tensor_legion_ordering( numdim, dims, _input->data_type, this); diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index c373e0da9b..e73893475c 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -80,51 +80,56 @@ std::string removeGuidOperatorName(std::string const &input) { } template -void load_attention_weights_multi_query(DT *ptr, - std::string layer_name, - std::string weights_folder, - size_t hidden_dim, - int num_heads) { - - std::string qkv_file = layer_name.substr(0, layer_name.find("attention")) + - "attention_query_key_value_weight"; - std::string o_file = layer_name.substr(0, layer_name.find("attention")) + - "attention_dense_weight"; +void load_attention_o_proj_bias_to_dense_v2(DT *ptr, + int num_heads, + int num_kv_heads, + size_t hidden_dim, + size_t qkv_inner_dim, + std::string layer_name, + std::string weights_folder) { + std::string filename = layer_name + ".o_proj.bias"; - // q has n_heads heads, k and v only have one head, o have n_head heads - std::vector weight_filenames = {qkv_file, o_file}; int file_index = 0; - int data_index = 0; - for (auto filename : weight_filenames) { - std::cout << "Loading weight file " << filename << std::endl; - std::string weight_filepath = join_path({weights_folder, filename}); - size_t partial_size = - file_index == 0 ? (hidden_dim + 2 * hidden_dim / num_heads) * hidden_dim - : hidden_dim * hidden_dim; - std::ifstream in(weight_filepath, std::ios::in | std::ios::binary); - // std::cout << "Loading filename: " << weight_filepath << std::endl; - if (!in.good()) { - std::cout << "Could not open file: " << weight_filepath << std::endl; - } - assert(in.good() && "incorrect weight file path"); - std::vector
host_array(partial_size); - size_t loaded_data_size = sizeof(DT) * partial_size; - in.seekg(0, in.end); - in.seekg(0, in.beg); - in.read((char *)host_array.data(), loaded_data_size); - size_t in_get_size = in.gcount(); + // now only opt use this. + // assert(num_heads == num_kv_heads); + int idx = 0; - if (in_get_size != loaded_data_size) { - std::cout << "load data error " << in_get_size << ", " - << loaded_data_size; - assert(false && "data size mismatch"); - } - for (int i = 0; i < partial_size; i++) { - ptr[data_index++] = host_array.at(i); - } - file_index++; + std::cout << "Loading weight file " << filename << std::endl; + std::string weight_filepath = join_path({weights_folder, filename}); + + int n_heads = num_heads; + + int replicate_num = num_heads / num_kv_heads; + + size_t out_partial_size = hidden_dim; + size_t partial_size = out_partial_size; + std::ifstream in(weight_filepath, std::ios::in | std::ios::binary); + assert(in.good() && "incorrect bias file path"); + std::vector
host_array(partial_size); + size_t loaded_data_size = sizeof(DT) * partial_size; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + size_t in_get_size = in.gcount(); + + if (in_get_size != loaded_data_size) { + printf( + "load bias data error: in_get_size (%lu) != loaded_data_size (%lu)\n", + in_get_size, + loaded_data_size); + assert(false); } + assert(partial_size == host_array.size()); + + size_t data_index = 0; + + for (int i = 0; i < partial_size; i++) { + ptr[i] = host_array.at(data_index); + data_index++; + } + + in.close(); } template @@ -135,44 +140,53 @@ void load_attention_bias_v2(DT *ptr, size_t qkv_inner_dim, bool final_bias, std::string layer_name, - std::string weights_folder) { + std::string weights_folder, + int tp_degree) { std::string q_file = layer_name + ".q_proj.bias"; std::string k_file = layer_name + ".k_proj.bias"; std::string v_file = layer_name + ".v_proj.bias"; std::vector bias_files = {q_file, k_file, v_file}; - if (final_bias) { - std::string o_file = layer_name + ".o_proj.bias"; - bias_files.push_back(o_file); - } - int file_index = 0; - - // now only opt use this. - // assert(num_heads == num_kv_heads); - int idx = 0; + // linear layer weights: [output_size, input_size] + // bias layer weights: [output_size] + // Q,K,V projection weights: [head_dim*num_heads, hidden_size] = [768, 768] + // QKV bias weights: [head_dim*num_heads] = [768], organized as: [head_dim_0, + // head_dim_1, ...] + + // need to rearrange: [[q_heads_shard_0], [k_heads_shard_0], + // [v_heads_shard_0], ..., [q_heads_shard_n], [k_heads_shard_n], + // [v_heads_shard_n]] where n = tp_degree + assert(num_heads % tp_degree == 0); + assert(num_kv_heads % tp_degree == 0); + assert(hidden_dim % num_heads == 0); + assert(qkv_inner_dim == hidden_dim / num_heads); + size_t q_heads_per_shard = num_heads / tp_degree; + size_t kv_heads_per_shard = num_kv_heads / tp_degree; + size_t shard_chunk_size = + (q_heads_per_shard + 2 * kv_heads_per_shard) * qkv_inner_dim; + int file_index = 0; for (auto filename : bias_files) { std::cout << "Loading weight file " << filename << std::endl; std::string weight_filepath = join_path({weights_folder, filename}); int n_heads = file_index == 0 ? num_heads : num_kv_heads; - - int replicate_num = num_heads / num_kv_heads; - - size_t qkv_partial_size = qkv_inner_dim * n_heads; - size_t qkv_replicate_size = qkv_inner_dim * num_heads; - size_t out_partial_size = hidden_dim; - size_t partial_size = - (file_index < 3) ? qkv_partial_size : out_partial_size; + assert(n_heads % tp_degree == 0); + int heads_per_shard = n_heads / tp_degree; + int qkv_prev_heads_cur_shard = + (file_index == 2) ? num_heads + num_kv_heads : file_index * num_heads; + assert(qkv_prev_heads_cur_shard % tp_degree == 0); + qkv_prev_heads_cur_shard /= tp_degree; + + // load into memory first + size_t bias_size = qkv_inner_dim * n_heads; std::ifstream in(weight_filepath, std::ios::in | std::ios::binary); assert(in.good() && "incorrect bias file path"); - std::vector
host_array(partial_size); - size_t loaded_data_size = sizeof(DT) * partial_size; - in.seekg(0, in.end); + std::vector
host_array(bias_size); + size_t loaded_data_size = sizeof(DT) * bias_size; in.seekg(0, in.beg); in.read((char *)host_array.data(), loaded_data_size); size_t in_get_size = in.gcount(); - if (in_get_size != loaded_data_size) { printf( "load bias data error: in_get_size (%lu) != loaded_data_size (%lu)\n", @@ -180,43 +194,37 @@ void load_attention_bias_v2(DT *ptr, loaded_data_size); assert(false); } - assert(partial_size == host_array.size()); - - size_t data_index = 0; - - // q, o - if (file_index == 0 || file_index == 3) { - for (int i = 0; i < partial_size; i++) { - ptr[idx + i] = host_array.at(data_index); - data_index++; - } - } else { - // k, v - for (int i = 0; i < partial_size; i++) { - for (int j = 0; j < replicate_num; j++) { - ptr[idx + j * partial_size + i] = host_array.at(data_index); - } - data_index++; + assert(bias_size == host_array.size()); + + // now copy chunks into ptr + for (int i = 0; i < n_heads; i++) { + int shard_idx = i / heads_per_shard; + for (int j = 0; j < qkv_inner_dim; j++) { + int src_idx = i * qkv_inner_dim + j; + int dst_idx = shard_idx * shard_chunk_size + + qkv_prev_heads_cur_shard * qkv_inner_dim + + (i % heads_per_shard) * qkv_inner_dim + j; + ptr[dst_idx] = host_array.at(src_idx); } } - file_index++; - idx += qkv_replicate_size; - in.close(); } } template -void load_attention_weights_v2(DT *ptr, - int num_heads, - int num_kv_heads, - size_t hidden_dim, - size_t qkv_inner_dim, - std::string layer_name, - std::string weights_folder, - size_t volume, - int tensor_parallelism_degree) { +void load_attention_weights_to_dense_v2(DT *ptr, + int num_heads, + int num_kv_heads, + size_t hidden_dim, + size_t qkv_inner_dim, + std::string layer_name, + std::string weights_folder, + size_t volume, + int tensor_parallelism_degree, + bool load_o_proj) { + // layers_0_attention_wq_weight + // layers_0_self_attn_q_proj_weight std::string q_file = layer_name + ".q_proj.weight"; std::string k_file = layer_name + ".k_proj.weight"; std::string v_file = layer_name + ".v_proj.weight"; @@ -241,64 +249,64 @@ void load_attention_weights_v2(DT *ptr, int replicate_num = num_heads / num_kv_heads; // stride for q, k, v, o - size_t stride_size = (q_size + v_replicate_size + k_replicate_size + o_size) / + size_t stride_size = (q_size + v_replicate_size + k_replicate_size) / tensor_parallelism_degree; - for (auto filename : weight_filenames) { - std::cout << "Loading weight file " << filename << std::endl; - std::string weight_filepath = join_path({weights_folder, filename}); - - int data_index = 0; - size_t partial_size = (file_index == 0 || file_index == 3) - ? one_weight_file_size - : single_proj_size * num_kv_heads; - size_t one_partition_size = - one_weight_file_size / tensor_parallelism_degree; - - std::ifstream in(weight_filepath, std::ios::in | std::ios::binary); - if (!in.good()) { - std::cout << "Could not open file: " << weight_filepath << std::endl; - } - assert(in.good() && "incorrect weight file path"); - std::vector
host_array(partial_size); - size_t loaded_data_size = sizeof(DT) * partial_size; - in.seekg(0, in.end); - in.seekg(0, in.beg); - in.read((char *)host_array.data(), loaded_data_size); - size_t in_get_size = in.gcount(); + if (!load_o_proj) { + for (auto filename : weight_filenames) { + std::cout << "Loading weight file " << filename << std::endl; + std::string weight_filepath = join_path({weights_folder, filename}); + + int data_index = 0; + size_t partial_size = (file_index == 0 || file_index == 3) + ? one_weight_file_size + : single_proj_size * num_kv_heads; + size_t one_partition_size = + one_weight_file_size / tensor_parallelism_degree; + + std::ifstream in(weight_filepath, std::ios::in | std::ios::binary); + if (!in.good()) { + std::cout << "Could not open file: " << weight_filepath << std::endl; + } + assert(in.good() && "incorrect weight file path"); + std::vector
host_array(partial_size); + size_t loaded_data_size = sizeof(DT) * partial_size; + in.seekg(0, in.end); + in.seekg(0, in.beg); + in.read((char *)host_array.data(), loaded_data_size); + size_t in_get_size = in.gcount(); - if (in_get_size != loaded_data_size) { - std::cout << "load attention data error " << in_get_size << ", " - << loaded_data_size << ", " << file_index << ", " - << weight_filepath << "\n"; - assert(false && "data size mismatch"); - } - // wq, wk, wo - if (file_index == 0) { - for (int i = 0; i < tensor_parallelism_degree; i++) { - for (int j = 0; j < one_partition_size; j++) { - ptr[base_index + i * stride_size + j] = host_array.at(data_index++); - } + if (in_get_size != loaded_data_size) { + std::cout << "load attention data error " << in_get_size << ", " + << loaded_data_size << ", " << file_index << ", " + << weight_filepath << "\n"; + assert(false && "data size mismatch"); } - } else { - for (int i = 0; i < num_heads; i++) { - int kv_idx = i / (num_heads / num_kv_heads); - int head_idx = i % (num_heads / tensor_parallelism_degree); - int tp_idx = (i / (num_heads / tensor_parallelism_degree)); - for (int j = 0; j < single_proj_size; j++) { - ptr[base_index + tp_idx * stride_size + single_proj_size * head_idx + - j] = host_array.at(kv_idx * single_proj_size + j); + // wq, wk, wo + if (file_index == 0) { + for (int i = 0; i < tensor_parallelism_degree; i++) { + for (int j = 0; j < one_partition_size; j++) { + ptr[base_index + i * stride_size + j] = host_array.at(data_index++); + } + } + } else { + for (int i = 0; i < num_heads; i++) { + int kv_idx = i / (num_heads / num_kv_heads); + int head_idx = i % (num_heads / tensor_parallelism_degree); + int tp_idx = (i / (num_heads / tensor_parallelism_degree)); + for (int j = 0; j < single_proj_size; j++) { + ptr[base_index + tp_idx * stride_size + + single_proj_size * head_idx + j] = + host_array.at(kv_idx * single_proj_size + j); + } } } + // std::cout << "host array going out of scope, releasing" << endl; + base_index += one_partition_size; + file_index++; } - - // assert(data_index == partial_size); - base_index += one_partition_size; - file_index++; - } - assert(base_index == (q_size + k_replicate_size + v_replicate_size) / - tensor_parallelism_degree); - - { + assert(base_index == (q_size + k_replicate_size + v_replicate_size) / + tensor_parallelism_degree); + } else { std::cout << "Loading weight file " << o_file << std::endl; std::string weight_filepath = join_path({weights_folder, o_file}); @@ -314,6 +322,15 @@ void load_attention_weights_v2(DT *ptr, in.read((char *)host_array.data(), loaded_data_size); size_t in_get_size = in.gcount(); + DT temp; + + for (int i = 0; i < one_weight_file_size; i++) { + temp = host_array.at(i); + } + + // std::cout<<"o_proj loaded into host array, total size: + // "<name)); + bool is_attn_proj = false, is_o_proj = false; + + // dense layers for attention projection is named as + // self_attn.qkv_proj or self_attn.o_proj + // so looking for self_attn. 
in the name can determine if it is an attention + // projection + if (weight_filename.find("attn.") != std::string::npos || + weight_filename.find("self_attention.") != std::string::npos) { + size_t pos = weight_filename.find(".o_proj"); + if (pos != std::string::npos) { + weight_filename.replace(pos, std::string(".o_proj").length(), ""); + is_o_proj = true; + } else { + pos = weight_filename.find(".qkv_proj"); + if (pos == std::string::npos) { + cout << weight_filename << endl; + } + assert(pos != std::string::npos); + weight_filename.replace(pos, std::string(".qkv_proj").length(), ""); + } + is_attn_proj = true; + } if (ff->config.benchmarking) { std::cout << "Initializing weight " << weight_filename @@ -730,28 +773,51 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, if (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION) { - if (weight_idx == 0) { - load_attention_weights_v2(data, - num_heads, - num_kv_heads, - hidden_dim, - qkv_inner_dim, - weight_filename, - weights_folder, - volume, - tensor_parallelism_degree); + } else if (is_attn_proj) { + if (is_o_proj) { + if (weight_idx == 0) { + load_attention_weights_to_dense_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + weight_filename, + weights_folder, + volume, + tensor_parallelism_degree, + true); + } else { + load_attention_o_proj_bias_to_dense_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + weight_filename, + weights_folder); + } } else { - long long value; - l->get_int_property("final_bias", value); - bool final_bias = (bool)value; - load_attention_bias_v2(data, - num_heads, - num_kv_heads, - hidden_dim, - qkv_inner_dim, - final_bias, - weight_filename, - weights_folder); + if (weight_idx == 0) { + load_attention_weights_to_dense_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + weight_filename, + weights_folder, + volume, + tensor_parallelism_degree, + false); + } else { + load_attention_bias_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + false, // do not load o_proj bias + weight_filename, + weights_folder, + tensor_parallelism_degree); + } } } else if (l->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM) { assert(weight_idx >= 0 || weight_idx <= 2); diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 1a38782e81..2bc64c1670 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -2331,10 +2331,17 @@ GraphOptimalViewSerialized sez.serialize(attn->qProjSize); sez.serialize(attn->vProjSize); sez.serialize(attn->dropout); - sez.serialize(attn->qkv_bias); - sez.serialize(attn->final_bias); sez.serialize(attn->add_zero_attn); - sez.serialize(attn->apply_rotary_embedding); + sez.serialize(attn->rotary_embedding_meta.apply_rotary_embedding); + sez.serialize(attn->rotary_embedding_meta.rope_theta); + sez.serialize(attn->rotary_embedding_meta.rope_type.size()); + sez.serialize(attn->rotary_embedding_meta.rope_type.c_str(), + attn->rotary_embedding_meta.rope_type.size()); + sez.serialize(attn->rotary_embedding_meta.factor); + sez.serialize(attn->rotary_embedding_meta.low_freq_factor); + sez.serialize(attn->rotary_embedding_meta.high_freq_factor); + sez.serialize( + attn->rotary_embedding_meta.original_max_position_embeddings); sez.serialize(attn->scaling_query); sez.serialize(attn->scaling_factor); sez.serialize(attn->qk_prod_scaling); @@ -2358,10 +2365,17 @@ GraphOptimalViewSerialized sez.serialize(attn->qProjSize); 
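// The serialization hunks above store rotary_embedding_meta.rope_type as its
// length followed by the raw characters, and the matching deserialization
// below reads it back into a fixed 1024-byte buffer. A minimal self-contained
// sketch of that length-prefixed string pattern, using a plain byte buffer
// instead of the serializer used in the patch (all names here are
// illustrative, not FlexFlow/Legion API):
#include <cstring>
#include <string>
#include <vector>

inline void put_string(std::vector<char> &buf, std::string const &s) {
  std::size_t len = s.size();
  char const *len_bytes = reinterpret_cast<char const *>(&len);
  buf.insert(buf.end(), len_bytes, len_bytes + sizeof(len)); // length first
  buf.insert(buf.end(), s.data(), s.data() + len);           // then raw chars
}

inline std::string get_string(char const *&cursor) {
  std::size_t len = 0;
  std::memcpy(&len, cursor, sizeof(len));
  cursor += sizeof(len);
  char tmp[1024] = {0};          // fixed buffer, mirroring the hunks below
  std::memcpy(tmp, cursor, len); // assumes len < 1024, as the patch does
  cursor += len;
  return std::string(tmp, len);
}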
sez.serialize(attn->vProjSize); sez.serialize(attn->dropout); - sez.serialize(attn->qkv_bias); - sez.serialize(attn->final_bias); sez.serialize(attn->add_zero_attn); - sez.serialize(attn->apply_rotary_embedding); + sez.serialize(attn->rotary_embedding_meta.apply_rotary_embedding); + sez.serialize(attn->rotary_embedding_meta.rope_theta); + sez.serialize(attn->rotary_embedding_meta.rope_type.size()); + sez.serialize(attn->rotary_embedding_meta.rope_type.c_str(), + attn->rotary_embedding_meta.rope_type.size()); + sez.serialize(attn->rotary_embedding_meta.factor); + sez.serialize(attn->rotary_embedding_meta.low_freq_factor); + sez.serialize(attn->rotary_embedding_meta.high_freq_factor); + sez.serialize( + attn->rotary_embedding_meta.original_max_position_embeddings); sez.serialize(attn->scaling_query); sez.serialize(attn->scaling_factor); sez.serialize(attn->qk_prod_scaling); @@ -2382,10 +2396,17 @@ GraphOptimalViewSerialized sez.serialize(attn->qProjSize); sez.serialize(attn->vProjSize); sez.serialize(attn->dropout); - sez.serialize(attn->qkv_bias); - sez.serialize(attn->final_bias); sez.serialize(attn->add_zero_attn); - sez.serialize(attn->apply_rotary_embedding); + sez.serialize(attn->rotary_embedding_meta.apply_rotary_embedding); + sez.serialize(attn->rotary_embedding_meta.rope_theta); + sez.serialize(attn->rotary_embedding_meta.rope_type.size()); + sez.serialize(attn->rotary_embedding_meta.rope_type.c_str(), + attn->rotary_embedding_meta.rope_type.size()); + sez.serialize(attn->rotary_embedding_meta.factor); + sez.serialize(attn->rotary_embedding_meta.low_freq_factor); + sez.serialize(attn->rotary_embedding_meta.high_freq_factor); + sez.serialize( + attn->rotary_embedding_meta.original_max_position_embeddings); sez.serialize(attn->scaling_query); sez.serialize(attn->scaling_factor); sez.serialize(attn->qk_prod_scaling); @@ -2817,8 +2838,9 @@ void FFModel::deserialize_graph_optimal_view( int embed_dim, num_q_heads, k_dim, v_dim, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; - bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, - scaling_query, qk_prod_scaling, offload, position_bias; + bool add_zero_attn, scaling_query, qk_prod_scaling, offload, + position_bias; + RotaryEmbeddingMeta rotary_embedding_meta; DataType quantization_type; size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); @@ -2830,10 +2852,18 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(k_dim); dez.deserialize(v_dim); dez.deserialize(dropout); - dez.deserialize(qkv_bias); - dez.deserialize(final_bias); dez.deserialize(add_zero_attn); - dez.deserialize(apply_rotary_embedding); + dez.deserialize(rotary_embedding_meta.apply_rotary_embedding); + dez.deserialize(rotary_embedding_meta.rope_theta); + size_t rope_type_len; + char rope_type[1024] = {0}; + dez.deserialize(rope_type_len); + dez.deserialize(rope_type, rope_type_len); + rotary_embedding_meta.rope_type = std::string(rope_type); + dez.deserialize(rotary_embedding_meta.factor); + dez.deserialize(rotary_embedding_meta.low_freq_factor); + dez.deserialize(rotary_embedding_meta.high_freq_factor); + dez.deserialize(rotary_embedding_meta.original_max_position_embeddings); dez.deserialize(scaling_query); dez.deserialize(scaling_factor); dez.deserialize(qk_prod_scaling); @@ -2853,11 +2883,9 @@ void FFModel::deserialize_graph_optimal_view( params.kdim = k_dim; params.vdim = v_dim; params.dropout = dropout; - params.qkv_bias = qkv_bias; - params.final_bias = final_bias; params.add_zero_attn = 
add_zero_attn; params.layer_guid = layer_guid; - params.apply_rotary_embedding = apply_rotary_embedding; + params.rotary_embedding_meta = rotary_embedding_meta; params.scaling_query = scaling_query; params.scaling_factor = scaling_factor; params.qk_prod_scaling = qk_prod_scaling; @@ -2874,8 +2902,8 @@ void FFModel::deserialize_graph_optimal_view( assert(num_inputs == 1); int embed_dim, num_q_heads, k_dim, v_dim, num_kv_heads; float dropout, scaling_factor; - bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, - scaling_query, qk_prod_scaling, position_bias; + bool add_zero_attn, scaling_query, qk_prod_scaling, position_bias; + RotaryEmbeddingMeta rotary_embedding_meta; size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); dez.deserialize(transformer_layer_id); @@ -2886,10 +2914,18 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(k_dim); dez.deserialize(v_dim); dez.deserialize(dropout); - dez.deserialize(qkv_bias); - dez.deserialize(final_bias); dez.deserialize(add_zero_attn); - dez.deserialize(apply_rotary_embedding); + dez.deserialize(rotary_embedding_meta.apply_rotary_embedding); + dez.deserialize(rotary_embedding_meta.rope_theta); + size_t rope_type_len; + char rope_type[1024] = {0}; + dez.deserialize(rope_type_len); + dez.deserialize(rope_type, rope_type_len); + rotary_embedding_meta.rope_type = std::string(rope_type); + dez.deserialize(rotary_embedding_meta.factor); + dez.deserialize(rotary_embedding_meta.low_freq_factor); + dez.deserialize(rotary_embedding_meta.high_freq_factor); + dez.deserialize(rotary_embedding_meta.original_max_position_embeddings); dez.deserialize(scaling_query); dez.deserialize(scaling_factor); dez.deserialize(qk_prod_scaling); @@ -2906,11 +2942,9 @@ void FFModel::deserialize_graph_optimal_view( params.kdim = k_dim; params.vdim = v_dim; params.dropout = dropout; - params.qkv_bias = qkv_bias; - params.final_bias = final_bias; params.add_zero_attn = add_zero_attn; params.layer_guid = layer_guid; - params.apply_rotary_embedding = apply_rotary_embedding; + params.rotary_embedding_meta = rotary_embedding_meta; params.scaling_query = scaling_query; params.scaling_factor = scaling_factor; params.qk_prod_scaling = qk_prod_scaling; @@ -2926,8 +2960,9 @@ void FFModel::deserialize_graph_optimal_view( int embed_dim, num_q_heads, k_dim, v_dim, num_kv_heads, tensor_parallelism_degree; float dropout, scaling_factor; - bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding, - scaling_query, qk_prod_scaling, offload, position_bias; + bool add_zero_attn, scaling_query, qk_prod_scaling, offload, + position_bias; + RotaryEmbeddingMeta rotary_embedding_meta; DataType quantization_type; size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); @@ -2939,10 +2974,18 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(k_dim); dez.deserialize(v_dim); dez.deserialize(dropout); - dez.deserialize(qkv_bias); - dez.deserialize(final_bias); dez.deserialize(add_zero_attn); - dez.deserialize(apply_rotary_embedding); + dez.deserialize(rotary_embedding_meta.apply_rotary_embedding); + dez.deserialize(rotary_embedding_meta.rope_theta); + size_t rope_type_len; + char rope_type[1024] = {0}; + dez.deserialize(rope_type_len); + dez.deserialize(rope_type, rope_type_len); + rotary_embedding_meta.rope_type = std::string(rope_type); + dez.deserialize(rotary_embedding_meta.factor); + dez.deserialize(rotary_embedding_meta.low_freq_factor); + dez.deserialize(rotary_embedding_meta.high_freq_factor); + 
dez.deserialize(rotary_embedding_meta.original_max_position_embeddings); dez.deserialize(scaling_query); dez.deserialize(scaling_factor); dez.deserialize(qk_prod_scaling); @@ -2962,11 +3005,9 @@ void FFModel::deserialize_graph_optimal_view( params.kdim = k_dim; params.vdim = v_dim; params.dropout = dropout; - params.qkv_bias = qkv_bias; - params.final_bias = final_bias; params.add_zero_attn = add_zero_attn; params.layer_guid = layer_guid; - params.apply_rotary_embedding = apply_rotary_embedding; + params.rotary_embedding_meta = rotary_embedding_meta; params.scaling_query = scaling_query; params.scaling_factor = scaling_factor; params.qk_prod_scaling = qk_prod_scaling; diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 1b65dfd869..f39ea91f28 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -800,6 +800,7 @@ void FFModel::compile_inference() { false /*must*/, 0 /*mapper_id*/, view.hash() /*MappingTagID*/); + index_launcher.concurrent = true; FutureMap fm = runtime->execute_index_space(ctx, index_launcher); fm.wait_all_results(); int idx = 0; diff --git a/src/runtime/layer.cc b/src/runtime/layer.cc index 8f33f6db87..72e71688c1 100644 --- a/src/runtime/layer.cc +++ b/src/runtime/layer.cc @@ -87,6 +87,11 @@ void Layer::add_int_vector_property(std::string const &key, int_vector_properties[key] = value; } +void Layer::add_string_property(std::string const &key, + std::string const &value) { + string_properties[key] = value; +} + void Layer::add_initializer(std::string const &key, Initializer *initializer) { initializers[key] = initializer; } @@ -125,6 +130,18 @@ bool Layer::get_int_vector_property(std::string const &key, } } +bool Layer::get_string_property(std::string const &key, + std::string &value) const { + auto const &it = string_properties.find(key); + if (it == string_properties.end()) { + assert(false); + return false; + } else { + value = it->second; + return true; + } +} + bool Layer::get_initializer(std::string const &key, Initializer *&initializer) const { auto const &it = initializers.find(key); diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 52f1dd2220..69fe3b598d 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1156,16 +1156,25 @@ bool Op::check_output_input_weight_same_parallel_is() const { IndexSpace parallel_is = outputs[0]->parallel_is; for (int i = 0; i < numOutputs; i++) { if (outputs[i]->parallel_is != parallel_is) { + std::cout << "outputs[" << i << "] has different parallel_is " + << outputs[i]->parallel_is << " than output[0] " << parallel_is + << std::endl; return false; } } for (int i = 0; i < numInputs; i++) { if (inputs[i]->parallel_is != parallel_is) { + std::cout << "inputs[" << i << "] has different parallel_is " + << inputs[i]->parallel_is << " than output[0] " << parallel_is + << std::endl; return false; } } for (int i = 0; i < numWeights; i++) { if (weights[i]->parallel_is != parallel_is) { + std::cout << "weights[" << i << "] has different parallel_is " + << weights[i]->parallel_is << " than output[0] " << parallel_is + << std::endl; return false; } } @@ -3414,26 +3423,28 @@ bool FFModel::need_to_add_allreduce(int layer_idx) const { auto const &l = layers[layer_idx]; if (config.computationMode == COMP_MODE_INFERENCE && config.tensor_parallelism_degree > 1 && - (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || - l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || - // mlp layer - is_mlp_block(layer_idx) || - // llama mlp layer - (l->op_type == OP_LINEAR && 
layer_idx >= 2 && - layers[layer_idx - 1]->op_type == OP_GELU && - layers[layer_idx - 2]->op_type == OP_LINEAR) || - // LLAMA without element-wise operator fusion - (l->op_type == OP_LINEAR && layer_idx >= 5 && - layers[layer_idx - 1]->op_type == OP_EW_MUL && - layers[layer_idx - 2]->op_type == OP_EW_MUL && - layers[layer_idx - 3]->op_type == OP_SIGMOID && - layers[layer_idx - 4]->op_type == OP_LINEAR && - layers[layer_idx - 5]->op_type == OP_LINEAR) || - // LLAMA with element-wise operator fusion - (l->op_type == OP_LINEAR && layer_idx >= 3 && - layers[layer_idx - 1]->op_type == OP_SIGMOID_SILU_MULTI && - layers[layer_idx - 2]->op_type == OP_LINEAR && - layers[layer_idx - 3]->op_type == OP_LINEAR))) { + ( + // l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || + // l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || + (std::string(l->name).find("attn.o_proj") != std::string::npos) || + // mlp layer + is_mlp_block(layer_idx) || + // llama mlp layer + (l->op_type == OP_LINEAR && layer_idx >= 2 && + layers[layer_idx - 1]->op_type == OP_GELU && + layers[layer_idx - 2]->op_type == OP_LINEAR) || + // LLAMA without element-wise operator fusion + (l->op_type == OP_LINEAR && layer_idx >= 5 && + layers[layer_idx - 1]->op_type == OP_EW_MUL && + layers[layer_idx - 2]->op_type == OP_EW_MUL && + layers[layer_idx - 3]->op_type == OP_SIGMOID && + layers[layer_idx - 4]->op_type == OP_LINEAR && + layers[layer_idx - 5]->op_type == OP_LINEAR) || + // LLAMA with element-wise operator fusion + (l->op_type == OP_LINEAR && layer_idx >= 3 && + layers[layer_idx - 1]->op_type == OP_SIGMOID_SILU_MULTI && + layers[layer_idx - 2]->op_type == OP_LINEAR && + layers[layer_idx - 3]->op_type == OP_LINEAR))) { return true; } return false; diff --git a/src/runtime/operator.cc b/src/runtime/operator.cc index dcac52397a..d5bfcfc48e 100644 --- a/src/runtime/operator.cc +++ b/src/runtime/operator.cc @@ -2,6 +2,7 @@ #include "flexflow/ffconst_utils.h" #include "flexflow/simulator.h" #include +#include #include namespace FlexFlow { @@ -29,7 +30,15 @@ fs::path get_dst_folder(std::string const &subdir, if (before_kernel) { step_substr += "_pre"; } + char cwd[PATH_MAX]; + getcwd(cwd, sizeof(cwd)); + + // char const *ff_cache_path = std::string(std::getenv("FF_DEBUG_PATH")) == + // "." ? + // cwd : std::getenv("FF_DEBUG_PATH"); + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + std::string debug_dir_ = ff_cache_path ? 
std::string(ff_cache_path) + "/debug/flexflow" : std::string("~/.cache/flexflow/debug/flexflow"); @@ -38,6 +47,9 @@ fs::path get_dst_folder(std::string const &subdir, debug_dir_ = p.we_wordv[0]; wordfree(&p); fs::path debug_dir = debug_dir_; + if (!fs::is_directory(debug_dir)) { + printf("invalid debug directory: %s\n", debug_dir.c_str()); + } assert(fs::is_directory(debug_dir)); fs::path dst_folder = debug_dir / subdir / step_substr / ("shard_" + std::to_string(shard_idx)); diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index 9b6510fe5e..0e28c02cdf 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -3734,15 +3734,14 @@ bool FFModel::convert_graph_to_operators( case OP_INC_MULTIHEAD_SELF_ATTENTION: { assert(inList.size() == 1); IncMultiHeadSelfAttention *attn = (IncMultiHeadSelfAttention *)node.ptr; - new_op = new IncMultiHeadSelfAttention(*this, *attn, inputs[0], true); + new_op = new IncMultiHeadSelfAttention(*this, *attn, inputs[0]); break; } case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { assert(inList.size() == 1); TreeIncMultiHeadSelfAttention *attn = (TreeIncMultiHeadSelfAttention *)node.ptr; - new_op = - new TreeIncMultiHeadSelfAttention(*this, *attn, inputs[0], true); + new_op = new TreeIncMultiHeadSelfAttention(*this, *attn, inputs[0]); break; } case OP_RMS_NORM: { diff --git a/tests/fine_grained_alignment_test.sh b/tests/fine_grained_alignment_test.sh new file mode 100755 index 0000000000..9ad26318f9 --- /dev/null +++ b/tests/fine_grained_alignment_test.sh @@ -0,0 +1,106 @@ +#! /usr/bin/env bash +set -x +set -e + +MODEL_NAME=${MODEL_NAME:-"JackFram/llama-160m"} +MEMORY_PER_GPU=${MEMORY_PER_GPU:-14000} +ZCOPY_MEMORY=${ZCOPY_MEMORY:-40000} +TP_DEGREE=${TP_DEGREE:-2} +PP_DEGREE=${PP_DEGREE:-2} +CACHE_PATH=${FF_CACHE_PATH:-"~/.cache/flexflow"} +NUM_STEPS=${NUM_STEPS:-2} + +cleanup() { + rm -rf "${CACHE_PATH}"/debug ./fine_grained_alignment_config.json ./inference/output/fine_grained_alignment_test_ff.txt ./inference/output/fine_grained_alignment_test_hf.txt +} + +# Cd into directory holding this script +cd "${BASH_SOURCE[0]%/*}/.." + +# Initial cleanup +cleanup + +# Create test prompt file +mkdir -p ./inference/prompt +echo '["Three tips for staying healthy are: "]' > ./inference/prompt/test.json + +# Create output folder +mkdir -p ./inference/output + +# Enable backtrace in case we run into a segfault or assertion failure +export LEGION_BACKTRACE=1 +export FF_DEBG_NO_WEIGHTS=1 +FUSION=true + + +# Check if the Python code executed successfully +if ! 
PROMPT_LENGTH=$(python -c " +from transformers import AutoTokenizer +import os +tokenizer = AutoTokenizer.from_pretrained(\"$MODEL_NAME\") +tokens = tokenizer.tokenize('Three tips for staying healthy are: ') +print(len(tokens)) +"); +then + echo "Error: Failed to execute Python code" + exit 1 +fi + +MAX_LENGTH=$((PROMPT_LENGTH + NUM_STEPS + 1)) + +python ./tests/inference/huggingface_inference.py \ + --model-name "${MODEL_NAME}" \ + --max-length "${MAX_LENGTH}" \ + --prompt-file ../../inference/prompt/test.json \ + --output-file ../../inference/output/fine_grained_alignment_test_hf.txt \ + --use-full-precision \ + --inference-debugging + +NUM_GPUS=$((TP_DEGREE * PP_DEGREE)) +json_config=$(cat <<-END + { + "num_gpus": ${NUM_GPUS}, + "memory_per_gpu": ${MEMORY_PER_GPU}, + "zero_copy_memory_per_node": ${ZCOPY_MEMORY}, + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": ${TP_DEGREE}, + "pipeline_parallelism_degree": ${PP_DEGREE}, + "inference_debugging": true, + "fusion": ${FUSION}, + "refresh_cache": false, + "llm_model": "${MODEL_NAME}", + "cache_path": "${CACHE_PATH}", + "full_precision": true, + "prompt": "./inference/prompt/test.json", + "max_length": $MAX_LENGTH, + "output_file": "./inference/output/fine_grained_alignment_test_ff.txt" + } +END +) +echo "$json_config" > ./fine_grained_alignment_config.json + +python ./inference/python/incr_decoding.py -config-file ./fine_grained_alignment_config.json + +# # C++ test +# echo "C++ test" +# ./build/inference/incr_decoding/incr_decoding \ +# -ll:gpu 2 -ll:cpu 4 -ll:util 4 \ +# -tensor-parallelism-degree 2 \ +# -ll:fsize 8192 -ll:zsize 12000 \ +# -llm-model $MODEL_NAME \ +# -prompt ./inference/prompt/peft.json \ +# --use-full-precision \ +# --inference-debugging + +# Check alignment +python ./tests/inference/inference_alignment_test.py -m "$MODEL_NAME" -tp "$TP_DEGREE" -n "$NUM_STEPS" + +# Print succeess message +echo "" +echo "Inference alignment tests passed (model ${MODEL_NAME})!" 
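# Worked example of the length bookkeeping above, assuming a 10-token prompt
# (the count is illustrative): with NUM_STEPS=2,
#   MAX_LENGTH=$((10 + 2 + 1))   # prompt tokens + new tokens + one extra slot = 13
# The HuggingFace run and the FlexFlow config receive this same MAX_LENGTH, so the
# two runs decode the same number of steps and their per-step tensor dumps can be
# compared one-to-one by inference_alignment_test.py.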
+echo "" + +# Cleanup after the test +cleanup diff --git a/tests/inference/huggingface_inference.py b/tests/inference/huggingface_inference.py index 5e563c9974..fa72bef463 100644 --- a/tests/inference/huggingface_inference.py +++ b/tests/inference/huggingface_inference.py @@ -10,30 +10,9 @@ LlamaTokenizer, GenerationConfig, ) -######################### debugging helper functions ######################### -def pre_forward_hook(module, input): - assert module.name is not None and module.decoding_step is not None - name = module.name.replace("model.", "") - print( - f"Pre-forward hook activated on module: {name}, decoding step: {module.decoding_step}" - ) - print("Pre-Input: ", input[0].shape) - torch.save( - input, f"./hf_tensors/decoding_step_{module.decoding_step}_{name}.input" - ) -def post_forward_hook(module, input, output): - assert module.name is not None and module.decoding_step is not None - name = module.name.replace("model.", "") - print( - f"Post-forward Hook activated for module: {name}, decoding step: {module.decoding_step}" - ) - print("Post-Input/Output: ", input[0].shape, output[0].shape) - torch.save( - output, f"./hf_tensors/decoding_step_{module.decoding_step}_{name}.output" - ) - print("===") - module.decoding_step += 1 -############################################################################## +import sys +sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "peft")) +from hf_utils import * def main(): # Change working dir to folder storing this script @@ -91,26 +70,20 @@ def main(): tokenizer = AutoTokenizer.from_pretrained(args.model_name, trust_remote_code=True) generation_config = GenerationConfig.from_pretrained(args.model_name) generation_config.do_sample = args.do_sample + if not args.do_sample: + generation_config.num_beams=1 + generation_config.temperature = None + generation_config.top_p = None ################# debugging ################# if args.inference_debugging: # Print model and configs print(hf_config) print(model) - # Save weights to file - shutil.rmtree("./hf_tensors") - # Check that the output folder exists - os.makedirs("./hf_tensors", exist_ok=True) + make_debug_dirs() + register_inference_hooks(model) # Save weights - for name, params in model.named_parameters(): - torch.save(params, f"./hf_tensors/{name}") - # params.detach().cpu().numpy().tofile(f"./hf_tensors/{name}") - # Register hooks to save per-op hidden states - for name, layer in dict(model.named_modules()).items(): - layer.name = name - layer.decoding_step = 0 - print(f"Adding hooks to layer {layer.name}") - layer.register_forward_pre_hook(pre_forward_hook) - layer.register_forward_hook(post_forward_hook) + save_model_weights(model, target_modules=["lora", "lm_head", "final_layer_norm", "self_attn_layer_norm", "out_proj", "fc1", "fc2"]) + ############################################### # Generate output with open(args.output_file, "w") as f: diff --git a/tests/inference/inference_alignment_test.py b/tests/inference/inference_alignment_test.py new file mode 100644 index 0000000000..6fff4906f7 --- /dev/null +++ b/tests/inference/inference_alignment_test.py @@ -0,0 +1,817 @@ +import numpy as np +import os, torch, argparse, sys +sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "peft")) +from alignment.align_test_utils import * +from transformers import AutoConfig +from tqdm import tqdm + +class AlignmentTest: + def __init__(self, hf_config, tp_degree=1): + raise NotImplementedError() + def 
check_weights_alignment(self): + raise NotImplementedError() + def check_fwd_pass(self): + raise NotImplementedError() + def check_bwd_pass(self): + raise NotImplementedError() + def check_step(self, step_idx, learning_rate=0.001): + raise NotImplementedError() + +class LllamaAlignmentTest(AlignmentTest): + def __init__(self, hf_config, tp_degree=1): + self.hf_config = hf_config + self.num_layers = self.hf_config.num_hidden_layers + self.hidden_size = self.hf_config.hidden_size + self.intermediate_size = self.hf_config.intermediate_size + self.num_attention_heads = self.hf_config.num_attention_heads + self.num_key_value_heads = self.hf_config.num_key_value_heads + self.projsize = self.hidden_size // self.num_attention_heads + self.tp_degree = tp_degree + + self.num_tokens = None + self.ff_batch_size = None + + + def check_weights_alignment(self): + def convert_hf_filename_to_ff(hf_filename): + if hf_filename == "lm_head.weight": + f_version = f"layers.{self.num_layers-1}.lm_head.weight_0" + elif hf_filename == "norm.weight": + f_version = f"layers.{self.num_layers-1}.norm.weight_0" + else: + f_version = "" + if hf_filename.startswith("layers."): + layernum = hf_filename.split("layers.")[1].split(".")[0] + f_version += f"layers.{layernum}." + f_version += hf_filename.replace(".base_layer", "").replace(".default", "") + # compute weight index, then rename lora if needed if needed + weight_index="0" + if "lora_A" in f_version: + weight_index="A" + elif "lora_B" in f_version: + weight_index="B" + f_version = f_version.replace("lora_A", "lora").replace("lora_B", "lora") + if f_version.endswith(".weight"): + if weight_index == "0": + f_version += f"_{weight_index}" + else: + f_version += f"_{weight_index}.original" + elif f_version.endswith(".gradient"): + prefix = f_version.split(".gradient")[0] + f_version = prefix + f".weight_{weight_index}.gradient" + return f_version + def get_tp_partition_dim(ff_weight_name) -> int: + # MLP layers split the intermediate size dimension + # gate_proj, up_proj: [hidden_size, intermediate_size] + # down_proj: [intermediate_size, hidden_size] + if self.tp_degree == 1: + return -1 + if "lora.weight_B" in ff_weight_name: + return -1 + if "lm_head" in ff_weight_name or "norm" in ff_weight_name: + return 1 + if "gate_proj" in ff_weight_name or "up_proj" in ff_weight_name: + return 1 + elif "down_proj" in ff_weight_name: + return 0 + else: + return -1 + print("-- Weights alignment --") + hf_weights_folder = os.path.join(hf_path, "weights", "step_0") + ff_weights_folder = os.path.join(ff_path, "weights", "step_0", "shard_0") + files_list = os.listdir(hf_weights_folder) + for hf_weight_name in tqdm(sorted(files_list)): + if hf_weight_name.endswith(".weight"): + ff_weight_name = convert_hf_filename_to_ff(hf_weight_name) + # print(hf_weight_name, ff_weight_name) + hf_w_path = os.path.join(hf_weights_folder, hf_weight_name) + ff_w_path = os.path.join(ff_weights_folder, ff_weight_name) + if not os.path.isfile(hf_w_path): + print(f"File '{hf_w_path}' not found") + if not os.path.isfile(ff_w_path): + print(f"File '{ff_w_path}' not found") + assert(os.path.isfile(hf_w_path)) + assert(os.path.isfile(ff_w_path)) + + # 1. get shape of hf weight + hf_weight = torch.load(hf_w_path, map_location='cpu') + hf_weight_shape = hf_weight.shape + ff_partition_dim = get_tp_partition_dim(ff_weight_name) + ff_weight_shape = list(hf_weight_shape)[::-1] + if ff_partition_dim >= 0: + ff_weight_shape[ff_partition_dim] //= self.tp_degree + + # 2. 
handle flexflow shards in case of tensor parallelism + ff_weights = [load_ff_tensor(ff_w_path.replace("shard_0", f"shard_{tp_idx}"), ff_weight_shape) for tp_idx in range(self.tp_degree)] + if self.tp_degree > 1: + if ff_partition_dim >= 0: + ff_weight = np.concatenate(ff_weights, axis=ff_partition_dim) + else: + assert(are_np_arrays_identical(ff_weights)) + ff_weight = ff_weights[0] + else: + ff_weight = ff_weights[0] + ff_weight = torch.from_numpy(ff_weight).to(hf_weight.dtype) + + # check equivalence + try: + torch.testing.assert_close(ff_weight, hf_weight.T) + except Exception as e: + print(f"Error comparing {ff_w_path} weight to {hf_w_path}:\n{e}\n") + raise e + + def check_fwd_pass(self, step_idx=0): + hf_fwd_folder = os.path.join(hf_path, "fwd", f"step_{step_idx}") + ff_fwd_folder = os.path.join(ff_path, "fwd", f"step_{step_idx}", "shard_0") + + def convert_hf_filename_to_ff(hf_filename): + if hf_filename == "embed_tokens": + f_version = f"layers.0.embed_tokens" + elif hf_filename == "lm_head" or hf_filename == "norm": + f_version = f"layers.{self.num_layers-1}.{hf_filename}" + else: + assert hf_filename.startswith("layers.") + layernum = hf_filename.split("layers.")[1].split(".")[0] + f_version = f"layers.{layernum}." + f_version += hf_filename.replace(".base_layer", "").replace(".default", "") + # right now, attention in flexflow is done with a single operator, so there is a single output file without the projection suffix + f_version = f_version.replace(".q_proj", ".qkv_proj").replace(".k_proj", ".qkv_proj").replace(".v_proj", ".qkv_proj")#.replace(".o_proj", "") + return f_version + + def get_hf_tensor(hf_tensor_name, tensor_comparison_idx): + hf_tensor_filename = f"{hf_tensor_name}.{tensor_comparison_idx.hf_tensor_type}_{tensor_comparison_idx.hf_tensor_idx}" + hf_tensor_path = os.path.join(hf_fwd_folder, hf_tensor_filename) + + if not os.path.isfile(hf_tensor_path): + raise FileNotFoundError(f"File '{hf_tensor_path}' not found") + print("loading hf tensor: ", hf_tensor_filename) + hf_tensor = torch.load(hf_tensor_path, map_location='cpu') + if hf_tensor_name == "embed_tokens": + self.num_tokens = hf_tensor.shape[1] + return hf_tensor + + def get_ff_tensor(ff_tensor_name, tensor_comparison_idx, hf_shape, tp_type=TPType.REPLICATE): + ff_tensor_suffix = f".{tensor_comparison_idx.ff_tensor_type}" if len(tensor_comparison_idx.ff_tensor_type) > 0 else "" + ff_tensor_idx_suffix = f"_{tensor_comparison_idx.ff_tensor_idx}" if tensor_comparison_idx.ff_tensor_idx is not None else "" + ff_tensor_filename = f"{ff_tensor_name}{ff_tensor_suffix}{ff_tensor_idx_suffix}" + ff_tensor_path = os.path.join(ff_fwd_folder, ff_tensor_filename) + if not os.path.isfile(ff_tensor_path): + raise FileNotFoundError(f"File '{ff_tensor_path}' not found") + + print("loading ff tensor: ", ff_tensor_filename) + ff_shape = list(hf_shape)[::-1] + if tp_type == TPType.PARTITION: + ff_shape[0] //= self.tp_degree + + if "layers.0.embed_tokens.input_0" in ff_tensor_path: + # get number of tokens + ff_tensor = np.loadtxt(ff_tensor_path, delimiter=',') + self.ff_batch_size = ff_tensor.shape[0] + + ff_shape = replace_value(ff_shape, self.num_tokens, self.ff_batch_size) + ff_tensors = [load_ff_tensor(ff_tensor_path.replace("shard_0", f"shard_{tp_idx}"), ff_shape) for tp_idx in range(self.tp_degree)] + if self.tp_degree > 1: + # if replicate, check that they are identical + if tp_type == TPType.REPLICATE: + assert(are_np_arrays_identical(ff_tensors)) + ff_tensor = ff_tensors[0] + # if partition, concatenate along the 
partition dimension + elif tp_type == TPType.PARTITION: + ff_tensor = np.concatenate(ff_tensors, axis=0) + # if to_reduce, sum along the partition dimension + elif tp_type == TPType.TO_REDUCE: + ff_tensor = np.sum(ff_tensors, axis=0) + else: + ff_tensor = ff_tensors[0] + ff_tensor = torch.from_numpy(ff_tensor) + ff_tensor = truncate_dimension(ff_tensor, self.ff_batch_size, self.num_tokens) + return ff_tensor + + def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance=1e-2): + ff_tensor = ff_tensor.to(hf_tensor.dtype) + hf_tensor = hf_tensor.T + if additional_ff_tensor is not None: + additional_ff_tensor = additional_ff_tensor.to(hf_tensor.dtype) + ff_tensor = ff_tensor - additional_ff_tensor + try: + # torch.testing.assert_close(hf_tensor, ff_tensor, rtol=1.3e-6, atol=tolerance) + if not np.allclose(hf_tensor.detach().numpy(), ff_tensor.detach().numpy(), atol=tolerance): + mismatches = np.where(~np.isclose(hf_tensor.detach().numpy(), ff_tensor.detach().numpy(), atol=tolerance))[0] + print(f"Pct mismatch {label}: {100.0*(np.prod(mismatches.shape) / ff_tensor.numel()):.3f}%") + assert(np.prod(mismatches.shape) <= .05 * ff_tensor.numel()) + except Exception as e: + print(f"Error in comparison {label}:\n{e}\n") + print("HF tensor:") + print(hf_tensor.squeeze()) + print(hf_tensor.shape) + print("FF tensor:") + print(ff_tensor.squeeze()) + print(ff_tensor.shape) + raise e + + print(f"-- FWD pass {step_idx}--") + + # Embedding layer + hf_tensor_name = "embed_tokens" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label="Embedding input") + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label="Embedding output") + + # Transformers blocks + for i in range(self.num_layers): + # Input laye norm + hf_tensor_name = f"layers.{i}.input_layernorm" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + if i == 0: + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + else: + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=1) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label=f"Input layernorm {i} input") + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label=f"Input layernorm {i} output") + + # Attention QKV projections + hf_q_proj_tensor_name = f"layers.{i}.self_attn.q_proj" + hf_k_proj_tensor_name = f"layers.{i}.self_attn.k_proj" + hf_v_proj_tensor_name = 
f"layers.{i}.self_attn.v_proj" + ff_qkv_tensor_name = convert_hf_filename_to_ff(hf_q_proj_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_q_proj_in = get_hf_tensor(hf_q_proj_tensor_name, input_comparison) + hf_k_proj_in = get_hf_tensor(hf_k_proj_tensor_name, input_comparison) + hf_v_proj_in = get_hf_tensor(hf_v_proj_tensor_name, input_comparison) + hf_q_proj_out = get_hf_tensor(hf_q_proj_tensor_name, output_comparison) + hf_k_proj_out = get_hf_tensor(hf_k_proj_tensor_name, output_comparison) + hf_v_proj_out = get_hf_tensor(hf_v_proj_tensor_name, output_comparison) + ff_qkv_tensor_in = get_ff_tensor(ff_qkv_tensor_name, input_comparison, hf_q_proj_in.shape) + torch.testing.assert_close(hf_q_proj_in, hf_k_proj_in) + torch.testing.assert_close(hf_k_proj_in, hf_v_proj_in) + compare(hf_q_proj_in, ff_qkv_tensor_in, label=f"QKV proj {i} input") + ff_qkv_tensor_out = get_ff_tensor( + ff_qkv_tensor_name, + output_comparison, + torch.Size([hf_q_proj_out.shape[0], hf_q_proj_out.shape[1], 3*hf_q_proj_out.shape[2]]), + tp_type=TPType.PARTITION + ) + head_dim = hf_q_proj_out.shape[2] // self.num_attention_heads + heads_per_shard = self.num_attention_heads // self.tp_degree + chunk_size = head_dim * heads_per_shard + # print(ff_qkv_tensor_out.shape) + ff_qproj_out = ff_qkv_tensor_out[:chunk_size, :, :] + ff_kproj_out = ff_qkv_tensor_out[chunk_size:2*chunk_size, :, :] + ff_vproj_out = ff_qkv_tensor_out[2*chunk_size : 3*chunk_size, :, :] + qkv_chunk_size = 3*chunk_size + for tp_idx in range(1, self.tp_degree): + prev_size = tp_idx * qkv_chunk_size + ff_qproj_out_ = ff_qkv_tensor_out[prev_size : prev_size + chunk_size, :, :] + ff_kproj_out_ = ff_qkv_tensor_out[prev_size + chunk_size : prev_size + 2*chunk_size, :, :] + ff_vproj_out_ = ff_qkv_tensor_out[prev_size + 2*chunk_size : prev_size + 3*chunk_size, :, :] + ff_qproj_out = np.concatenate((ff_qproj_out, ff_qproj_out_), axis=0) + ff_kproj_out = np.concatenate((ff_kproj_out, ff_kproj_out_), axis=0) + ff_vproj_out = np.concatenate((ff_vproj_out, ff_vproj_out_), axis=0) + compare_loaded_tensors(hf_q_proj_out.T, ff_qproj_out) + compare_loaded_tensors(hf_k_proj_out.T, ff_kproj_out) + compare_loaded_tensors(hf_v_proj_out.T, ff_vproj_out) + ff_tensor_name = f"layers.{i}.layers.{i}.self_attn" + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + ff_attn_tensor_in = get_ff_tensor( + ff_tensor_name, + input_comparison, + torch.Size([hf_q_proj_out.shape[0], hf_q_proj_out.shape[1], 3*hf_q_proj_out.shape[2]]), + tp_type=TPType.PARTITION + ) + assert torch.allclose(ff_qkv_tensor_out, ff_attn_tensor_in) + + # Attention + hf_tensor_name = f"layers.{i}.self_attn.o_proj" + ff_tensor_name = convert_hf_filename_to_ff(f"layers.{i}.self_attn") + # the raw attention result, w/o o_proj. 
This is the output of senf_attn of FF and the input of o_proj in HF + output_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + # ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + # TP for self-attn partitions the attention heads across TP workers + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + print("comparing attention tensor: ", hf_tensor_name, " and ", ff_tensor_name) + compare(hf_tensor, ff_tensor, label=f"Attention {i} output") + + # Post-attention layernorm + hf_tensor_name = f"layers.{i}.post_attention_layernorm" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=1) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label=f"Post-attention layernorm {i} output") + + # W1 (gate_proj) + hf_tensor_name = f"layers.{i}.mlp.gate_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"W1 {i} output") + + # W3 (up_proj) + hf_tensor_name = f"layers.{i}.mlp.up_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"W3 {i} output") + + # W2 (down_proj) + hf_tensor_name = f"layers.{i}.mlp.down_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_down_proj_out = get_hf_tensor(hf_tensor_name, output_comparison) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"W2 {i} input") + + hf_down_proj_in = hf_tensor.clone() + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_down_proj_out = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + + # Norm + hf_tensor_name = "norm" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=1) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label="Norm output") + + # LM head + hf_tensor_name = "lm_head" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) 
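        # A condensed sketch (readability aid only, never called by the checks in this
        # pass) of how get_ff_tensor above merges the per-shard FlexFlow dumps before
        # each comparison; TPType is the enum imported from align_test_utils.
        def merge_tp_shards(shards, tp_type):
            if tp_type == TPType.REPLICATE:
                return shards[0]                       # every shard holds a full copy
            if tp_type == TPType.PARTITION:
                return np.concatenate(shards, axis=0)  # disjoint slices along dim 0
            return np.sum(shards, axis=0)              # TO_REDUCE: sum partial results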
+ input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) + compare(hf_tensor, ff_tensor, label="LM head input") + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label="LM head output") + +class OPTAlignmentTest(AlignmentTest): + def __init__(self, hf_config, tp_degree=1): + self.hf_config = hf_config + self.num_layers = self.hf_config.num_hidden_layers + self.hidden_size = self.hf_config.hidden_size + self.intermediate_size = self.hf_config.ffn_dim + self.num_attention_heads = self.hf_config.num_attention_heads + self.num_key_value_heads = self.num_attention_heads + self.projsize = self.hidden_size // self.num_attention_heads + self.tp_degree = tp_degree + + self.num_tokens = None + self.ff_batch_size = None + + def check_weights_alignment(self): + def convert_hf_filename_to_ff(hf_filename): + if hf_filename == "lm_head.weight" or hf_filename == "final_layer_norm.weight": + f_version = f"layers.{self.num_layers-1}.{hf_filename}_0" + elif hf_filename == "lm_head.bias" or hf_filename == "final_layer_norm.bias": + f_version = f"layers.{self.num_layers-1}.{hf_filename.replace('bias', 'weight')}_1" + elif hf_filename.startswith("layers.") and hf_filename.endswith("self_attn.out_proj.bias"): + layernum = hf_filename.split("layers.")[1].split(".")[0] + f_version = f"layers.{layernum}.layers.{layernum}.add_bias_residual_layer_norm.weight_0" + elif hf_filename.startswith("layers.") and hf_filename.endswith(".final_layer_norm.weight"): + layernum = hf_filename.split("layers.")[1].split(".")[0] + f_version = f"layers.{layernum}.layers.{layernum}.add_bias_residual_layer_norm.weight_1" + elif hf_filename.startswith("layers.") and hf_filename.endswith(".final_layer_norm.bias"): + layernum = hf_filename.split("layers.")[1].split(".")[0] + f_version = f"layers.{layernum}.layers.{layernum}.add_bias_residual_layer_norm.weight_2" + else: + f_version = "" + if hf_filename.startswith("layers."): + layernum = hf_filename.split("layers.")[1].split(".")[0] + f_version += f"layers.{layernum}." 
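            # Illustrative trace of this mapping on a hypothetical filename:
            #   "layers.3.self_attn.out_proj.weight"
            #     -> "layers.3." + "layers.3.self_attn.o_proj.weight"   (renaming rules above/below)
            #     -> "layers.3.layers.3.self_attn.o_proj.weight_0"      (weight index appended)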
+ f_version += hf_filename.replace(".base_layer", "").replace(".default", "").replace("out_proj", "o_proj") + # compute weight index, then rename lora if needed if needed + weight_index="0" + if "lora_A" in f_version: + weight_index="A" + elif "lora_B" in f_version: + weight_index="B" + f_version = f_version.replace("lora_A", "lora").replace("lora_B", "lora") + if f_version.endswith(".weight"): + if weight_index == "0": + f_version += f"_{weight_index}" + else: + f_version += f"_{weight_index}.original" + elif f_version.endswith(".gradient"): + prefix = f_version.split(".gradient")[0] + f_version = prefix + f".weight_{weight_index}.gradient" + elif f_version.endswith(".bias"): + f_version = f_version.replace(".bias", ".weight_1") + return f_version + def get_tp_partition_dim(ff_weight_name) -> int: + # MLP layers split the intermediate size dimension + # gate_proj, up_proj: [hidden_size, intermediate_size] + # down_proj: [intermediate_size, hidden_size] + if self.tp_degree == 1: + return -1 + if "lora.weight_B" in ff_weight_name: + return -1 + if "lm_head" in ff_weight_name or "fc1" in ff_weight_name: + return 1 + elif "fc2" in ff_weight_name or "o_proj.weight" in ff_weight_name: + return 0 + else: + return -1 + def get_bias_tp_partition_dim(ff_weight_name) -> int: + if self.tp_degree == 1: + return -1 + elif "lm_head" in ff_weight_name or "fc1" in ff_weight_name: + return 0 + else: + return -1 + print("-- Weights alignment --") + hf_weights_folder = os.path.join(hf_path, "weights", "step_0") + ff_weights_folder = os.path.join(ff_path, "weights", "step_0", "shard_0") + files_list = os.listdir(hf_weights_folder) + for hf_weight_name in tqdm(sorted(files_list)): + if hf_weight_name.endswith(".weight") or hf_weight_name.endswith(".bias"): + ff_weight_name = convert_hf_filename_to_ff(hf_weight_name) + # print(hf_weight_name, ff_weight_name) + hf_w_path = os.path.join(hf_weights_folder, hf_weight_name) + ff_w_path = os.path.join(ff_weights_folder, ff_weight_name) + if not os.path.isfile(hf_w_path): + print(f"File '{hf_w_path}' not found") + if not os.path.isfile(ff_w_path): + print(f"File '{ff_w_path}' not found") + assert(os.path.isfile(hf_w_path)) + assert(os.path.isfile(ff_w_path)) + + # 1. get shape of hf weight + hf_weight = torch.load(hf_w_path, map_location='cpu') + hf_weight_shape = hf_weight.shape + ff_partition_dim = get_tp_partition_dim(ff_weight_name) if hf_weight_name.endswith(".weight") else get_bias_tp_partition_dim(ff_weight_name) + ff_weight_shape = list(hf_weight_shape)[::-1] + # print(ff_partition_dim, ff_weight_name, hf_w_path, ff_weight_shape) + if ff_partition_dim >= 0: + ff_weight_shape[ff_partition_dim] //= self.tp_degree + + # 2. handle flexflow shards in case of tensor parallelism + if hf_weight_name.endswith(".bias") and ff_partition_dim == -1: + # unpartitioned bias (E.g. 
replicated bias) only lives on shard 0 + ff_weight = load_ff_tensor(ff_w_path, ff_weight_shape) + else: + ff_weights = [load_ff_tensor(ff_w_path.replace("shard_0", f"shard_{tp_idx}"), ff_weight_shape) for tp_idx in range(self.tp_degree)] + if self.tp_degree > 1: + if ff_partition_dim >= 0: + ff_weight = np.concatenate(ff_weights, axis=ff_partition_dim) + else: + assert(are_np_arrays_identical(ff_weights)) + ff_weight = ff_weights[0] + else: + ff_weight = ff_weights[0] + ff_weight = torch.from_numpy(ff_weight).to(hf_weight.dtype) + # print("comparing weight tensor: ", hf_weight_name, " and ", ff_weight_name) + # check equivalence + try: + torch.testing.assert_close(ff_weight, hf_weight.T) + except Exception as e: + print(f"Error comparing {ff_w_path} weight to {hf_w_path}:\n{e}\n") + raise e + + def check_fwd_pass(self, step_idx=0): + hf_fwd_folder = os.path.join(hf_path, "fwd", f"step_{step_idx}") + ff_fwd_folder = os.path.join(ff_path, "fwd", f"step_{step_idx}", "shard_0") + + def convert_hf_filename_to_ff(hf_filename): + if hf_filename == "embed_tokens" or hf_filename == "embed_positions": + f_version = f"layers.0.{hf_filename}" + elif hf_filename == "lm_head" or hf_filename == "final_layer_norm": + f_version = f"layers.{self.num_layers-1}.{hf_filename}" + else: + assert hf_filename.startswith("layers.") + layernum = hf_filename.split("layers.")[1].split(".")[0] + f_version = f"layers.{layernum}." + f_version += hf_filename.replace(".base_layer", "").replace(".default", "") + # right now, attention in flexflow is done with a single operator, so there is a single output file without the projection suffix + f_version = f_version.replace(".q_proj", ".qkv_proj").replace(".k_proj", ".qkv_proj").replace(".v_proj", ".qkv_proj") + return f_version + + def get_hf_tensor(hf_tensor_name, tensor_comparison_idx): + hf_tensor_filename = f"{hf_tensor_name}.{tensor_comparison_idx.hf_tensor_type}_{tensor_comparison_idx.hf_tensor_idx}" + hf_tensor_path = os.path.join(hf_fwd_folder, hf_tensor_filename) + + if not os.path.isfile(hf_tensor_path): + raise FileNotFoundError(f"File '{hf_tensor_path}' not found") + print("loading hf tensor: ", hf_tensor_filename) + hf_tensor = torch.load(hf_tensor_path, map_location='cpu') + if hf_tensor_name == "embed_tokens": + self.num_tokens = hf_tensor.shape[1] + return hf_tensor + + def get_ff_tensor(ff_tensor_name, tensor_comparison_idx, hf_shape, tp_type=TPType.REPLICATE): + ff_tensor_suffix = f".{tensor_comparison_idx.ff_tensor_type}" if len(tensor_comparison_idx.ff_tensor_type) > 0 else "" + ff_tensor_idx_suffix = f"_{tensor_comparison_idx.ff_tensor_idx}" if tensor_comparison_idx.ff_tensor_idx is not None else "" + ff_tensor_filename = f"{ff_tensor_name}{ff_tensor_suffix}{ff_tensor_idx_suffix}" + ff_tensor_path = os.path.join(ff_fwd_folder, ff_tensor_filename) + if not os.path.isfile(ff_tensor_path): + raise FileNotFoundError(f"File '{ff_tensor_path}' not found") + + print("loading ff tensor: ", ff_tensor_filename) + ff_shape = list(hf_shape)[::-1] + if tp_type == TPType.PARTITION: + ff_shape[0] //= self.tp_degree + + if "layers.0.embed_tokens.input_0" in ff_tensor_path: + # get number of tokens + ff_tensor = np.loadtxt(ff_tensor_path, delimiter=',') + self.ff_batch_size = ff_tensor.shape[0] + + ff_shape = replace_value(ff_shape, self.num_tokens, self.ff_batch_size) + ff_tensors = [load_ff_tensor(ff_tensor_path.replace("shard_0", f"shard_{tp_idx}"), ff_shape) for tp_idx in range(self.tp_degree)] + if self.tp_degree > 1: + # if replicate, check that they are identical + 
if tp_type == TPType.REPLICATE: + assert(are_np_arrays_identical(ff_tensors)) + ff_tensor = ff_tensors[0] + # if partition, concatenate along the partition dimension + elif tp_type == TPType.PARTITION: + ff_tensor = np.concatenate(ff_tensors, axis=0) + # if to_reduce, sum along the partition dimension + elif tp_type == TPType.TO_REDUCE: + ff_tensor = np.sum(ff_tensors, axis=0) + else: + ff_tensor = ff_tensors[0] + ff_tensor = torch.from_numpy(ff_tensor) + ff_tensor = truncate_dimension(ff_tensor, self.ff_batch_size, self.num_tokens) + return ff_tensor + + def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance=1e-2): + ff_tensor = ff_tensor.to(hf_tensor.dtype) + hf_tensor = hf_tensor.T + if additional_ff_tensor is not None: + additional_ff_tensor = additional_ff_tensor.to(hf_tensor.dtype) + ff_tensor = ff_tensor - additional_ff_tensor + try: + # torch.testing.assert_close(hf_tensor, ff_tensor, rtol=1.3e-6, atol=tolerance) + if not np.allclose(hf_tensor.detach().numpy(), ff_tensor.detach().numpy(), atol=tolerance): + mismatches = np.where(~np.isclose(hf_tensor.detach().numpy(), ff_tensor.detach().numpy(), atol=tolerance))[0] + print(f"Pct mismatch {label}: {100.0*(np.prod(mismatches.shape) / ff_tensor.numel()):.3f}%") + assert(np.prod(mismatches.shape) <= .05 * ff_tensor.numel()) + except Exception as e: + print(f"Error in comparison {label}:\n{e}\n") + print("HF tensor:") + print(hf_tensor.squeeze()) + print(hf_tensor.shape) + print("FF tensor:") + print(ff_tensor.squeeze()) + print(ff_tensor.shape) + raise e + + print(f"-- FWD pass {step_idx}--") + + # Embedding layer + hf_tensor_name = "embed_tokens" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label="Embedding input") + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label="Embedding output") + + # Positional embedding layer + hf_tensor_name = "embed_positions" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label="Position Embedding output") + + # Transformers blocks + for i in range(self.num_layers): + # Input layer norm + hf_tensor_name = f"layers.{i}.self_attn_layer_norm" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=1) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label=f"Self attention layernorm {i} input") + hf_tensor = get_hf_tensor(hf_tensor_name, 
output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label=f"Self attention layernorm {i} output") + + # Attention QKV projections + hf_q_proj_tensor_name = f"layers.{i}.self_attn.q_proj" + hf_k_proj_tensor_name = f"layers.{i}.self_attn.k_proj" + hf_v_proj_tensor_name = f"layers.{i}.self_attn.v_proj" + ff_qkv_tensor_name = convert_hf_filename_to_ff(hf_q_proj_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_q_proj_in = get_hf_tensor(hf_q_proj_tensor_name, input_comparison) + hf_k_proj_in = get_hf_tensor(hf_k_proj_tensor_name, input_comparison) + hf_v_proj_in = get_hf_tensor(hf_v_proj_tensor_name, input_comparison) + hf_q_proj_out = get_hf_tensor(hf_q_proj_tensor_name, output_comparison) + hf_k_proj_out = get_hf_tensor(hf_k_proj_tensor_name, output_comparison) + hf_v_proj_out = get_hf_tensor(hf_v_proj_tensor_name, output_comparison) + ff_qkv_tensor_in = get_ff_tensor(ff_qkv_tensor_name, input_comparison, hf_q_proj_in.shape) + torch.testing.assert_close(hf_q_proj_in, hf_k_proj_in) + torch.testing.assert_close(hf_k_proj_in, hf_v_proj_in) + compare(hf_q_proj_in, ff_qkv_tensor_in, label=f"QKV proj {i} input") + ff_qkv_tensor_out = get_ff_tensor( + ff_qkv_tensor_name, + output_comparison, + torch.Size([hf_q_proj_out.shape[0], hf_q_proj_out.shape[1], 3*hf_q_proj_out.shape[2]]), + tp_type=TPType.PARTITION + ) + head_dim = hf_q_proj_out.shape[2] // self.num_attention_heads + heads_per_shard = self.num_attention_heads // self.tp_degree + chunk_size = head_dim * heads_per_shard + # print(ff_qkv_tensor_out.shape) + ff_qproj_out = ff_qkv_tensor_out[:chunk_size, :, :] + ff_kproj_out = ff_qkv_tensor_out[chunk_size:2*chunk_size, :, :] + ff_vproj_out = ff_qkv_tensor_out[2*chunk_size : 3*chunk_size, :, :] + qkv_chunk_size = 3*chunk_size + for tp_idx in range(1, self.tp_degree): + prev_size = tp_idx * qkv_chunk_size + ff_qproj_out_ = ff_qkv_tensor_out[prev_size : prev_size + chunk_size, :, :] + ff_kproj_out_ = ff_qkv_tensor_out[prev_size + chunk_size : prev_size + 2*chunk_size, :, :] + ff_vproj_out_ = ff_qkv_tensor_out[prev_size + 2*chunk_size : prev_size + 3*chunk_size, :, :] + ff_qproj_out = np.concatenate((ff_qproj_out, ff_qproj_out_), axis=0) + ff_kproj_out = np.concatenate((ff_kproj_out, ff_kproj_out_), axis=0) + ff_vproj_out = np.concatenate((ff_vproj_out, ff_vproj_out_), axis=0) + compare_loaded_tensors(hf_q_proj_out.T, ff_qproj_out) + compare_loaded_tensors(hf_k_proj_out.T, ff_kproj_out) + compare_loaded_tensors(hf_v_proj_out.T, ff_vproj_out) + ff_tensor_name = f"layers.{i}.layers.{i}.self_attn" + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + ff_attn_tensor_in = get_ff_tensor( + ff_tensor_name, + input_comparison, + torch.Size([hf_q_proj_out.shape[0], hf_q_proj_out.shape[1], 3*hf_q_proj_out.shape[2]]), + tp_type=TPType.PARTITION + ) + assert torch.allclose(ff_qkv_tensor_out, ff_attn_tensor_in) + + # Compared scaled qproj + hf_tensor_name = f"layers.{i}.self_attn.scaled_qproj" + input_c = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + output_c = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + scaled_qproj_in = 
get_hf_tensor(hf_tensor_name, input_c) + scaled_qproj_out = get_hf_tensor(hf_tensor_name, output_c) + assert torch.allclose(scaled_qproj_in, scaled_qproj_out) + ff_tensor_name = f"layers.{i}.layers.{i}.self_attn.scaled_qkv_proj" + scaled_qkv_proj0 = load_ff_tensor(os.path.join(ff_fwd_folder, f"{ff_tensor_name}.output_0"), [64*6,3,9]) + scaled_qkv_proj1 = load_ff_tensor(os.path.join(ff_fwd_folder, f"{ff_tensor_name}.output_0").replace("shard_0", "shard_1"), [64*6,3,9]) + ff_scaled_qkv_proj = np.concatenate([scaled_qkv_proj0, scaled_qkv_proj1], axis=0) + ff_scaled_q_proj = torch.from_numpy(ff_scaled_qkv_proj[:, :1, :]).to(scaled_qproj_out.dtype) + # print("HF scaled qproj:") + # print(scaled_qproj_out.squeeze().T) + # print("FF scaled q proj:") + # print(ff_scaled_q_proj.squeeze()) + # print("HF unscaled qproj:") + # print(hf_q_proj_out.squeeze().T) + # print("FF unscaled qproj:") + # print(torch.from_numpy(ff_qproj_out.squeeze()).to(scaled_qproj_out.dtype)) + # assert torch.allclose(hf_q_proj_out.squeeze().T, ff_scaled_q_proj.squeeze()) + + + + # check that out_proj input, attn_scores out and input are identical on the hf side + hf_tensor_name = f"layers.{i}.self_attn.attn_scores" + input_c = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + output_c = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + attn_scores_in = get_hf_tensor(hf_tensor_name, input_c) + attn_scores_out = get_hf_tensor(hf_tensor_name, output_c) + hf_tensor_name = f"layers.{i}.self_attn.out_proj" + out_proj_in = get_hf_tensor(hf_tensor_name, input_c) + assert torch.allclose(attn_scores_in, attn_scores_out) + assert torch.allclose(attn_scores_in, out_proj_in) + + # Compare out proj input. 
This should be the output of the attention without any bias involved + hf_tensor_name = f"layers.{i}.self_attn.out_proj" + ff_tensor_name = f"layers.{i}.layers.{i}.self_attn" + output_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + print("comparing attention tensor: ", hf_tensor_name, " and ", ff_tensor_name) + compare(hf_tensor, ff_tensor, label=f"Attention o-proj {i} input") + + hf_tensor_name = f"layers.{i}.self_attn.attn_scores" + ff_tensor_name = f"layers.{i}.layers.{i}.self_attn" + output_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"Attention {i} output") + + # hf_tensor_name = f"layers.{i}.final_layer_norm" + # ff_tensor_name = f"layers.{i}.layers.{i}.add_bias_residual_layer_norm" + # output_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + # hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + # ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) + # compare(hf_tensor, ff_tensor, label=f"Add Bias Residula LN {i} output 0") + + hf_tensor_name = f"layers.{i}.self_attn.out_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name.replace(".out_proj", ".o_proj")) + # # the raw attention result, w/o o_proj. 
This is the output of senf_attn of FF and the input of o_proj in HF + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + # # TP for self-attn partitions the attention heads across TP workers + # ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + print("comparing attention tensor: ", hf_tensor_name, " and ", ff_tensor_name) + # compare(hf_tensor, ff_tensor, label=f"Attention oproj {i} output") + + # hf_tensor_name = f"layers.{i}.self_attn.out_proj" + # ff_tensor_name = f"layers.{i}.layers.{i}.self_attn" + # output_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + # hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + # ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + # print("comparing attention tensor: ", hf_tensor_name, " and ", ff_tensor_name) + # compare(hf_tensor, ff_tensor, label=f"Attention {i} output") + + + + # # Post-attention layernorm + # hf_tensor_name = f"layers.{i}.add_bias_residual_layer_norm" + # ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + # output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=1) + # hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + # ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape) + # compare(hf_tensor, ff_tensor, label=f"Add bias residual layernorm {i} output") + + # FC1 (+ ReLU) + hf_tensor_name = f"layers.{i}.activation_fn" + ff_tensor_name = convert_hf_filename_to_ff(f"layers.{i}.fc1") + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"FC1 {i} output") + + # FC2 + hf_tensor_name = f"layers.{i}.fc2" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_down_proj_out = get_hf_tensor(hf_tensor_name, output_comparison) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"FC2 {i} input") + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + # compare(hf_tensor, ff_tensor, label=f"FC2 {i} output") + + hf_down_proj_in = hf_tensor.clone() + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_down_proj_out = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + + # Norm + hf_tensor_name = "final_layer_norm" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", 
ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=1) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label="Final layer norm output") + + # LM head + hf_tensor_name = "lm_head" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) + compare(hf_tensor, ff_tensor, label="LM head input") + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label="LM head output") + +parser = argparse.ArgumentParser(description='Argument Parser Example') +# Adding arguments +parser.add_argument('-m', '--model-name', type=str, default="goliaro/llama-160m-lora", help='Name of the model') +parser.add_argument('-n', '--num-steps', type=int, default=1, help='Number of decoding steps') +parser.add_argument('-tp', '--tensor-parallelism-degree', type=int, default=1, help='The tensor parallelism degree used when running FlexFlow') + +# Parse the arguments from command line +args = parser.parse_args() + +if __name__ == "__main__": + hf_config = AutoConfig.from_pretrained(args.model_name) + alignment_class = None + if hf_config.architectures[0] == "LlamaForCausalLM": + alignment_class = LllamaAlignmentTest(hf_config, tp_degree=args.tensor_parallelism_degree) + elif hf_config.architectures[0] == "OPTForCausalLM": + alignment_class = OPTAlignmentTest(hf_config, tp_degree=args.tensor_parallelism_degree) + + # alignment_class.check_weights_alignment() + for i in range(args.num_steps): + alignment_class.check_fwd_pass(i) diff --git a/tests/peft/alignment/align_test_utils.py b/tests/peft/alignment/align_test_utils.py index 93727bdc89..3085bbda56 100644 --- a/tests/peft/alignment/align_test_utils.py +++ b/tests/peft/alignment/align_test_utils.py @@ -3,6 +3,8 @@ from typing import List from enum import Enum from dataclasses import dataclass +import warnings + abs_dirname = os.path.dirname(os.path.abspath(__file__)) cache_folder = os.path.expanduser(os.getenv("FF_CACHE_PATH", "~/.cache/flexflow")) @@ -472,7 +474,16 @@ def replace_value(lst, old_value, new_value): if occurrences == 0: raise ValueError(f"Value {old_value} not found in the list.") elif occurrences > 1: - raise ValueError(f"Multiple instances of {old_value} found in the list.") + warnings.warn(f"Multiple instances of {old_value} found in the list.") + occurrence_idx=0 + for i, value in enumerate(lst): + if value == old_value: + occurrence_idx += 1 + if occurrence_idx == 2: + lst[i] = new_value + break + return lst + # raise ValueError(f"Multiple instances of {old_value} found in the list.") else: index = lst.index(old_value) lst[index] = new_value diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py index 16b46cfa81..a2fc5548ab 100644 --- a/tests/peft/hf_finetune.py +++ b/tests/peft/hf_finetune.py @@ -77,7 +77,7 @@ def main(): if args.save_peft_tensors: make_debug_dirs() register_peft_hooks(model) - save_peft_weights(model, target_modules=["lora", "lm_head", 
"down_proj"]) + save_model_weights(model, target_modules=["lora", "lm_head", "down_proj"]) # Load fine-tuning dataset data = load_dataset("Abirate/english_quotes") diff --git a/tests/peft/hf_utils.py b/tests/peft/hf_utils.py index 9332c803b2..94fb96f029 100644 --- a/tests/peft/hf_utils.py +++ b/tests/peft/hf_utils.py @@ -40,7 +40,7 @@ def get_dst_folder(subdir, step_idx=0): def simplify_name(name): - return name.replace("base_model.model.model.", "").replace("base_model.model.", "") + return name.replace("base_model.model.model.", "").replace("base_model.model.", "").replace("model.layers.", "layers.").replace("model.", "").replace("decoder.", "") def get_optim_type(args): @@ -114,7 +114,7 @@ def peft_backward_hook(module, grad_input, grad_output): module.bwd_step += 1 -def peft_forward_hook(module, input, output): +def fwd_hook(module, input, output): if len(input) == 0 or len(output) == 0: return assert module.name is not None and module.fwd_step is not None @@ -312,11 +312,18 @@ def register_peft_hooks(model): layer.bwd_step = 0 if verbose: print(f"Adding hooks to layer {layer.name}") - layer.register_forward_hook(peft_forward_hook) + layer.register_forward_hook(fwd_hook) layer.register_full_backward_hook(peft_backward_hook) +def register_inference_hooks(model): + for name, layer in dict(model.named_modules()).items(): + layer.name = name + layer.fwd_step = 0 + if verbose: + print(f"Adding hooks to layer {layer.name}") + layer.register_forward_hook(fwd_hook) -def save_peft_weights(model, target_modules=[]): +def save_model_weights(model, target_modules=[]): # Save any weights of interest for name, params in model.named_parameters(): simplified_name = simplify_name(name) diff --git a/tests/peft/peft_alignment_test.py b/tests/peft/peft_alignment_test.py index 266bb64137..cc677cd51a 100644 --- a/tests/peft/peft_alignment_test.py +++ b/tests/peft/peft_alignment_test.py @@ -98,14 +98,14 @@ def get_tp_partition_dim(ff_weight_name) -> int: # 1. get shape of hf weight hf_weight = torch.load(hf_w_path, map_location='cpu') - hf_weigth_shape = hf_weight.shape + hf_weight_shape = hf_weight.shape ff_partition_dim = get_tp_partition_dim(ff_weight_name) - ff_weigth_shape = list(hf_weigth_shape)[::-1] + ff_weight_shape = list(hf_weight_shape)[::-1] if ff_partition_dim >= 0: - ff_weigth_shape[ff_partition_dim] //= self.tp_degree + ff_weight_shape[ff_partition_dim] //= self.tp_degree # 2. 
handle flexflow shards in case of tensor parallelism - ff_weights = [load_ff_tensor(ff_w_path.replace("shard_0", f"shard_{tp_idx}"), ff_weigth_shape) for tp_idx in range(self.tp_degree)] + ff_weights = [load_ff_tensor(ff_w_path.replace("shard_0", f"shard_{tp_idx}"), ff_weight_shape) for tp_idx in range(self.tp_degree)] if self.tp_degree > 1: if ff_partition_dim >= 0: ff_weight = np.concatenate(ff_weights, axis=ff_partition_dim) @@ -149,6 +149,7 @@ def get_hf_tensor(hf_tensor_name, tensor_comparison_idx): if not os.path.isfile(hf_tensor_path): raise FileNotFoundError(f"File '{hf_tensor_path}' not found") + print("loading hf tensor: ", hf_tensor_filename) hf_tensor = torch.load(hf_tensor_path, map_location='cpu') if hf_tensor_name == "embed_tokens": self.num_tokens = hf_tensor.shape[1] @@ -162,6 +163,7 @@ def get_ff_tensor(ff_tensor_name, tensor_comparison_idx, hf_shape, tp_type=TPTyp if not os.path.isfile(ff_tensor_path): raise FileNotFoundError(f"File '{ff_tensor_path}' not found") + print("loading ff tensor: ", ff_tensor_filename) ff_shape = list(hf_shape)[::-1] if tp_type == TPType.PARTITION: ff_shape[0] //= self.tp_degree @@ -206,8 +208,10 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance print(f"Error in comparison {label}:\n{e}\n") print("HF tensor:") print(hf_tensor.squeeze()) + print(hf_tensor.shape) print("FF tensor:") print(ff_tensor.squeeze()) + print(ff_tensor.shape) raise e print(f"-- FWD pass {step_idx}--") @@ -245,9 +249,13 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance # Attention hf_tensor_name = f"layers.{i}.self_attn.o_proj" ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) - output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + # the raw attention result, w/o o_proj. 
This is the output of senf_attn of FF and the input of o_proj in HF + output_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) - ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + # ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + # TP for self-attn partitions the attention heads across TP workers + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + print("comparing attention tensor: ", hf_tensor_name, " and ", ff_tensor_name) compare(hf_tensor, ff_tensor, label=f"Attention {i} output") # Post-attention layernorm @@ -365,6 +373,7 @@ def get_hf_tensor(hf_tensor_name, tensor_comparison_idx): if not os.path.isfile(hf_tensor_path): raise FileNotFoundError(f"File '{hf_tensor_path}' not found") + print("loading hf tensor: ", hf_tensor_filename) hf_tensor = torch.load(hf_tensor_path, map_location='cpu') return hf_tensor @@ -378,6 +387,7 @@ def get_ff_tensor(ff_tensor_name, tensor_comparison_idx, hf_shape, tp_type=TPTyp ff_tensor_path = ff_tensor_path.replace(f"step_{step_idx}", f"step_{step_idx}_pre") if not os.path.isfile(ff_tensor_path): raise FileNotFoundError(f"File '{ff_tensor_path}' not found") + print("loading ff tensor: ", ff_tensor_filename) ff_shape = list(hf_shape)[::-1] if tp_type == TPType.PARTITION: @@ -392,8 +402,10 @@ def get_ff_tensor(ff_tensor_name, tensor_comparison_idx, hf_shape, tp_type=TPTyp tensor_comparison_idx.ff_tensor_type == "output_gradient" or tensor_comparison_idx.ff_tensor_type == "input_gradient" ) - ) + ) and + not ff_tensor_name.endswith(".self_attn.qkv_proj") ) + print(ff_tensor_filename + (" is not truncated" if intermediate_attention_tensor else " is truncated")) if not intermediate_attention_tensor: ff_shape = replace_value(ff_shape, self.num_tokens, self.ff_batch_size) @@ -432,8 +444,10 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance print(f"Error in comparison {label}:\n{e}\n") print("HF tensor:") print(hf_tensor.squeeze()) + print(hf_tensor.shape) print("FF tensor:") print(ff_tensor.squeeze()) + print(ff_tensor.shape) raise e print(f"-- BWD pass {step_idx}--") @@ -533,11 +547,12 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance # Attn O-proj hf_tensor_name = f"layers.{i}.self_attn.o_proj" - ff_tensor_name = f"layers.{i}.layers.{i}.self_attn" + ff_tensor_name = f"layers.{i}.layers.{i}.self_attn.o_proj" + # ff_tensor_name = f"layers.{i}.layers.{i}.self_attn" output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) - hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) - ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) - compare(hf_tensor, ff_tensor, label=f"Attn O-proj {i} gradient output") + # hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + # ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + # compare(hf_tensor, ff_tensor, label=f"Attn O-proj {i} gradient output") ff_tensor_name = f"layers.{i}.layers.{i}.self_attn.o_proj" input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) hf_tensor = 
get_hf_tensor(hf_tensor_name, input_comparison) @@ -579,7 +594,7 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance # FF Attn input with HF layernorm out hf_tensor_name = f"layers.{i}.input_layernorm" - ff_tensor_name = f"layers.{i}.layers.{i}.self_attn" + ff_tensor_name = f"layers.{i}.layers.{i}.self_attn.qkv_proj" input_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) From 6da4f4ad0cb20cbc54da9acb9d736fdbb34a082e Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 1 Oct 2024 04:41:28 +0000 Subject: [PATCH 30/44] Add support for max_new_tokens parameter --- include/flexflow/batch_config.h | 4 +- include/flexflow/flexflow_c.h | 3 +- include/flexflow/request_manager.h | 3 +- inference/incr_decoding/incr_decoding.cc | 2 +- inference/peft/peft.cc | 2 +- inference/peft/peft_bwd_benchmark.cc | 6 +- inference/peft/peft_fwd_benchmark.cc | 2 +- inference/peft/req_rate_benchmark.cc | 8 +- inference/spec_infer/spec_infer.cc | 2 +- python/flexflow/core/flexflow_cffi.py | 59 +++++------ python/flexflow/serve/serve.py | 11 ++- src/c/flexflow_c.cc | 32 ++++-- src/ops/add_bias_residual_layer_norm.cpp | 2 +- src/ops/add_bias_residual_layer_norm.cu | 2 +- src/ops/kernels/linear_kernels.cpp | 2 +- src/ops/kernels/linear_kernels.cu | 2 +- src/ops/kernels/lora_linear_kernels.cpp | 2 +- src/ops/kernels/lora_linear_kernels.cu | 2 +- src/ops/kernels/residual_rms_norm_kernels.cpp | 2 +- src/ops/kernels/residual_rms_norm_kernels.cu | 2 +- src/ops/kernels/rms_norm_kernels.cpp | 2 +- src/ops/kernels/rms_norm_kernels.cu | 2 +- src/ops/layer_norm.cpp | 2 +- src/ops/layer_norm.cu | 2 +- src/ops/residual_layer_norm.cpp | 2 +- src/ops/residual_layer_norm.cu | 2 +- src/ops/sigmoid_silu_multi.cpp | 2 +- src/ops/sigmoid_silu_multi.cu | 2 +- src/runtime/batch_config.cc | 4 +- src/runtime/beam_search_batch_config.cc | 4 +- src/runtime/request_manager.cc | 97 ++++++++++++------- src/runtime/tree_verify_batch_config.cc | 4 +- 32 files changed, 164 insertions(+), 111 deletions(-) diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 873fed0bdb..a509af765c 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -87,7 +87,7 @@ class BatchConfig { first_token_depth_in_request = 0; first_token_offset_in_batch = 0; num_tokens_in_batch = 0; - max_sequence_length = 0; + max_length = 0; request_guid = 0; prompt_phase = false; batch_config_request_id = -1; @@ -98,7 +98,7 @@ class BatchConfig { int first_token_depth_in_request; int first_token_offset_in_batch; int num_tokens_in_batch; - int max_sequence_length; + int max_length; // request id in batch config: int batch_config_request_id = -1; diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index c1e18e660b..52f67d8efb 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -651,7 +651,8 @@ void flexflow_model_generate(flexflow_model_t handle_, enum RequestType *request_types, char const **input_texts, char **output_texts, - int *max_seq_lengths, + int *max_lengths, + int *max_new_tokens_, flexflow_peft_model_id_t *peft_model_ids, char const **dataset_filepaths, int *training_steps, diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index f0fab957ee..36a56012fc 
100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -67,7 +67,8 @@ struct Request { }; BatchConfig::RequestGuid guid; PEFTModelID peft_model_id = PEFTModelID::NO_ID; - int max_sequence_length = 128; + int max_length = -1; + int max_new_tokens = 128; int initial_len; int ssm_cache_size = 0; int llm_cache_size = 0; diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index c9ffff5c07..f8e16f24fa 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -271,7 +271,7 @@ void FlexFlow::top_level_task(Task const *task, printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); Request inference_req; inference_req.prompt = text; - inference_req.max_sequence_length = 128; + inference_req.max_length = 128; requests.push_back(inference_req); total_num_requests++; } diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index c55f2c0bfd..ee5bd1b460 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -340,7 +340,7 @@ void FlexFlow::top_level_task(Task const *task, printf("Inference prompt[%d]: %s\n", total_num_requests, text.c_str()); Request inference_req; inference_req.prompt = text; - inference_req.max_sequence_length = 128; + inference_req.max_length = 128; inference_req.peft_model_id = (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; requests.push_back(inference_req); diff --git a/inference/peft/peft_bwd_benchmark.cc b/inference/peft/peft_bwd_benchmark.cc index 86d6d8cbbf..df9a1e35db 100644 --- a/inference/peft/peft_bwd_benchmark.cc +++ b/inference/peft/peft_bwd_benchmark.cc @@ -308,7 +308,7 @@ void FlexFlow::top_level_task(Task const *task, for (int i = 0; i < 100; i++) { Request inference_req; inference_req.benchmarking_tokens = 128; - inference_req.max_sequence_length = 256; + inference_req.max_length = 256; inference_req.warmup = true; inference_req.peft_model_id = (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; @@ -317,7 +317,7 @@ void FlexFlow::top_level_task(Task const *task, Request fine_tuning_req; fine_tuning_req.req_type = RequestType::REQ_FINETUNING; fine_tuning_req.benchmarking_tokens = 1024; - fine_tuning_req.max_sequence_length = 1024; + fine_tuning_req.max_length = 1024; fine_tuning_req.warmup = true; fine_tuning_req.peft_model_id = (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; @@ -361,7 +361,7 @@ void FlexFlow::top_level_task(Task const *task, Request fine_tuning_req; fine_tuning_req.req_type = RequestType::REQ_FINETUNING; fine_tuning_req.benchmarking_tokens = lengths[i]; - fine_tuning_req.max_sequence_length = lengths[i]; + fine_tuning_req.max_length = lengths[i]; fine_tuning_req.peft_model_id = (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; fine_tuning_req.max_training_steps = 1; diff --git a/inference/peft/peft_fwd_benchmark.cc b/inference/peft/peft_fwd_benchmark.cc index 9ff042c157..9b020f5954 100644 --- a/inference/peft/peft_fwd_benchmark.cc +++ b/inference/peft/peft_fwd_benchmark.cc @@ -333,7 +333,7 @@ void FlexFlow::top_level_task(Task const *task, // sequence_length); Request inference_req; inference_req.benchmarking_tokens = prompt.first; - inference_req.max_sequence_length = prompt.second + prompt.first; + inference_req.max_length = prompt.second + prompt.first; inference_req.peft_model_id = (peft_model_id != nullptr) ? 
*peft_model_id : PEFTModelID::NO_ID; requests.push_back(inference_req); diff --git a/inference/peft/req_rate_benchmark.cc b/inference/peft/req_rate_benchmark.cc index 43008e74fe..cde3b1c02e 100644 --- a/inference/peft/req_rate_benchmark.cc +++ b/inference/peft/req_rate_benchmark.cc @@ -369,7 +369,7 @@ void FlexFlow::top_level_task(Task const *task, for (int i = 0; i < 100; i++) { Request inference_req; inference_req.benchmarking_tokens = 128; - inference_req.max_sequence_length = 256; + inference_req.max_length = 256; inference_req.warmup = true; inference_req.peft_model_id = (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; @@ -379,7 +379,7 @@ void FlexFlow::top_level_task(Task const *task, Request fine_tuning_req; fine_tuning_req.req_type = RequestType::REQ_FINETUNING; fine_tuning_req.benchmarking_tokens = 1024; - fine_tuning_req.max_sequence_length = 1024; + fine_tuning_req.max_length = 1024; fine_tuning_req.warmup = true; fine_tuning_req.peft_model_id = (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; @@ -443,7 +443,7 @@ void FlexFlow::top_level_task(Task const *task, Request fine_tuning_req; fine_tuning_req.req_type = RequestType::REQ_FINETUNING; fine_tuning_req.benchmarking_tokens = 1024; - fine_tuning_req.max_sequence_length = 1024; + fine_tuning_req.max_length = 1024; fine_tuning_req.peft_model_id = (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; fine_tuning_req.max_training_steps = 1000000000; @@ -473,7 +473,7 @@ void FlexFlow::top_level_task(Task const *task, // sequence_length); Request inference_req; inference_req.benchmarking_tokens = prompt.first; - inference_req.max_sequence_length = prompt.second + prompt.first; + inference_req.max_length = prompt.second + prompt.first; inference_req.peft_model_id = (peft_model_id != nullptr) ? 
           *peft_model_id : PEFTModelID::NO_ID;
       requests.push_back(inference_req);
diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc
index 9689080825..134ae70c4a 100644
--- a/inference/spec_infer/spec_infer.cc
+++ b/inference/spec_infer/spec_infer.cc
@@ -421,7 +421,7 @@ void FlexFlow::top_level_task(Task const *task,
       // Add inference request
       Request inference_req;
       inference_req.prompt = text;
-      inference_req.max_sequence_length = 128;
+      inference_req.max_length = 128;
       requests.push_back(inference_req);
       total_num_requests++;
     }
diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py
index a5aadc270e..9b35b249d9 100644
--- a/python/flexflow/core/flexflow_cffi.py
+++ b/python/flexflow/core/flexflow_cffi.py
@@ -38,9 +38,10 @@
 )
 from flexflow.config import *
 from .flexflowlib import ffi, flexflow_library
-from typing import Union, List
+from typing import Union, List, Optional
+from dataclasses import dataclass
 from peft import LoraConfig
-import json
+import json, math
 from dataclasses import dataclass
@@ -2050,25 +2051,16 @@ def no_id_handle():
 # Request
 # -----------------------------------------------------------------------
-
+@dataclass
 class Request:
     """A class to record the metadata of an inference or finetuning request."""
-
-    def __init__(
-        self,
-        req_type: RequestType,
-        prompt: str = None,
-        max_sequence_length: int = 128,
-        peft_model_id: PEFTModelID = None,
-        dataset_filepath: str = None,
-        max_training_steps: int = 1,
-    ):
-        self.req_type = req_type
-        self.prompt = prompt
-        self.max_sequence_length = max_sequence_length
-        self.peft_model_id = peft_model_id
-        self.dataset_filepath = dataset_filepath
-        self.max_training_steps = max_training_steps
+    req_type: RequestType
+    prompt: Optional[str] = None
+    max_length: int = -1
+    max_new_tokens: int = 128
+    peft_model_id: Optional[PEFTModelID] = None
+    dataset_filepath: Optional[str] = None
+    max_training_steps: int = 1
 # -----------------------------------------------------------------------
@@ -4658,19 +4650,23 @@ def get_output_tensor(self, ffmodel, data_type):
         assert ret_val == True
         return np_array
-    def generate_inf_only(self, prompt_list: List[str], max_sequence_length: int = 128):
+    def generate_inf_only(self, prompt_list: List[str], max_length: int = -1, max_new_tokens: int = 128):
+        if max_length != -1 and max_new_tokens != -1:
+            warnings.warn(f"Both `max_new_tokens` (={max_new_tokens}) and `max_length`(={max_length}) seem to have been set.
`max_new_tokens` will take precedence.") assert isinstance(prompt_list, list) c_input_texts = [get_c_name(prompt) for prompt in prompt_list] - max_num_chars = 5 * (max_sequence_length + 100) + estimated_max_tokens = math.ceil(max_new_tokens + max([len(prompt.split()) for prompt in prompt_list])*1.5) if max_new_tokens != -1 else max_length + max_num_chars = 5 * (estimated_max_tokens + 100) c_output_texts = [ffi.new("char[]", max_num_chars) for prompt in prompt_list] c_output_length_and_tokens = [ - ffi.new("int[]", max_sequence_length + 100) for prompt in prompt_list + ffi.new("int[]", estimated_max_tokens + 100) for prompt in prompt_list ] c_request_types = [ enum_to_int(RequestType, RequestType.REQ_INFERENCE) for prompt in prompt_list ] - max_sequence_lengths = [max_sequence_length for prompt in prompt_list] + max_lengths = [max_length for prompt in prompt_list] + max_new_tokens_ = [max_new_tokens for prompt in prompt_list] peft_model_ids = [PEFTModelID.no_id_handle() for prompt in prompt_list] dataset_filepaths = [ffi.NULL for prompt in prompt_list] training_steps = [0 for prompt in prompt_list] @@ -4682,7 +4678,8 @@ def generate_inf_only(self, prompt_list: List[str], max_sequence_length: int = 1 c_request_types, c_input_texts, c_output_texts, - max_sequence_lengths, + max_lengths, + max_new_tokens_, peft_model_ids, dataset_filepaths, training_steps, @@ -4719,9 +4716,16 @@ def generate(self, requests_list: List[Request]): c_request_types = [ enum_to_int(RequestType, request.req_type) for request in requests_list ] - max_sequence_lengths = [ - request.max_sequence_length for request in requests_list + max_lengths = [ + request.max_length for request in requests_list ] + max_new_tokens_ = [ + request.max_new_tokens for request in requests_list + ] + for i in range(len(requests_list)): + if max_lengths[i] != -1 and max_new_tokens_[i] != -1: + warnings.warn(f"Both `max_new_tokens` (={max_new_tokens_[i]}) and `max_length`(={max_lengths[i]}) seem to have been set. 
`max_new_tokens` will take precedence.") + peft_model_ids = [ ( request.peft_model_id @@ -4745,7 +4749,8 @@ def generate(self, requests_list: List[Request]): c_request_types, c_input_texts, c_output_texts, - max_sequence_lengths, + max_lengths, + max_new_tokens_, peft_model_ids, dataset_filepaths, training_steps, diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 132c50995b..e3b6b47466 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -498,12 +498,17 @@ def compile( def generate( self, requests_or_prompts: Union[str, List[str], Request, List[Request]], - max_length: int = 128, + max_length: int = -1, + max_new_tokens: int = 128, ): """Generate tokens based on the input prompt(s) :param requests_or_prompts: The generation prompt(s) in the form of a string, a list of strings, a Request, or list of Requests :type requests_or_prompts: Union[str, List[str], Request, List[Request]] + :param max_length: The maximum length in tokens of the prompt + generated sequence, defaults to -1 (no maximum length) + :type max_length: int, optional + :param max_new_tokens: The maximum number of new tokens (excluding the prompt) to generate, defaults to 128 + :type max_new_tokens: int, optional :return: the generation results :rtype: GenerationResult """ @@ -511,7 +516,7 @@ def generate( if len(requests_or_prompts) == 0: return None return self.model.ffmodel.generate_inf_only( - [requests_or_prompts], max_length + [requests_or_prompts], max_length, max_new_tokens ) elif type(requests_or_prompts) == Request: return self.model.ffmodel.generate(requests_or_prompts) @@ -520,7 +525,7 @@ def generate( return [] if type(requests_or_prompts[0]) == str: return self.model.ffmodel.generate_inf_only( - requests_or_prompts, max_length + requests_or_prompts, max_length, max_new_tokens ) else: print(requests_or_prompts) diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index c6cf656ac0..bfa60a6d54 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1683,7 +1683,8 @@ void flexflow_model_generate(flexflow_model_t handle_, enum RequestType *request_types, char const **input_texts, char **output_texts, - int *max_seq_lengths, + int *max_lengths, + int *max_new_tokens_, flexflow_peft_model_id_t *peft_model_ids, char const **dataset_filepaths, int *training_steps, @@ -1698,21 +1699,24 @@ void flexflow_model_generate(flexflow_model_t handle_, std::string const text_str(input_texts[i]); Request inference_req; inference_req.prompt = text_str; - inference_req.max_sequence_length = max_seq_lengths[i]; + inference_req.max_length = max_lengths[i]; + inference_req.max_new_tokens = max_new_tokens_[i]; PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_ids[i]); if (peft_model_id != nullptr) { inference_req.peft_model_id = *peft_model_id; } requests.push_back(inference_req); - DEBUG_PRINT("[Model] generate[%d] %p %s %i", + DEBUG_PRINT("[Model] generate[%d] %p %s %i %i", i, handle, text_str.c_str(), - max_seq_lengths[i]); + max_lengths[i], + max_new_tokens_[i]); } else if (request_types[i] == RequestType::REQ_FINETUNING) { Request fine_tuning_req; fine_tuning_req.req_type = RequestType::REQ_FINETUNING; - fine_tuning_req.max_sequence_length = max_seq_lengths[i]; + fine_tuning_req.max_length = max_lengths[i]; + fine_tuning_req.max_new_tokens = max_new_tokens_[i]; PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_ids[i]); if (peft_model_id != nullptr) { fine_tuning_req.peft_model_id = *peft_model_id; @@ -1721,11 +1725,12 @@ void 
flexflow_model_generate(flexflow_model_t handle_,
       fine_tuning_req.dataset_filepath = dataset_fp;
       fine_tuning_req.max_training_steps = training_steps[i];
       requests.push_back(fine_tuning_req);
-      DEBUG_PRINT("[Model] finetune[%d] %p %s %i %i",
+      DEBUG_PRINT("[Model] finetune[%d] %p %s %i %i %i",
                   i,
                   handle,
                   dataset_fp.c_str(),
-                  max_seq_lengths[i],
+                  max_lengths[i],
+                  max_new_tokens_[i],
                   training_steps[i]);
     } else {
       assert(false && "Unknown request type");
     }
@@ -1739,8 +1744,17 @@ void flexflow_model_generate(flexflow_model_t handle_,
     // If the prompt exceeds max seq len, check that we return the prompt with
     // no additional token. Otherwise, check that the output does not exceed
     // the max sequence length.
-    assert(results[i].output_tokens.size() <= max_seq_lengths[i] ||
-           results[i].output_tokens.size() == results[i].input_tokens.size());
+    int total_tokens = results[i].output_tokens.size();
+    int num_output_tokens = total_tokens - results[i].input_tokens.size();
+    if (max_new_tokens_[i] >= 0) {
+      assert(num_output_tokens <= max_new_tokens_[i]);
+    }
+    if (max_lengths[i] >= 0) {
+      assert(total_tokens <= max_lengths[i] || num_output_tokens == 0);
+    }
+    // assert(results[i].output_tokens.size() <= max_seq_lengths[i] ||
+    //        results[i].output_tokens.size() ==
+    //        results[i].input_tokens.size());
     output_length_and_tokens[i][0] = results[i].output_tokens.size();
     std::copy(results[i].output_tokens.begin(),
               results[i].output_tokens.end(),
diff --git a/src/ops/add_bias_residual_layer_norm.cpp b/src/ops/add_bias_residual_layer_norm.cpp
index 681f55c998..cb140e0c75 100644
--- a/src/ops/add_bias_residual_layer_norm.cpp
+++ b/src/ops/add_bias_residual_layer_norm.cpp
@@ -224,7 +224,7 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper(
       continue;
     }
     int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch;
-    int max_peft_tokens = bc->requestsInfo[i].max_sequence_length;
+    int max_peft_tokens = bc->requestsInfo[i].max_length;
     int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch;
     int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1;
     if (bc->requestsInfo[i].peft_bwd) {
diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu
index bcca1ba2c6..2d2707f10b 100644
--- a/src/ops/add_bias_residual_layer_norm.cu
+++ b/src/ops/add_bias_residual_layer_norm.cu
@@ -222,7 +222,7 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper(
       continue;
     }
     int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch;
-    int max_peft_tokens = bc->requestsInfo[i].max_sequence_length;
+    int max_peft_tokens = bc->requestsInfo[i].max_length;
     int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch;
     int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1;
     if (bc->requestsInfo[i].peft_bwd) {
diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp
index a36d6719c9..6b371b840e 100644
--- a/src/ops/kernels/linear_kernels.cpp
+++ b/src/ops/kernels/linear_kernels.cpp
@@ -238,7 +238,7 @@ void inference_kernel_wrapper(LinearMeta *m,
       continue;
     }
     int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch;
-    int max_peft_tokens = bc->requestsInfo[i].max_sequence_length;
+    int max_peft_tokens = bc->requestsInfo[i].max_length;
     int first_token_offset = bc->requestsInfo[i].num_tokens_in_batch;
     if (bc->requestsInfo[i].peft_bwd) {
       size_t activation_size_needed =
diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu
index 3835d258e0..3832428c64 100644
--- a/src/ops/kernels/linear_kernels.cu
+++
b/src/ops/kernels/linear_kernels.cu @@ -239,7 +239,7 @@ void inference_kernel_wrapper(LinearMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].num_tokens_in_batch; if (bc->requestsInfo[i].peft_bwd) { size_t activation_size_needed = diff --git a/src/ops/kernels/lora_linear_kernels.cpp b/src/ops/kernels/lora_linear_kernels.cpp index c3c2cce3cf..eab8899167 100644 --- a/src/ops/kernels/lora_linear_kernels.cpp +++ b/src/ops/kernels/lora_linear_kernels.cpp @@ -249,7 +249,7 @@ void inference_kernel(LoraLinearMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != m->model_state.end()); diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 5f130782aa..93e5820f9c 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -248,7 +248,7 @@ void inference_kernel(LoraLinearMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != m->model_state.end()); diff --git a/src/ops/kernels/residual_rms_norm_kernels.cpp b/src/ops/kernels/residual_rms_norm_kernels.cpp index 016364edfd..cbdb8ee153 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cpp +++ b/src/ops/kernels/residual_rms_norm_kernels.cpp @@ -273,7 +273,7 @@ void inference_kernel_wrapper(ResidualRMSNormMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index 0d44f0260a..285a5a5b8f 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -270,7 +270,7 @@ void inference_kernel_wrapper(ResidualRMSNormMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/kernels/rms_norm_kernels.cpp b/src/ops/kernels/rms_norm_kernels.cpp index 4158628005..551cb72022 100644 --- a/src/ops/kernels/rms_norm_kernels.cpp +++ b/src/ops/kernels/rms_norm_kernels.cpp @@ -227,7 +227,7 @@ void inference_kernel_wrapper(RMSNormMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = 
bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index dd6ada864d..8f59d65ea7 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -225,7 +225,7 @@ void inference_kernel_wrapper(RMSNormMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/layer_norm.cpp b/src/ops/layer_norm.cpp index 27d314e21e..2fe4a85905 100644 --- a/src/ops/layer_norm.cpp +++ b/src/ops/layer_norm.cpp @@ -256,7 +256,7 @@ void LayerNorm::inference_kernel_wrapper(LayerNormMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index 0801d11617..b08b23819c 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -255,7 +255,7 @@ void LayerNorm::inference_kernel_wrapper(LayerNormMeta *m, continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/residual_layer_norm.cpp b/src/ops/residual_layer_norm.cpp index ed973b4f71..57c9ee1418 100644 --- a/src/ops/residual_layer_norm.cpp +++ b/src/ops/residual_layer_norm.cpp @@ -283,7 +283,7 @@ void ResidualLayerNorm::inference_kernel_wrapper( continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index 50c81d2099..c4f5866c2f 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -281,7 +281,7 @@ void ResidualLayerNorm::inference_kernel_wrapper( continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/sigmoid_silu_multi.cpp b/src/ops/sigmoid_silu_multi.cpp index ceaa1a7788..50a358beab 100644 --- a/src/ops/sigmoid_silu_multi.cpp +++ b/src/ops/sigmoid_silu_multi.cpp @@ -130,7 +130,7 @@ void SigmoidSiluMulti::inference_kernel_wrapper( 
continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { size_t input_tensor_size = diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu index 929d557a17..ca0168a59d 100644 --- a/src/ops/sigmoid_silu_multi.cu +++ b/src/ops/sigmoid_silu_multi.cu @@ -129,7 +129,7 @@ void SigmoidSiluMulti::inference_kernel_wrapper( continue; } int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; if (bc->requestsInfo[i].peft_bwd) { size_t input_tensor_size = diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index 4c339750c7..a4bf960a2c 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -162,8 +162,8 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) { << bc.requestsInfo[i].first_token_offset_in_batch << std::endl; os << " Number of tokens in batch: " << bc.requestsInfo[i].num_tokens_in_batch << std::endl; - os << " Max sequence length: " - << bc.requestsInfo[i].max_sequence_length << std::endl; + os << " Max sequence length: " << bc.requestsInfo[i].max_length + << std::endl; os << " BatchConfig Req ID: " << bc.requestsInfo[i].batch_config_request_id << std::endl; os << " Prompt phase: " << bc.requestsInfo[i].prompt_phase diff --git a/src/runtime/beam_search_batch_config.cc b/src/runtime/beam_search_batch_config.cc index b10f8e82ab..83e4390993 100644 --- a/src/runtime/beam_search_batch_config.cc +++ b/src/runtime/beam_search_batch_config.cc @@ -141,8 +141,8 @@ std::ostream &operator<<(std::ostream &os, BeamSearchBatchConfig const &bc) { os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id << std::endl; os << " PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; - os << " Max sequence length: " - << bc.requestsInfo[i].max_sequence_length << std::endl; + os << " Max sequence length: " << bc.requestsInfo[i].max_length + << std::endl; os << " Request completed: " << bc.request_completed[i] << std::endl; os << " Request running: " << bc.request_running[i] << std::endl; os << " Beam Search Specific: " << std::endl; diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 31a32dd3c8..44b181fcb3 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -54,7 +54,8 @@ std::ostream &operator<<(std::ostream &os, Request const &req) { os << "Request {\n"; os << " guid: " << req.guid << "\n"; os << " peft_model_id: " << req.peft_model_id << "\n"; - os << " max_sequence_length: " << req.max_sequence_length << "\n"; + os << " max_length: " << req.max_length << "\n"; + os << " max_new_tokens: " << req.max_new_tokens << "\n"; os << " initial_len: " << req.initial_len << "\n"; os << " ssm_cache_size: " << req.ssm_cache_size << "\n"; os << " llm_cache_size: " << req.llm_cache_size << "\n"; @@ -261,24 +262,45 @@ RequestManager::RequestGuid Request request; request.status = Request::PENDING; request.guid = next_available_guid++; - request.max_sequence_length = request_.max_sequence_length; + request.max_length = request_.max_length; + request.max_new_tokens = request_.max_new_tokens; + if (request.max_length != -1 && request.max_new_tokens != -1) { + 
std::cout + << "Both `max_new_tokens` (=" << request.max_new_tokens + << ") and `max_length`(=" << request.max_length + << ") seem to have been set. `max_new_tokens` will take precedence."; + } request.peft_model_id = request_.peft_model_id; request.warmup = request_.warmup; if (bos_token_id >= 0 && model_type != ModelType::FALCON) { request.tokens.push_back(bos_token_id); } if (request_.benchmarking_tokens >= 0) { - assert(request_.benchmarking_tokens < get_max_sequence_length()); + assert(request_.benchmarking_tokens < get_max_sequence_length() && + "Benchmarking tokens exceed max sequence length"); request.benchmarking_tokens = request_.benchmarking_tokens; request.tokens.insert(request.tokens.end(), request_.benchmarking_tokens, 15); // insert random number } else { std::vector tokens = this->tokenizer_->Encode(request_.prompt); + // from here on, we will only use the max_length parameter + if (request.max_new_tokens != -1) { + request.max_length = tokens.size() + request.max_new_tokens; + } + // check that max sequence length is not exceeded + // 1. prompt itself should be less than max sequence length if (tokens.size() >= get_max_sequence_length()) { - std::cout << "Warning: too many tokens in prompt, only load up to " - << get_max_sequence_length() << " tokens, but got " - << tokens.size() << ".\n"; + std::cout << "Error: prompt (" << tokens.size() + << " tokens) exceeds max sequence length of " + << get_max_sequence_length() << ".\n"; + return INVALID_GUID; + } + // 2. max_length should not exceed the max_sequence_length + if (request.max_length >= get_max_sequence_length()) { + std::cout << "Error: max_length (" << request.max_length + << ") exceeds max sequence length of " + << get_max_sequence_length() << ".\n"; return INVALID_GUID; } for (int i = 0; i < tokens.size(); i++) { @@ -341,7 +363,18 @@ RequestManager::RequestGuid request.status = Request::PENDING; request.guid = next_available_guid++; request.initial_len = 0; - request.max_sequence_length = request_.max_sequence_length; + request.max_length = request_.max_length; + request.max_new_tokens = request_.max_new_tokens; + if (request.max_length != -1) { + std::cout << "Warning: max_length is set for PEFT finetuning, but it will " + "be ignored." + << std::endl; + } + if (request.max_new_tokens != -1) { + std::cout << "Warning: max_new_tokens is set for PEFT finetuning, but " + "it will be ignored." 
+ << std::endl; + } request.peft_model_id = request_.peft_model_id; request.req_type = RequestType::REQ_FINETUNING; request.completed_training_steps = 0; @@ -352,7 +385,8 @@ RequestManager::RequestGuid // Load dataset if (request_.benchmarking_tokens >= 0) { - assert(request_.benchmarking_tokens <= get_max_sequence_length()); + assert(request_.benchmarking_tokens <= get_max_sequence_length() && + "Benchmarking tokens exceed max sequence length"); request.benchmarking_tokens = request_.benchmarking_tokens; std::vector input_tokens; std::vector output_tokens; @@ -385,9 +419,10 @@ RequestManager::RequestGuid this->tokenizer_->Encode(output_text); if (input_tokens.size() + output_tokens.size() > get_max_sequence_length()) { - std::cout << "Warning: too many tokens in sample, only load up to " - << get_max_sequence_length() << " tokens, but got " - << input_tokens.size() + output_tokens.size() << ".\n"; + std::cout << "Error: sample in training dataset is " + << input_tokens.size() + output_tokens.size() + << " tokens long, exceeding the maximum sequence length of " + << get_max_sequence_length() << " tokens.\n"; return INVALID_GUID; } else { request.dataset.push_back(std::make_pair(input_tokens, output_tokens)); @@ -515,7 +550,7 @@ bool RequestManager::check_inf_req_completion(BatchConfig const &old_bc, Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; bool request_completed = false; // printf("model_type = %d\n", this->model_type); - if (request.tokens.size() >= old_bc.requestsInfo[i].max_sequence_length) { + if (request.tokens.size() >= old_bc.requestsInfo[i].max_length) { request_completed = true; } else if (request.tokens.back() == eos_token_id) { // Encounter EOS token id @@ -698,8 +733,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].peft_model_id = old_bc.requestsInfo[i].peft_model_id; new_bc.requestsInfo[i].peft_bwd = old_bc.requestsInfo[i].peft_bwd; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].max_length = old_bc.requestsInfo[i].max_length; num_active_req++; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; if (new_bc.requestsInfo[i].first_token_depth_in_request + 1 == @@ -765,8 +799,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].num_tokens_in_batch = std::min(get_max_tokens_per_batch() - new_bc.num_tokens, (int)new_request.tokens.size()); - new_bc.requestsInfo[i].max_sequence_length = - new_request.max_sequence_length; + new_bc.requestsInfo[i].max_length = new_request.max_length; new_bc.requestsInfo[i].peft_model_id = new_request.peft_model_id; new_bc.requestsInfo[i].peft_bwd = false; new_bc.request_completed[i] = false; @@ -932,8 +965,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.num_active_infr_tokens(); new_bc.requestsInfo[inference_batch_size].num_tokens_in_batch = num_peft_tokens; - new_bc.requestsInfo[inference_batch_size].max_sequence_length = - request.max_sequence_length; + new_bc.requestsInfo[inference_batch_size].max_length = request.max_length; new_bc.requestsInfo[inference_batch_size].request_guid = request.guid; new_bc.requestsInfo[inference_batch_size].peft_model_id = request.peft_model_id; @@ -1076,10 +1108,10 @@ BeamSearchBatchConfig verified_tokens.size()); // check if the request is finished if (verified_tokens.size() + request.tokens.size() >= - request.max_sequence_length) { + request.max_length) { // Append all 
verified tokens to the request for (auto const &token_pair : verified_tokens) { - if (token_pair.second < request.max_sequence_length) { + if (token_pair.second < request.max_length) { request.tokens.push_back(token_pair.first); } } @@ -1171,14 +1203,13 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].max_length = old_bc.requestsInfo[i].max_length; new_bc.requestsInfo[i].num_tokens_in_batch = verified_tokens.size(); new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // TODO: Beam Request Info, missing from VerifyTreeBatchConfig int new_max_depth = - new_bc.requestsInfo[i].max_sequence_length - + new_bc.requestsInfo[i].max_length - new_bc.requestsInfo[i].first_token_depth_in_request - verified_tokens.size(); new_bc.beamRequestsInfo[i].current_depth = 1; @@ -1254,8 +1285,7 @@ BeamSearchBatchConfig request.ssm_cache_size; new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].max_length = old_bc.requestsInfo[i].max_length; new_bc.requestsInfo[i].num_tokens_in_batch = 0; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; @@ -1307,8 +1337,7 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].num_tokens_in_batch = std::min(get_max_tokens_per_batch() - new_bc.num_tokens, (int)new_request.tokens.size()); - new_bc.requestsInfo[i].max_sequence_length = - new_request.max_sequence_length; + new_bc.requestsInfo[i].max_length = new_request.max_length; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // add profile_info for the new request @@ -1484,8 +1513,7 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens; new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].max_length = old_bc.requestsInfo[i].max_length; profiling_requests[request.guid].ssm_decoding_steps += 1; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // update the beam search metadata @@ -1613,8 +1641,7 @@ BeamSearchBatchConfig new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens; new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_bc.requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].max_length = old_bc.requestsInfo[i].max_length; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // update the beam search metadata @@ -1816,8 +1843,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_batches.at(0).requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_batches.at(0).requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].max_length = + old_batches.at(0).requestsInfo[i].max_length; new_bc.requestsInfo[num_active_req].batch_config_request_id = 
i; // copy bitmask to verify batchconfig @@ -1958,8 +1985,8 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_batches.at(0).requestsInfo[i].request_guid; - new_bc.requestsInfo[i].max_sequence_length = - old_batches.at(0).requestsInfo[i].max_sequence_length; + new_bc.requestsInfo[i].max_length = + old_batches.at(0).requestsInfo[i].max_length; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; new_bc.request_completed[i] = false; diff --git a/src/runtime/tree_verify_batch_config.cc b/src/runtime/tree_verify_batch_config.cc index a71b1070b2..f8ac6089fe 100644 --- a/src/runtime/tree_verify_batch_config.cc +++ b/src/runtime/tree_verify_batch_config.cc @@ -58,8 +58,8 @@ std::ostream &operator<<(std::ostream &os, TreeVerifyBatchConfig const &bc) { os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id << std::endl; os << " PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; - os << " Max sequence length: " - << bc.requestsInfo[i].max_sequence_length << std::endl; + os << " Max sequence length: " << bc.requestsInfo[i].max_length + << std::endl; os << " Request completed: " << bc.request_completed[i] << std::endl; os << " Request running: " << bc.request_running[i] << std::endl; } From dbd4cf170a6cf47d7d471db50f60d11db2fcb58f Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 10 Oct 2024 03:52:52 +0000 Subject: [PATCH 31/44] fix --- src/ops/inc_multihead_self_attention.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 2802dd41b6..454926bcdb 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -126,7 +126,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; // Copy query to m->query_activation_buffer if we need to compute // PEFT backward if (bc->requestsInfo[i].peft_bwd) { From 2bfa56cea6fe63837b4a1e3b9ee9737236fe73a8 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 18 Oct 2024 22:56:03 -0400 Subject: [PATCH 32/44] Update LLAMA tokenizer (#1524) * fix tokenizer conversion * update * update * update * fix * fix * lint * simplify api * fix * fix * fix * update to 12.1 (#1512) * fix deadlock? 
* remove barrier where not strictly needed --------- Co-authored-by: zhihao --- .github/workflows/gpu-ci.yml | 8 +- cmake/nccl.cmake | 11 +- docker/flexflow-environment/Dockerfile | 24 ++-- .../ops/kernels/lora_linear_kernels.h | 11 +- include/flexflow/optimizer.h | 11 +- include/flexflow/request_manager.h | 3 +- inference/peft/peft.cc | 2 +- inference/python/ff_peft.py | 3 +- inference/python/incr_decoding.py | 17 ++- inference/python/spec_infer.py | 24 +++- python/flexflow/core/flexflow_cffi.py | 123 +++++++++++++----- python/flexflow/serve/serve.py | 100 +++++++++++--- src/ops/fused.cc | 2 - src/ops/fused.cpp | 11 +- src/ops/fused.cu | 11 +- src/ops/inc_multihead_self_attention.cpp | 2 +- src/ops/kernels/lora_linear_kernels.cu | 20 ++- src/ops/lora_linear.cc | 3 +- src/ops/spec_inc_multihead_self_attention.cc | 6 +- src/ops/tree_inc_multihead_self_attention.cc | 6 +- src/parallel_ops/allreduce.cc | 4 + src/parallel_ops/parallel_identity.cc | 5 +- src/runtime/model.cc | 30 +++-- src/runtime/optimizer.cc | 5 +- src/runtime/optimizer_kernel.cpp | 12 +- src/runtime/optimizer_kernel.cu | 12 +- src/runtime/request_manager.cc | 66 ++++++---- .../python_test_configs/generate_configs.py | 1 + 28 files changed, 378 insertions(+), 155 deletions(-) diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 6ca50027d1..9ee4693f91 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -56,7 +56,7 @@ jobs: CONDA: "3" needs: gpu-ci-concierge container: - image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest + image: ghcr.io/flexflow/flexflow-environment-cuda-12.1:latest options: --gpus all --shm-size=8192m steps: - name: Keep alive @@ -75,7 +75,7 @@ jobs: CONDA: "3" needs: gpu-ci-concierge container: - image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest + image: ghcr.io/flexflow/flexflow-environment-cuda-12.1:latest options: --gpus all --shm-size=8192m steps: - name: Install updated git version @@ -151,7 +151,7 @@ jobs: HUGGINGFACE_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN }} needs: gpu-ci-concierge container: - image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest + image: ghcr.io/flexflow/flexflow-environment-cuda-12.1:latest options: --gpus all --shm-size=8192m steps: - name: Install updated git version @@ -239,7 +239,7 @@ jobs: CONDA: "3" needs: inference-tests container: - image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest + image: ghcr.io/flexflow/flexflow-environment-cuda-12.1:latest options: --gpus all --shm-size=8192m steps: - name: Install updated git version diff --git a/cmake/nccl.cmake b/cmake/nccl.cmake index 82cf3b4122..abb4864588 100644 --- a/cmake/nccl.cmake +++ b/cmake/nccl.cmake @@ -36,11 +36,12 @@ if(NCCL_LIBRARY AND NCCL_INCLUDE_DIR) string(REGEX MATCH "([0-9]+)" NCCL_MAJOR ${NCCL_VERSION_DEFINES}) string(REGEX MATCH "([0-9]+)" NCCL_MINOR ${NCCL_VERSION_DEFINES2}) set(NCCL_VERSION "${NCCL_MAJOR}.${NCCL_MINOR}") - if(NCCL_VERSION VERSION_LESS 2.23) - set(NCCL_OLD TRUE) - else() - set(NCCL_OLD FALSE) - endif() + set(NCCL_OLD FALSE) + # if(NCCL_VERSION VERSION_LESS 2.23) + # set(NCCL_OLD TRUE) + # else() + # set(NCCL_OLD FALSE) + # endif() message(STATUS "Found NCCL version: ${NCCL_VERSION}") else() message(WARNING "NCCL header not found, unable to determine version") diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index ee13a07375..7028fc4b2e 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -55,18 +55,18 @@ ENV 
CUDA_DIR /usr/local/cuda ARG FF_GPU_BACKEND "cuda" # Update NCCL if FF_GPU_BACKEND is cuda -RUN /bin/bash -c 'if [ "$FF_GPU_BACKEND" = "cuda" ]; then \ - echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Updating NCCL"; \ - ubuntu_version=$(lsb_release -rs); \ - ubuntu_version=${ubuntu_version//./}; \ - wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.0-1_all.deb"; \ - DEBIAN_FRONTEND=noninteractive dpkg -i cuda-keyring_1.0-1_all.deb; \ - DEBIAN_FRONTEND=noninteractive apt-get update -y --allow-change-held-packages; \ - rm -f cuda-keyring_1.0-1_all.deb; \ - DEBIAN_FRONTEND=noninteractive apt install -y --allow-change-held-packages libnccl2 libnccl-dev; \ - else \ - echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Skipping updating NCCL"; \ - fi' +# RUN /bin/bash -c 'if [ "$FF_GPU_BACKEND" = "cuda" ]; then \ +# echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Updating NCCL"; \ +# ubuntu_version=$(lsb_release -rs); \ +# ubuntu_version=${ubuntu_version//./}; \ +# wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.0-1_all.deb"; \ +# DEBIAN_FRONTEND=noninteractive dpkg -i cuda-keyring_1.0-1_all.deb; \ +# DEBIAN_FRONTEND=noninteractive apt-get update -y --allow-change-held-packages; \ +# rm -f cuda-keyring_1.0-1_all.deb; \ +# DEBIAN_FRONTEND=noninteractive apt install -y --allow-change-held-packages libnccl2 libnccl-dev; \ +# else \ +# echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Skipping updating NCCL"; \ +# fi' # Install hip dependencies if FF_GPU_BACKEND is hip_cuda or hip_rocm # Note that amd's docs say to also install the `hip-runtime-nvidia` package. This diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h index 5360b5f8ea..eee9875d30 100644 --- a/include/flexflow/ops/kernels/lora_linear_kernels.h +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -8,7 +8,8 @@ #include "flexflow/ops/lora_linear.h" namespace FlexFlow { - +using Legion::Context; +using Legion::Runtime; struct LoraLinearWeight { // weights void *w0_ptr, *w1_ptr; @@ -46,7 +47,9 @@ void inference_kernel_wrapper(LoraLinearMeta *m, BatchConfig const *bc, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); -void peft_bwd_kernel_wrapper(LoraLinearMeta *m, +void peft_bwd_kernel_wrapper(Context ctx, + Runtime *runtime, + LoraLinearMeta *m, BatchConfig const *bc, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output_grad); @@ -63,7 +66,9 @@ void inference_kernel(LoraLinearMeta *m, int out_dim, ffStream_t stream); template -void peft_bwd_kernel(LoraLinearMeta *m, +void peft_bwd_kernel(Context ctx, + Runtime *runtime, + LoraLinearMeta *m, BatchConfig const *bc, DT *input_grad_ptr, DT const *output_grad_ptr, diff --git a/include/flexflow/optimizer.h b/include/flexflow/optimizer.h index bab7e6e4ed..4917df73c3 100644 --- a/include/flexflow/optimizer.h +++ b/include/flexflow/optimizer.h @@ -20,7 +20,8 @@ #include "legion.h" namespace FlexFlow { - +using Legion::Context; +using Legion::Runtime; class FFModel; class OpMeta; @@ -60,7 +61,9 @@ class SGDOptimizer : public Optimizer { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); - static void nccl_update_task_gpu(SGDOptimizer const *op, + static void nccl_update_task_gpu(Context ctx, + Runtime *runtime, + SGDOptimizer const *op, OpMeta const *meta, float const *w_grad_ptr, size_t size, @@ -103,7 +106,9 @@ class AdamOptimizer : public Optimizer { std::vector 
const ®ions, Legion::Context ctx, Legion::Runtime *runtime); - static void nccl_update_task_gpu(AdamOptimizer const *op, + static void nccl_update_task_gpu(Context ctx, + Runtime *runtime, + AdamOptimizer const *op, OpMeta const *meta, float const *w_grad_ptr, size_t size, diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 36a56012fc..94bfc74244 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -68,7 +68,7 @@ struct Request { BatchConfig::RequestGuid guid; PEFTModelID peft_model_id = PEFTModelID::NO_ID; int max_length = -1; - int max_new_tokens = 128; + int max_new_tokens = -1; int initial_len; int ssm_cache_size = 0; int llm_cache_size = 0; @@ -302,6 +302,7 @@ class RequestManager { ModelType model_type; int bos_token_id; int eos_token_id; + bool old_llama_tokenizer = false; std::string output_filepath; std::queue pending_infr_request_queue; std::queue pending_peft_request_queue; diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index ee5bd1b460..14fc653eba 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -340,7 +340,7 @@ void FlexFlow::top_level_task(Task const *task, printf("Inference prompt[%d]: %s\n", total_num_requests, text.c_str()); Request inference_req; inference_req.prompt = text; - inference_req.max_length = 128; + inference_req.max_new_tokens = 128; inference_req.peft_model_id = (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; requests.push_back(inference_req); diff --git a/inference/python/ff_peft.py b/inference/python/ff_peft.py index a7d38a66b6..13da7aee20 100644 --- a/inference/python/ff_peft.py +++ b/inference/python/ff_peft.py @@ -162,7 +162,7 @@ def main(): ff.Request( ff.RequestType.REQ_INFERENCE, prompt=prompt, - max_sequence_length=128, + max_new_tokens=128, peft_model_id=llm.get_ff_peft_id(lora_inference_config), ) for prompt in prompts @@ -172,7 +172,6 @@ def main(): if len(configs.finetuning_dataset) > 0: finetuning_request = ff.Request( ff.RequestType.REQ_FINETUNING, - max_sequence_length=128, peft_model_id=llm.get_ff_peft_id(lora_finetuning_config), dataset_filepath=configs.finetuning_dataset, max_training_steps=2, diff --git a/inference/python/incr_decoding.py b/inference/python/incr_decoding.py index 1df5a05a8f..232ef1699c 100644 --- a/inference/python/incr_decoding.py +++ b/inference/python/incr_decoding.py @@ -51,12 +51,12 @@ def get_configs(): "tensor_parallelism_degree": 1, "pipeline_parallelism_degree": 2, "offload": False, - "offload_reserve_space_size": 8 * 1024, # 8GB + "offload_reserve_space_size": 8 * 1024, # 8GB "use_4bit_quantization": False, "use_8bit_quantization": False, "enable_peft": False, - "peft_activation_reserve_space_size": 1024, # 1GB - "peft_weight_reserve_space_size": 1024, # 1GB + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "benchmarking": False, "inference_debugging": False, @@ -71,6 +71,7 @@ def get_configs(): "full_precision": False, "prompt": "", "output_file": "", + "max_length": 128, } # Merge dictionaries ff_init_configs.update(llm_configs) @@ -106,9 +107,9 @@ def main(): max_seq_length=256, max_tokens_per_batch=64, ) - + llm.start_server() - + if len(configs.prompt) > 0: prompts = [s for s in json.load(open(configs.prompt))] if "max_length" not in configs_dict: @@ -119,8 +120,10 @@ def main(): if "max_length" not in configs_dict: result = llm.generate("Three tips for staying healthy are: ") else: - result = llm.generate("Three 
tips for staying healthy are: ", max_length=configs.max_length) - + result = llm.generate( + "Three tips for staying healthy are: ", max_length=configs.max_length + ) + llm.stop_server() diff --git a/inference/python/spec_infer.py b/inference/python/spec_infer.py index 39529abda3..7ae752cffc 100644 --- a/inference/python/spec_infer.py +++ b/inference/python/spec_infer.py @@ -51,12 +51,12 @@ def get_configs(): "tensor_parallelism_degree": 1, "pipeline_parallelism_degree": 2, "offload": False, - "offload_reserve_space_size": 8 * 1024, # 8GB + "offload_reserve_space_size": 8 * 1024, # 8GB "use_4bit_quantization": False, "use_8bit_quantization": False, "enable_peft": False, - "peft_activation_reserve_space_size": 1024, # 1GB - "peft_weight_reserve_space_size": 1024, # 1GB + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "benchmarking": False, "inference_debugging": False, @@ -81,6 +81,7 @@ def get_configs(): ], "prompt": "", "output_file": "", + "max_length": 128, } # Merge dictionaries ff_init_configs.update(llm_configs) @@ -144,17 +145,26 @@ def main(): max_tokens_per_batch=64, ssms=ssms, ) - + llm.start_server() if len(configs.prompt) > 0: prompts = [s for s in json.load(open(configs.prompt))] - results = llm.generate(prompts) + if "max_length" not in configs_dict: + results = llm.generate(prompts) + else: + results = llm.generate(prompts, max_length=configs.max_length) else: - result = llm.generate("Three tips for staying healthy are: ") - + if "max_length" not in configs_dict: + result = llm.generate("Three tips for staying healthy are: ") + else: + result = llm.generate( + "Three tips for staying healthy are: ", max_length=configs.max_length + ) + llm.stop_server() + if __name__ == "__main__": print("flexflow inference example (speculative inference)") main() diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 9b35b249d9..e2240f0b4f 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -1795,7 +1795,7 @@ def __init__( raise ValueError( "Target modules can only be specified when trainable=True" ) - + # Check rank, lora_alpha, lora_dropout values if rank is not None or lora_alpha is not None or lora_dropout is not None: if not trainable or not init_lora_weights: @@ -1805,7 +1805,7 @@ def __init__( rank = rank if rank is not None else 8 lora_alpha = lora_alpha if lora_alpha is not None else 8.0 lora_dropout = lora_dropout if lora_dropout is not None else 0.0 - + # If passed, check if the values of rank, lora_alpha, and lora_dropout are valid if rank < 1 or type(rank) != int: raise ValueError("Rank must be >= 1 and an integer") @@ -1813,7 +1813,7 @@ def __init__( raise ValueError("Lora_alpha must be > 0") if lora_dropout < 0 or lora_dropout > 1: raise ValueError("Lora_dropout must be in the interval [0, 1]") - + self.ff_initialized = False self._cache_folder = cache_folder self._peft_model_id = peft_model_id @@ -2051,13 +2051,15 @@ def no_id_handle(): # Request # ----------------------------------------------------------------------- + @dataclass class Request: """A class to record the metadata of an inference or finetuning request.""" + req_type: RequestType prompt: Optional[str] = None max_length: int = -1 - max_new_tokens: int = 128 + max_new_tokens: int = -1 peft_model_id: Optional[PEFTModelID] = None dataset_filepath: Optional[str] = None max_training_steps: int = 1 @@ -4650,26 +4652,65 @@ def get_output_tensor(self, ffmodel, 
data_type): assert ret_val == True return np_array - def generate_inf_only(self, prompt_list: List[str], max_length: int = -1, max_new_tokens: int = 128): + def _estimate_max_num_tokens( + max_length: int, max_new_tokens: int, prompt: Optional[str] + ): + if prompt is None: + assert max_new_tokens == -1 + return ( + math.ceil(max_new_tokens + len(prompt.split()) * 1.5) + if max_new_tokens != -1 + else max_length + ) + + def _estimate_max_num_chars( + max_length: int, max_new_tokens: int, prompt: Optional[str] + ): + return ( + 5 * FFModel._estimate_max_num_tokens(max_length, max_new_tokens, prompt) + + 100 + ) + + # deprecated + def generate_inf_only( + self, + prompt_list: List[str], + max_length: int, + max_new_tokens: int, + ): if max_length != -1 and max_new_tokens != -1: - warnings.warn(f"Both `max_new_tokens` (={self.max_new_tokens}) and `max_length`(={self.max_length}) seem to have been set. `max_new_tokens` will take precedence.") + raise ValueError( + f"Both `max_new_tokens` (={max_new_tokens}) and `max_length`(={max_length}) seem to have been set." + ) + if max_length == -1 and max_new_tokens == -1: + raise ValueError( + f"Both `max_new_tokens` (={max_new_tokens}) and `max_length`(={max_length}) were left unset." + ) assert isinstance(prompt_list, list) c_input_texts = [get_c_name(prompt) for prompt in prompt_list] - estimated_max_tokens = math.ceil(max_new_tokens + max([len(prompt.split()) for prompt in prompt_list])*1.5) if max_new_tokens != -1 else max_length - max_num_chars = 5 * (estimated_max_tokens + 100) - c_output_texts = [ffi.new("char[]", max_num_chars) for prompt in prompt_list] + c_output_texts = [ + ffi.new( + "char[]", + FFModel._estimate_max_num_chars(max_length, max_new_tokens, prompt), + ) + for prompt in prompt_list + ] c_output_length_and_tokens = [ - ffi.new("int[]", estimated_max_tokens + 100) for prompt in prompt_list + ffi.new( + "int[]", + FFModel._estimate_max_num_tokens(max_length, max_new_tokens, prompt) + + 100, + ) + for prompt in prompt_list ] c_request_types = [ - enum_to_int(RequestType, RequestType.REQ_INFERENCE) - for prompt in prompt_list + enum_to_int(RequestType, RequestType.REQ_INFERENCE) for _ in prompt_list ] - max_lengths = [max_length for prompt in prompt_list] - max_new_tokens_ = [max_new_tokens for prompt in prompt_list] - peft_model_ids = [PEFTModelID.no_id_handle() for prompt in prompt_list] - dataset_filepaths = [ffi.NULL for prompt in prompt_list] - training_steps = [0 for prompt in prompt_list] + max_lengths = [max_length for _ in prompt_list] + max_new_tokens_ = [max_new_tokens for _ in prompt_list] + peft_model_ids = [PEFTModelID.no_id_handle() for _ in prompt_list] + dataset_filepaths = [ffi.NULL for _ in prompt_list] + training_steps = [0 for _ in prompt_list] num_finetuning_losses = ffi.new("int *") c_finetuning_losses = ffi.new("float[]", 0) ffc().flexflow_model_generate( @@ -4698,34 +4739,55 @@ def generate_inf_only(self, prompt_list: List[str], max_length: int = -1, max_ne def generate(self, requests_list: List[Request]): assert isinstance(requests_list, list) + for request in requests_list: + assert isinstance(request, Request) + if request.max_length != -1 and request.max_new_tokens != -1: + raise ValueError( + f"Both `max_new_tokens` (={request.max_new_tokens}) and `max_length`(={request.max_length}) seem to have been set." + ) + if request.max_length == -1 and request.max_new_tokens == -1: + raise ValueError( + f"Both `max_new_tokens` (={request.max_new_tokens}) and `max_length`(={request.max_length}) were left unset." 
+ ) + if ( + request.req_type == RequestType.REQ_FINETUNING + and request.max_new_tokens != -1 + ): + raise ValueError( + f"Finetuning requests should not have `max_new_tokens` set." + ) c_input_texts = [ get_c_name(request.prompt) for request in requests_list ] # entry will be None for finetuning requests c_output_texts = [ ( - ffi.new("char[]", 5 * (request.max_sequence_length + 100)) + ffi.new( + "char[]", + FFModel._estimate_max_num_chars( + request.max_length, request.max_new_tokens, request.prompt + ), + ) if request.req_type == RequestType.REQ_INFERENCE else ffi.NULL ) for request in requests_list ] c_output_length_and_tokens = [ - ffi.new("int[]", request.max_sequence_length + 100) + ffi.new( + "int[]", + FFModel._estimate_max_num_tokens( + request.max_length, request.max_new_tokens, request.prompt + ) + + 100, + ) for request in requests_list ] c_request_types = [ enum_to_int(RequestType, request.req_type) for request in requests_list ] - max_lengths = [ - request.max_length for request in requests_list - ] - max_new_tokens_ = [ - request.max_new_tokens for request in requests_list - ] - for i in range(len(requests_list)): - if max_lengths[i] != -1 and max_new_tokens_[i] != -1: - warnings.warn(f"Both `max_new_tokens` (={max_new_tokens_[i]}) and `max_length`(={max_lengths[i]}) seem to have been set. `max_new_tokens` will take precedence.") - + max_lengths = [request.max_length for request in requests_list] + max_new_tokens_ = [request.max_new_tokens for request in requests_list] + peft_model_ids = [ ( request.peft_model_id @@ -4742,7 +4804,7 @@ def generate(self, requests_list: List[Request]): # c_finetuning_losses = ffi.new("float**") # TODO: set this value automatically c_finetuning_losses = ffi.new("float[]", 10000) - + ffc().flexflow_model_generate( self.handle, len(requests_list), @@ -4774,7 +4836,6 @@ def generate(self, requests_list: List[Request]): finetuning_losses=finetuning_losses, ) ) - return results def set_position_offset(self, offset): ffc().flexflow_model_set_position_offset(self.handle, offset) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index e3b6b47466..c8540a6ed3 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -27,15 +27,18 @@ MPTConfig, ) from flexflow.core import * -from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer +from transformers import AutoConfig, AutoModelForCausalLM from peft import PeftModel, PeftConfig, LoraConfig from huggingface_hub import HfApi import torch, shutil, hashlib, json, gc from typing import Union, List +from huggingface_hub import snapshot_download class _SupportedModels: - def __init__(self,): + def __init__( + self, + ): self.supported_models = { "LlamaForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig), "LLaMAForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig), @@ -292,8 +295,8 @@ def download_peft_weights(): weights_path = get_weights_path(peft_model_id) refresh_cache_if_needed(peft_model_id) - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - peft_model_id, weights_path + ff_revision, ff_revision_file, latest_revision = ( + self.__get_revision_hashes(peft_model_id, weights_path) ) if ff_revision != latest_revision: @@ -349,10 +352,25 @@ def download_hf_tokenizer_if_needed(self): print( f"'{self.model_name}' tokenizer needs updating! Downloading tokenizer now..." 
) - # Download tokenizer from HuggingFace, or load it from the local folder - hf_tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - # Save tokenizer - hf_tokenizer.save_pretrained(self.tokenizer_path) + # Load/download the tokenizer files + target_tokenizer_files = [ + "tokenizer.json", + "tokenizer_config.json", + "special_tokens_map.json", + "vocab.json", + "merges.txt", + ] + if os.path.exists(self.model_name): + hf_tokenizer_path = self.model_name + else: + hf_tokenizer_path = snapshot_download( + repo_id=self.model_name, allow_patterns=target_tokenizer_files + ) + for file in target_tokenizer_files: + src_path = os.path.join(hf_tokenizer_path, file) + dst_path = os.path.join(self.tokenizer_path, file) + if os.path.exists(src_path): + shutil.copy(src_path, dst_path) print("Done updating HF tokenizer.") # Save new revision hash to file with open(ff_revision_file, "w+") as f: @@ -417,6 +435,8 @@ def compile( model_specific_pipeline_parallelism_degree ) + self.max_seq_length = max_seq_length + # Create request manager and set serving configuration self.rm = RequestManager() self.rm.set_max_requests_per_batch(max_requests_per_batch) @@ -495,11 +515,44 @@ def compile( atexit.register(self.rm.stop_server) + def _generate(self, requests: List[Request]): + if len(requests) == 0: + return [] + for req in requests: + if req.req_type == RequestType.REQ_INFERENCE: + # check max_length and max_new_tokens parameters + if req.max_length == -1 and req.max_new_tokens == -1: + req.max_length = self.max_seq_length -1 + elif req.max_length != -1 and req.max_new_tokens != -1: + warnings.warn( + f"Both `max_new_tokens` (={req.max_new_tokens}) and `max_length`(={req.max_length}) seem to have been set. `max_new_tokens` will take precedence." + ) + req.max_length = -1 + if ( + req.max_length >= self.max_seq_length + or req.max_new_tokens >= self.max_seq_length + ): + raise ValueError( + f"max_length ({req.max_length}) or max_new_tokens ({req.max_new_tokens}) exceeds the maximum sequence length ({self.max_seq_length})" + ) + else: + if req.max_new_tokens != -1: + raise ValueError( + f"max_new_tokens ({req.max_new_tokens}) is not allowed for finetuning requests." 
+ ) + if req.max_length == -1: + req.max_length = self.max_seq_length -1 + if req.max_length >= self.max_seq_length: + raise ValueError( + f"max_length ({req.max_length}) exceeds the maximum sequence length ({self.max_seq_length})" + ) + return self.model.ffmodel.generate(requests) + def generate( self, requests_or_prompts: Union[str, List[str], Request, List[Request]], max_length: int = -1, - max_new_tokens: int = 128, + max_new_tokens: int = -1, ): """Generate tokens based on the input prompt(s) @@ -514,24 +567,35 @@ def generate( """ if type(requests_or_prompts) == str: if len(requests_or_prompts) == 0: - return None - return self.model.ffmodel.generate_inf_only( - [requests_or_prompts], max_length, max_new_tokens + return [] + request = Request( + req_type=RequestType.REQ_INFERENCE, + prompt=requests_or_prompts, + max_length=max_length, + max_new_tokens=max_new_tokens, ) + return self._generate([request]) elif type(requests_or_prompts) == Request: - return self.model.ffmodel.generate(requests_or_prompts) + return self._generate([requests_or_prompts]) elif type(requests_or_prompts) == list: if len(requests_or_prompts) == 0: return [] if type(requests_or_prompts[0]) == str: - return self.model.ffmodel.generate_inf_only( - requests_or_prompts, max_length, max_new_tokens - ) + requests = [ + Request( + req_type=RequestType.REQ_INFERENCE, + prompt=req, + max_length=max_length, + max_new_tokens=max_new_tokens, + ) + for req in requests_or_prompts + ] + return self._generate(requests) else: print(requests_or_prompts) - return self.model.ffmodel.generate(requests_or_prompts) + return self._generate(requests_or_prompts) else: - assert False, "Please pass a non-empty string or list of strings" + assert False, "Please pass a string, list of strings, Request, or list of Requests" def start_server(self): self.rm.start_server(self.model.ffmodel) diff --git a/src/ops/fused.cc b/src/ops/fused.cc index 720d678a4a..984691fa66 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -476,7 +476,6 @@ void FusedOp::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); - launcher.concurrent = true; FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); switch (domain.get_dim()) { @@ -571,7 +570,6 @@ void FusedOp::init_inference(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); - launcher.concurrent = true; FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); switch (domain.get_dim()) { diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index 2cede662f3..dfb524d206 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -612,8 +612,10 @@ __host__ void assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + runtime->concurrent_task_barrier(ctx); Kernels::AllReduce::inference_kernel_wrapper( m, bc, my_input_accessor[0], my_output_accessor[0]); + runtime->concurrent_task_barrier(ctx); break; } case OP_PARALLEL_IDENTITY: { @@ -870,7 +872,12 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, // since we ``inplace'' the output for LoRA assert(my_input_grad_accessor[1].ptr == my_output_grad_accessor[0].ptr); Kernels::LoraLinear::peft_bwd_kernel_wrapper( - m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + ctx, + runtime, + m, + bc, + my_input_grad_accessor[0], + my_output_grad_accessor[0]); break; } case OP_BATCHMATMUL: { @@ -1129,8 +1136,10 @@ __host__ void FusedOp::peft_bwd_task(Task const 
*task, assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); ParallelIdentityMeta const *m = (ParallelIdentityMeta *)metas->meta[op]; + runtime->concurrent_task_barrier(ctx); Kernels::ParallelIdentity::peft_bwd_kernel_wrapper( m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + runtime->concurrent_task_barrier(ctx); break; } default: { diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 5aed2cd69a..62845c0f8e 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -623,8 +623,10 @@ __host__ void assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + runtime->concurrent_task_barrier(ctx); Kernels::AllReduce::inference_kernel_wrapper( m, bc, my_input_accessor[0], my_output_accessor[0]); + runtime->concurrent_task_barrier(ctx); break; } case OP_PARALLEL_IDENTITY: { @@ -888,7 +890,12 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, // since we ``inplace'' the output for LoRA assert(my_input_grad_accessor[1].ptr == my_output_grad_accessor[0].ptr); Kernels::LoraLinear::peft_bwd_kernel_wrapper( - m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + ctx, + runtime, + m, + bc, + my_input_grad_accessor[0], + my_output_grad_accessor[0]); break; } case OP_BATCHMATMUL: { @@ -1149,8 +1156,10 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); ParallelIdentityMeta const *m = (ParallelIdentityMeta *)metas->meta[op]; + runtime->concurrent_task_barrier(ctx); Kernels::ParallelIdentity::peft_bwd_kernel_wrapper( m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + runtime->concurrent_task_barrier(ctx); break; } default: { diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index a4604a11a2..8818cd9673 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -147,7 +147,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int max_peft_tokens = bc->requestsInfo[i].max_length; // Copy query to m->query_activation_buffer if we need to compute // PEFT backward if (bc->requestsInfo[i].peft_bwd) { diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 93e5820f9c..638cee8cae 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -96,7 +96,9 @@ void inference_kernel_wrapper(LoraLinearMeta *m, } } -void peft_bwd_kernel_wrapper(LoraLinearMeta *m, +void peft_bwd_kernel_wrapper(Context ctx, + Runtime *runtime, + LoraLinearMeta *m, BatchConfig const *bc, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output_grad) { @@ -111,7 +113,9 @@ void peft_bwd_kernel_wrapper(LoraLinearMeta *m, int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; if (m->input_type[0] == DT_FLOAT) { - Internal::peft_bwd_kernel(m, + Internal::peft_bwd_kernel(ctx, + runtime, + m, bc, input_grad.get_float_ptr(), output_grad.get_float_ptr(), @@ -119,7 +123,9 @@ void peft_bwd_kernel_wrapper(LoraLinearMeta *m, out_dim, stream); } else if (m->input_type[0] == DT_HALF) { 
- Internal::peft_bwd_kernel(m, + Internal::peft_bwd_kernel(ctx, + runtime, + m, bc, input_grad.get_half_ptr(), output_grad.get_half_ptr(), @@ -361,7 +367,9 @@ __global__ void sgd_update(size_t count, } template -void peft_bwd_kernel(LoraLinearMeta *m, +void peft_bwd_kernel(Context ctx, + Runtime *runtime, + LoraLinearMeta *m, BatchConfig const *bc, DT *input_grad_ptr, DT const *output_grad_ptr, @@ -543,13 +551,15 @@ void peft_bwd_kernel(LoraLinearMeta *m, // and sum first #ifdef FF_USE_NCCL ncclDataType_t nccl_data_type = ff_to_nccl_datatype(m->output_type[0]); - checkCUDA(ncclAllReduce(static_cast
(weight.w1_grad_ptr), + runtime->concurrent_task_barrier(ctx); + checkNCCL(ncclAllReduce(static_cast
(weight.w1_grad_ptr), static_cast
(weight.w1_grad_ptr), w1_num_elements, nccl_data_type, ncclSum, m->handle.ncclComm, stream)); + runtime->concurrent_task_barrier(ctx); #else assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); #endif diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 513147f3b7..3749cce994 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -296,7 +296,6 @@ void LoraLinear::init_inference( false /*must*/, 0 /*mapper_id*/, machine_view_hash); - launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -1066,7 +1065,7 @@ void LoraLinear::peft_bwd_task(Task const *task, int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; // int num_infr_tokens = bc->num_active_infr_tokens(); // int num_peft_tokens = bc->num_active_peft_tokens(); - peft_bwd_kernel_wrapper(m, bc, input_grad, output_grad); + peft_bwd_kernel_wrapper(ctx, runtime, m, bc, input_grad, output_grad); save_peft_weights_if_needed(m, bc, in_dim, out_dim, shard_id); diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc index aa74ecc6f5..6b2a4be507 100644 --- a/src/ops/spec_inc_multihead_self_attention.cc +++ b/src/ops/spec_inc_multihead_self_attention.cc @@ -170,7 +170,7 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer( Layer const *layer, std::vector const &inputs) { - std::cout << "spec create operator: " << layer->name << "\n"; + // std::cout << "spec create operator: " << layer->name << "\n"; long long value; layer->get_int_property("embed_dim", value); int embed_dim = value; @@ -182,10 +182,10 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer( int kdim = value; layer->get_int_property("vdim", value); int vdim = value; - float dropout; - layer->get_float_property("dropout", dropout); layer->get_int_property("add_zero_attn", value); bool add_zero_attn = (bool)value; + float dropout; + layer->get_float_property("dropout", dropout); RotaryEmbeddingMeta rotary_embedding_meta; layer->get_int_property("apply_rotary_embedding", value); rotary_embedding_meta.apply_rotary_embedding = (bool)value; diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index ae0795ac1e..ac0011d9eb 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -163,6 +163,7 @@ Tensor FFModel::inc_multiquery_self_attention_verify( rotary_embedding_meta.original_max_position_embeddings); li->add_int_property("scaling_query", scaling_query); li->add_float_property("scaling_factor", scaling_factor); + li->add_int_property("qk_prod_scaling", qk_prod_scaling); li->add_int_property("position_bias", position_bias); li->add_int_property("quantization_type", quantization_type); li->add_int_property("offload", offload); @@ -187,10 +188,10 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer( int kdim = value; layer->get_int_property("vdim", value); int vdim = value; - float dropout; - layer->get_float_property("dropout", dropout); layer->get_int_property("add_zero_attn", value); bool add_zero_attn = (bool)value; + float dropout; + layer->get_float_property("dropout", dropout); RotaryEmbeddingMeta rotary_embedding_meta; layer->get_int_property("apply_rotary_embedding", value); rotary_embedding_meta.apply_rotary_embedding = (bool)value; @@ -203,6 +204,7 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer( 
rotary_embedding_meta.high_freq_factor); layer->get_int_property("original_max_position_embeddings", value); rotary_embedding_meta.original_max_position_embeddings = (int)value; + layer->get_int_property("scaling_query", value); bool scaling_query = (bool)value; float scaling_factor; layer->get_float_property("scaling_factor", scaling_factor); diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc index a4443c4066..6611a6bb1f 100644 --- a/src/parallel_ops/allreduce.cc +++ b/src/parallel_ops/allreduce.cc @@ -197,7 +197,9 @@ void AllReduce::forward_task(Task const *task, m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); assert(input.data_type == output.data_type); + // runtime->concurrent_task_barrier(ctx); forward_kernel_wrapper(m, input, output); + // runtime->concurrent_task_barrier(ctx); } void AllReduce::backward(FFModel const &ff) { @@ -347,7 +349,9 @@ void AllReduce::inference_task(Task const *task, m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); assert(input.data_type == output.data_type); + // runtime->concurrent_task_barrier(ctx); inference_kernel_wrapper(m, bc, input, output); + // runtime->concurrent_task_barrier(ctx); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; diff --git a/src/parallel_ops/parallel_identity.cc b/src/parallel_ops/parallel_identity.cc index 7d68036709..2f76897712 100644 --- a/src/parallel_ops/parallel_identity.cc +++ b/src/parallel_ops/parallel_identity.cc @@ -245,7 +245,9 @@ void ParallelIdentity::backward_task(Task const *task, m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); assert(input_grad.data_type == output_grad.data_type); + // runtime->concurrent_task_barrier(ctx); backward_kernel_wrapper(m, input_grad, output_grad); + // runtime->concurrent_task_barrier(ctx); } void ParallelIdentity::init_inference( @@ -270,7 +272,6 @@ void ParallelIdentity::init_inference( false /*must*/, 0 /*mapper_id*/, machine_view_hash); - launcher.concurrent = true; launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, READ_ONLY, @@ -422,7 +423,9 @@ void ParallelIdentity::peft_bwd_task(Task const *task, m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); assert(input_grad.data_type == output_grad.data_type); + // runtime->concurrent_task_barrier(ctx); peft_bwd_kernel_wrapper(m, bc, input_grad, output_grad); + // runtime->concurrent_task_barrier(ctx); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 69fe3b598d..417cd2c056 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1677,6 +1677,7 @@ void FFModel::finish_nccl_comms() { false /*must*/, 0 /*mapper_id*/, comm.first); + index_launcher.concurrent = true; FutureMap fm = runtime->execute_index_space(ctx, index_launcher); fm.wait_all_results(); } @@ -6899,7 +6900,6 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(LORA_LINEAR_INIT_TASK_ID, "LoraLinear Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "LoraLinear Init Task"); @@ -6932,6 +6932,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); 
registrar.set_leaf(); registrar.set_concurrent(); + registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "LoraLinear PEFT Backward Task"); @@ -6963,7 +6964,6 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(FUSEDOP_INIT_TASK_ID, "FusedOp Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedOp Init Task"); @@ -6979,6 +6979,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); registrar.set_concurrent(); + registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedOp Inference Task"); @@ -6995,6 +6996,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); registrar.set_concurrent(); + registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedOp PEFT Backward Task"); @@ -7011,6 +7013,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); registrar.set_concurrent(); + registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedOp Forward Task"); @@ -7026,6 +7029,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); registrar.set_concurrent(); + registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedOp Backward Task"); @@ -7262,7 +7266,6 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(ALLREDUCE_INIT_TASK_ID, "AllReduce Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "AllReduce init Task"); @@ -7280,6 +7283,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, // AllReduce forward and backward must run concurrently since they // use ncclAllReduce internally registrar.set_concurrent(); + // registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "AllReduce Forward Task"); @@ -7294,9 +7298,6 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(ALLREDUCE_BWD_TASK_ID, "AllReduce Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - // AllReduce forward and backward must run concurrently since they - // use ncclAllReduce internally - // registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "AllReduce Backward Task"); @@ -7315,6 +7316,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, // AllReduce forward and backward must run concurrently since they // use ncclAllReduce internally registrar.set_concurrent(); + // registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "AllReduce Inference Task"); @@ -7330,9 +7332,6 @@ void register_flexflow_internal_tasks(Runtime *runtime, "AllReduce PEFT Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); 
registrar.set_leaf(); - // AllReduce forward and backward must run concurrently since they - // use ncclAllReduce internally - // registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "AllReduce PEFT Backward Task"); @@ -7349,7 +7348,6 @@ void register_flexflow_internal_tasks(Runtime *runtime, "ParallelIdentity Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); - registrar.set_concurrent(); if (pre_register) { Runtime::preregister_task_variant( registrar, "ParallelIdentity init Task"); @@ -7382,6 +7380,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); registrar.set_concurrent(); + // registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "ParallelIdentity Backward Task"); @@ -7415,6 +7414,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); registrar.set_concurrent(); + // registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "ParallelIdentity PEFT Backward Task"); @@ -7433,6 +7433,8 @@ void register_flexflow_internal_tasks(Runtime *runtime, "FusedParallel Forward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); + registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedParallel Forward Task"); @@ -7448,6 +7450,8 @@ void register_flexflow_internal_tasks(Runtime *runtime, "FusedParallel Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); + registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "FusedParallel Backward Task"); @@ -7496,6 +7500,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); registrar.set_concurrent(); + registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "SGD NCCL Update Task", 111 /*variant ID*/); @@ -7511,6 +7516,8 @@ void register_flexflow_internal_tasks(Runtime *runtime, TaskVariantRegistrar registrar(ADAM_UPD_NCCL_TASK_ID, "Adam NCCL Update"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); + registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "Adam NCCL Update Task", 111 /*variant ID*/); @@ -7648,6 +7655,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); registrar.set_concurrent(); + // registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "NCCL Init Communicators Task", 111 /*variant ID*/); @@ -7664,6 +7672,8 @@ void register_flexflow_internal_tasks(Runtime *runtime, "NCCL Finish Communicators"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); + registrar.set_concurrent(); + // registrar.set_concurrent_barrier(); if (pre_register) { Runtime::preregister_task_variant( registrar, "NCCL Finish Communicators Task", 111 /*variant ID*/); diff --git a/src/runtime/optimizer.cc 
b/src/runtime/optimizer.cc index c42a0c9aa6..96b735803c 100644 --- a/src/runtime/optimizer.cc +++ b/src/runtime/optimizer.cc @@ -311,7 +311,7 @@ void SGDOptimizer::nccl_update_task(Task const *task, } } - nccl_update_task_gpu(op, meta, w_grad_ptr, size, w_ptr, v_ptr); + nccl_update_task_gpu(ctx, runtime, op, meta, w_grad_ptr, size, w_ptr, v_ptr); } #endif @@ -603,7 +603,8 @@ void AdamOptimizer::nccl_update_task(Task const *task, } } - nccl_update_task_gpu(op, meta, w_grad_ptr, size, w_ptr, v_ptr, m_ptr); + nccl_update_task_gpu( + ctx, runtime, op, meta, w_grad_ptr, size, w_ptr, v_ptr, m_ptr); } #endif diff --git a/src/runtime/optimizer_kernel.cpp b/src/runtime/optimizer_kernel.cpp index 59efaf5256..9b0d3c8892 100644 --- a/src/runtime/optimizer_kernel.cpp +++ b/src/runtime/optimizer_kernel.cpp @@ -86,7 +86,9 @@ __host__ void SGDOptimizer::ps_update_task_gpu(SGDOptimizer const *op, } #ifdef FF_USE_NCCL -__host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, +__host__ void SGDOptimizer::nccl_update_task_gpu(Context ctx, + Runtime *runtime, + SGDOptimizer const *op, OpMeta const *meta, float const *w_grad_ptr, size_t size, @@ -96,6 +98,7 @@ __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, // fprintf(stderr, "weight(%p) Before ncclAllReduce...\n", w_grad_ptr); hipStream_t stream; checkCUDA(get_legion_stream(&stream)); + runtime->concurrent_task_barrier(ctx); checkNCCL(ncclAllReduce(w_grad_ptr, (float *)w_grad_ptr, size, @@ -103,6 +106,7 @@ __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, ncclSum, meta->handle.ncclComm, stream)); + runtime->concurrent_task_barrier(ctx); // fprintf(stderr, "weight(%p) After ncclAllReduce...\n", w_grad_ptr); // Step 2: SGD update @@ -208,7 +212,9 @@ __host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op, } #ifdef FF_USE_NCCL -__host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, +__host__ void AdamOptimizer::nccl_update_task_gpu(Context ctx, + Runtime *runtime, + AdamOptimizer const *op, OpMeta const *meta, float const *w_grad_ptr, size_t size, @@ -218,6 +224,7 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, // Use NCCL to sync gradients hipStream_t stream; checkCUDA(get_legion_stream(&stream)); + runtime->concurrent_task_barrier(ctx); checkNCCL(ncclAllReduce(w_grad_ptr, (float *)w_grad_ptr, size, @@ -225,6 +232,7 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, ncclSum, meta->handle.ncclComm, stream)); + runtime->concurrent_task_barrier(ctx); // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n", // op->alpha, op->alpha_t, op->weight_decay); // Step 2: Adam update diff --git a/src/runtime/optimizer_kernel.cu b/src/runtime/optimizer_kernel.cu index df37e3b135..72ee74940f 100644 --- a/src/runtime/optimizer_kernel.cu +++ b/src/runtime/optimizer_kernel.cu @@ -75,7 +75,9 @@ __host__ void SGDOptimizer::ps_update_task_gpu(SGDOptimizer const *op, } #ifdef FF_USE_NCCL -__host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, +__host__ void SGDOptimizer::nccl_update_task_gpu(Context ctx, + Runtime *runtime, + SGDOptimizer const *op, OpMeta const *meta, float const *w_grad_ptr, size_t size, @@ -85,6 +87,7 @@ __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, // fprintf(stderr, "weight(%p) Before ncclAllReduce...\n", w_grad_ptr); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); + runtime->concurrent_task_barrier(ctx); checkNCCL(ncclAllReduce(w_grad_ptr, 
(float *)w_grad_ptr, size, @@ -92,6 +95,7 @@ __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, ncclSum, meta->handle.ncclComm, stream)); + runtime->concurrent_task_barrier(ctx); // fprintf(stderr, "weight(%p) After ncclAllReduce...\n", w_grad_ptr); // print_tensor((float*)w_grad_ptr, 16, "[After ncclAllReduce]"); @@ -183,7 +187,9 @@ __host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op, } #ifdef FF_USE_NCCL -__host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, +__host__ void AdamOptimizer::nccl_update_task_gpu(Context ctx, + Runtime *runtime, + AdamOptimizer const *op, OpMeta const *meta, float const *w_grad_ptr, size_t size, @@ -193,6 +199,7 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, // Use NCCL to sync gradients cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); + runtime->concurrent_task_barrier(ctx); checkNCCL(ncclAllReduce(w_grad_ptr, (float *)w_grad_ptr, size, @@ -200,6 +207,7 @@ __host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, ncclSum, meta->handle.ncclComm, stream)); + runtime->concurrent_task_barrier(ctx); // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n", // op->alpha, op->alpha_t, op->weight_decay); // Step 2: Adam update diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 44b181fcb3..5fbee65e6d 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -186,28 +186,35 @@ void RequestManager::register_tokenizer(ModelType type, std::filesystem::path tokenizer_folder(path); if (model_type == ModelType::LLAMA) { - std::filesystem::path tokenizer_model_path; + // try with tokenizer.json first + std::filesystem::path tokenizer_json_path; if (std::filesystem::is_directory(tokenizer_folder)) { - tokenizer_model_path = - std::filesystem::path(tokenizer_folder) / "tokenizer.model"; + tokenizer_json_path = + std::filesystem::path(tokenizer_folder) / "tokenizer.json"; } else { - tokenizer_model_path = tokenizer_folder; + tokenizer_json_path = tokenizer_folder; } - if (std::filesystem::exists(tokenizer_model_path)) { - // load from tokenizer.model - this->tokenizer_ = Tokenizer::FromBlobSentencePiece( - LoadBytesFromFile(tokenizer_model_path.string())); - } else { + if (std::filesystem::exists(tokenizer_json_path)) { // load from tokenizer.json - std::filesystem::path tokenizer_json_path = - tokenizer_folder / "tokenizer.json"; - if (!std::filesystem::exists(tokenizer_json_path)) { - std::cerr << "Failed to open file: " << tokenizer_json_path + this->tokenizer_ = Tokenizer::FromBlobJSON( + LoadBytesFromFile(tokenizer_json_path.string())); + } else { + // load from tokenizer.model + std::filesystem::path tokenizer_model_path; + if (std::filesystem::is_directory(tokenizer_folder)) { + tokenizer_model_path = + std::filesystem::path(tokenizer_folder) / "tokenizer.model"; + } else { + tokenizer_model_path = tokenizer_folder; + } + if (!std::filesystem::exists(tokenizer_model_path)) { + std::cerr << "Failed to open file: " << tokenizer_model_path << std::endl; assert(false); } - this->tokenizer_ = Tokenizer::FromBlobJSON( - LoadBytesFromFile(tokenizer_json_path.string())); + old_llama_tokenizer = true; + this->tokenizer_ = Tokenizer::FromBlobSentencePiece( + LoadBytesFromFile(tokenizer_model_path.string())); } } else if (model_type == ModelType::OPT) { std::filesystem::path vocab_file = tokenizer_folder / "vocab.json"; @@ -264,7 +271,13 @@ RequestManager::RequestGuid request.guid = 
next_available_guid++; request.max_length = request_.max_length; request.max_new_tokens = request_.max_new_tokens; + // both unset + if (request.max_length == -1 && request.max_new_tokens == -1) { + request.max_length = get_max_sequence_length() - 1; + } + // both set if (request.max_length != -1 && request.max_new_tokens != -1) { + request.max_length = -1; std::cout << "Both `max_new_tokens` (=" << request.max_new_tokens << ") and `max_length`(=" << request.max_length @@ -365,15 +378,14 @@ RequestManager::RequestGuid request.initial_len = 0; request.max_length = request_.max_length; request.max_new_tokens = request_.max_new_tokens; - if (request.max_length != -1) { - std::cout << "Warning: max_length is set for PEFT finetuning, but it will " - "be ignored." - << std::endl; - } if (request.max_new_tokens != -1) { - std::cout << "Warning: max_new_tokens is set for PEFT finetuning, but " - "it will be ignored." - << std::endl; + std::cerr + << "Error: max_new_tokens is not allowed for PEFT finetuning requests" + << std::endl; + assert(false); + } + if (request.max_length == -1) { + request.max_length = get_max_sequence_length() - 1; } request.peft_model_id = request_.peft_model_id; request.req_type = RequestType::REQ_FINETUNING; @@ -660,7 +672,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, std::string output = this->tokenizer_->Decode(request.tokens); // Unlike Huggingface, the sentencepiece C++ library automatically // removes the BOS token - if (model_type == ModelType::LLAMA && + if (model_type == ModelType::LLAMA && old_llama_tokenizer && request.tokens.at(0) == bos_token_id) { output = " " + output; } @@ -1121,7 +1133,7 @@ BeamSearchBatchConfig std::string output = this->tokenizer_->Decode(request.tokens); // Unlike Huggingface, the sentencepiece C++ library automatically // removes the BOS token - if (model_type == ModelType::LLAMA && + if (model_type == ModelType::LLAMA && old_llama_tokenizer && request.tokens.at(0) == bos_token_id) { output = " " + output; } @@ -1264,7 +1276,7 @@ BeamSearchBatchConfig std::string output = this->tokenizer_->Decode(request.tokens); // Unlike Huggingface, the sentencepiece C++ library automatically // removes the BOS token - if (model_type == ModelType::LLAMA && + if (model_type == ModelType::LLAMA && old_llama_tokenizer && request.tokens.at(0) == bos_token_id) { output = " " + output; } @@ -1312,7 +1324,7 @@ BeamSearchBatchConfig std::string output = this->tokenizer_->Decode(request.tokens); // Unlike Huggingface, the sentencepiece C++ library automatically removes // the BOS token - if (model_type == ModelType::LLAMA && + if (model_type == ModelType::LLAMA && old_llama_tokenizer && request.tokens.at(0) == bos_token_id) { output = " " + output; } diff --git a/tests/inference/python_test_configs/generate_configs.py b/tests/inference/python_test_configs/generate_configs.py index 0a745c7984..2720304d4f 100644 --- a/tests/inference/python_test_configs/generate_configs.py +++ b/tests/inference/python_test_configs/generate_configs.py @@ -34,6 +34,7 @@ "full_precision": True, "prompt": "", "output_file": "", + "max_length": 128, } ssm_configs = { "ssms": [ From d8355cae0197f35425f3c4164fdcdb23717ea293 Mon Sep 17 00:00:00 2001 From: zhihao Date: Sat, 19 Oct 2024 17:26:17 +0000 Subject: [PATCH 33/44] docker fix --- docker/build.sh | 21 +++++- docker/flexflow-environment/Dockerfile | 15 +++-- .../flexflow-environment/install_pytorch.sh | 67 +++++++++++++++++++ 3 files changed, 97 insertions(+), 6 deletions(-) create mode 100755 
docker/flexflow-environment/install_pytorch.sh diff --git a/docker/build.sh b/docker/build.sh index b68860712f..3b7a6992df 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -1,5 +1,6 @@ #! /usr/bin/env bash set -euo pipefail +set -x # Usage: ./build.sh # Optional environment variables: FF_GPU_BACKEND, cuda_version, hip_version @@ -102,7 +103,16 @@ if [[ "$python_version" != @(3.8|3.9|3.10|3.11|latest) ]]; then exit 0 fi -docker build --build-arg "ff_environment_base_image=${ff_environment_base_image}" --build-arg "N_BUILD_CORES=${n_build_cores}" --build-arg "FF_GPU_BACKEND=${FF_GPU_BACKEND}" --build-arg "hip_version=${hip_version}" --build-arg "python_version=${python_version}" -t "flexflow-environment-${FF_GPU_BACKEND}${gpu_backend_version}" -f docker/flexflow-environment/Dockerfile . +docker build \ + --build-arg "ff_environment_base_image=${ff_environment_base_image}" \ + --build-arg "N_BUILD_CORES=${n_build_cores}" \ + --build-arg "FF_GPU_BACKEND=${FF_GPU_BACKEND}" \ + --build-arg "cuda_version=${cuda_version}" \ + --build-arg "hip_version=${hip_version}" \ + --build-arg "python_version=${python_version}" \ + -t "flexflow-environment-${FF_GPU_BACKEND}${gpu_backend_version}" \ + -f docker/flexflow-environment/Dockerfile \ + . # If the user only wants to build the environment image, we are done if [[ "$image" == "flexflow-environment" ]]; then @@ -162,4 +172,11 @@ fi # Set value of BUILD_CONFIGS get_build_configs -docker build --build-arg "N_BUILD_CORES=${n_build_cores}" --build-arg "FF_GPU_BACKEND=${FF_GPU_BACKEND}" --build-arg "BUILD_CONFIGS=${BUILD_CONFIGS}" --build-arg "gpu_backend_version=${gpu_backend_version}" -t "flexflow-${FF_GPU_BACKEND}${gpu_backend_version}" -f docker/flexflow/Dockerfile . +docker build \ + --build-arg "N_BUILD_CORES=${n_build_cores}" \ + --build-arg "FF_GPU_BACKEND=${FF_GPU_BACKEND}" \ + --build-arg "BUILD_CONFIGS=${BUILD_CONFIGS}" \ + --build-arg "gpu_backend_version=${gpu_backend_version}" \ + -t "flexflow-${FF_GPU_BACKEND}${gpu_backend_version}" \ + -f docker/flexflow/Dockerfile \ + . diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index 7028fc4b2e..373331f0e7 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -4,6 +4,8 @@ FROM ${ff_environment_base_image} LABEL org.opencontainers.image.source=https://github.com/flexflow/FlexFlow LABEL org.opencontainers.image.description="FlexFlow environment container" +SHELL ["/bin/bash", "-c"] + # Install basic dependencies RUN apt-get update && apt-get install -y --no-install-recommends wget sudo binutils git zlib1g-dev lsb-release nano gdb libhdf5-dev jq && \ rm -rf /var/lib/apt/lists/* /etc/apt/sources.list.d/cuda.list /etc/apt/sources.list.d/nvidia-ml.list && \ @@ -53,6 +55,8 @@ ENV CUDA_DIR /usr/local/cuda # GPU-specific dependencies ARG FF_GPU_BACKEND "cuda" +ARG cuda_version "" +ARG hip_version "5.6" # Update NCCL if FF_GPU_BACKEND is cuda # RUN /bin/bash -c 'if [ "$FF_GPU_BACKEND" = "cuda" ]; then \ @@ -73,7 +77,6 @@ ARG FF_GPU_BACKEND "cuda" # package attempts to re-install cuda even though cuda is already installed # in the container. It also attempts to install packages for a graphical install. # For our container, we don't need `hip-runtime-nvidia` -ARG hip_version "5.6" RUN if [ "$FF_GPU_BACKEND" = "hip_cuda" ] || [ "$FF_GPU_BACKEND" = "hip_rocm" ]; then \ echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. 
Installing HIP dependencies"; \ # Check that hip_version is one of 5.3,5.4,5.5,5.6 @@ -106,9 +109,13 @@ RUN rm -rf /var/lib/apt/lists/* # Install python packages and other dependencies RUN conda install -c conda-forge cmake make pillow cmake-build-extension pybind11 numpy pandas keras-preprocessing -# Install CPU-only Pytorch and related dependencies -RUN conda install pytorch torchvision torchaudio -c pytorch -RUN conda install -c conda-forge onnx transformers>=4.31.0 sentencepiece einops +# Install Pytorch +COPY docker/flexflow-environment/install_pytorch.sh /usr/local/bin/install_pytorch.sh +RUN chmod +x /usr/local/bin/install_pytorch.sh && \ + /usr/local/bin/install_pytorch.sh ${cuda_version} && \ + rm /usr/local/bin/install_pytorch.sh +# Various dependencies +RUN pip3 install transformers>=4.31.0 sentencepiece einops RUN pip3 install tensorflow notebook # PEFT-related RUN pip3 install scipy bitsandbytes datasets accelerate loralib triton peft diff --git a/docker/flexflow-environment/install_pytorch.sh b/docker/flexflow-environment/install_pytorch.sh new file mode 100755 index 0000000000..144b080e23 --- /dev/null +++ b/docker/flexflow-environment/install_pytorch.sh @@ -0,0 +1,67 @@ +#!/bin/bash + +# Check if CUDA version is supplied +if [ -z "$1" ]; then + echo "Please provide the CUDA version as XX.Y (e.g., 11.8)" + exit 1 +fi + +# Extract major and minor version from input +CUDA_VERSION=$1 +MAJOR_VERSION=$(echo "$CUDA_VERSION" | cut -d '.' -f 1) +MINOR_VERSION=$(echo "$CUDA_VERSION" | cut -d '.' -f 2) + +# Function to install PyTorch +install_pytorch() { + local major=$1 + local minor=$2 + + echo "Attempting to install PyTorch with CUDA ${major}.${minor} support..." + + # Run dry-run first + if pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu${major}${minor} --dry-run; then + echo "Dry-run succeeded, proceeding with actual installation..." + pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu${major}${minor} + return 0 + else + echo "Dry-run failed for CUDA ${major}.${minor}." + return 1 + fi +} + +# Try to install with provided CUDA version or lower +while [ "$MINOR_VERSION" -ge 0 ]; do + if install_pytorch "$MAJOR_VERSION" "$MINOR_VERSION"; then + echo "PyTorch installation successful with CUDA ${MAJOR_VERSION}.${MINOR_VERSION}" + exit 0 + else + # Decrease the minor version + MINOR_VERSION=$((MINOR_VERSION - 1)) + + # Abort if minor version is less than 0 (all <= input failed) + if [ "$MINOR_VERSION" -lt 0 ]; then + echo "All minor versions <= input failed. Searching for the smallest minor version." + fi + fi +done + +# Now attempt to find the smallest available minor version >= 0 +MINOR_VERSION=0 +echo "Starting search for the smallest minor version..." + +while true; do + if install_pytorch "$MAJOR_VERSION" "$MINOR_VERSION"; then + echo "PyTorch installation successful with CUDA ${MAJOR_VERSION}.${MINOR_VERSION}" + exit 0 + else + # Increase minor version to search for available one + MINOR_VERSION=$((MINOR_VERSION + 1)) + + # Stop if no valid version is found after a certain number of tries + # For practical purposes, let's assume we won't go beyond minor version 10 + if [ "$MINOR_VERSION" -gt 10 ]; then + echo "No valid PyTorch installation found for CUDA ${MAJOR_VERSION}. Aborting." 
+ exit 1 + fi + fi +done From bf6be8cd72ef74acc4bf0e9be698dac5abd139fa Mon Sep 17 00:00:00 2001 From: zhihao Date: Sat, 19 Oct 2024 17:30:28 +0000 Subject: [PATCH 34/44] shellcheck --- docker/flexflow-environment/install_pytorch.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/flexflow-environment/install_pytorch.sh b/docker/flexflow-environment/install_pytorch.sh index 144b080e23..9fe5151877 100755 --- a/docker/flexflow-environment/install_pytorch.sh +++ b/docker/flexflow-environment/install_pytorch.sh @@ -19,9 +19,9 @@ install_pytorch() { echo "Attempting to install PyTorch with CUDA ${major}.${minor} support..." # Run dry-run first - if pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu${major}${minor} --dry-run; then + if pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu"${major}${minor}" --dry-run; then echo "Dry-run succeeded, proceeding with actual installation..." - pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu${major}${minor} + pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu"${major}${minor}" return 0 else echo "Dry-run failed for CUDA ${major}.${minor}." From 4f6990f4ebd3c1a2cbe4e7bd3e67daa7430c6536 Mon Sep 17 00:00:00 2001 From: zhihao Date: Sat, 19 Oct 2024 17:40:30 +0000 Subject: [PATCH 35/44] update --- docker/flexflow-environment/Dockerfile | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index 373331f0e7..596d099f79 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -111,9 +111,15 @@ RUN rm -rf /var/lib/apt/lists/* RUN conda install -c conda-forge cmake make pillow cmake-build-extension pybind11 numpy pandas keras-preprocessing # Install Pytorch COPY docker/flexflow-environment/install_pytorch.sh /usr/local/bin/install_pytorch.sh -RUN chmod +x /usr/local/bin/install_pytorch.sh && \ - /usr/local/bin/install_pytorch.sh ${cuda_version} && \ - rm /usr/local/bin/install_pytorch.sh +RUN if [ "$FF_GPU_BACKEND" == "cuda" ] ; then \ + echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Installing PyTorch with CUDA"; \ + chmod +x /usr/local/bin/install_pytorch.sh && \ + /usr/local/bin/install_pytorch.sh ${cuda_version} && \ + rm /usr/local/bin/install_pytorch.sh; \ + else \ + echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Installing CPU-only PyTorch"; \ + pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu; \ + fi # Various dependencies RUN pip3 install transformers>=4.31.0 sentencepiece einops RUN pip3 install tensorflow notebook From 89f10f4257887a3288435bbd7aa4bb0e628b8a33 Mon Sep 17 00:00:00 2001 From: zhihao Date: Sat, 19 Oct 2024 17:41:32 +0000 Subject: [PATCH 36/44] update --- docker/flexflow-environment/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index 596d099f79..d571befdda 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -114,12 +114,12 @@ COPY docker/flexflow-environment/install_pytorch.sh /usr/local/bin/install_pytor RUN if [ "$FF_GPU_BACKEND" == "cuda" ] ; then \ echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. 
Installing PyTorch with CUDA"; \ chmod +x /usr/local/bin/install_pytorch.sh && \ - /usr/local/bin/install_pytorch.sh ${cuda_version} && \ - rm /usr/local/bin/install_pytorch.sh; \ + /usr/local/bin/install_pytorch.sh ${cuda_version}; \ else \ echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Installing CPU-only PyTorch"; \ pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu; \ fi +RUN rm /usr/local/bin/install_pytorch.sh # Various dependencies RUN pip3 install transformers>=4.31.0 sentencepiece einops RUN pip3 install tensorflow notebook From d09ba0c26c11c1d5cd4f7f3935cbbb585d4de18c Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 4 Nov 2024 11:52:24 -0500 Subject: [PATCH 37/44] ChatCompletion + Multi-EOS support (#1535) * init * support templates * support for multiple eos token ids * fix * fix * fix conda env for ci --- conda/flexflow.yml | 6 +- include/flexflow/flexflow_c.h | 7 +- include/flexflow/request_manager.h | 6 +- inference/incr_decoding/incr_decoding.cc | 19 +++- inference/peft/peft.cc | 19 +++- inference/peft/peft_bwd_benchmark.cc | 19 +++- inference/peft/peft_fwd_benchmark.cc | 19 +++- inference/peft/req_rate_benchmark.cc | 19 +++- inference/python/chat.py | 100 ++++++++++++++++++++ inference/spec_infer/spec_infer.cc | 41 +++++--- python/flexflow/core/flexflow_cffi.py | 115 +++-------------------- python/flexflow/serve/serve.py | 54 ++++++++++- src/c/flexflow_c.cc | 28 ++++-- src/runtime/request_manager.cc | 32 +++++-- 14 files changed, 327 insertions(+), 157 deletions(-) create mode 100644 inference/python/chat.py diff --git a/conda/flexflow.yml b/conda/flexflow.yml index 091ba929e4..771b40ecd5 100644 --- a/conda/flexflow.yml +++ b/conda/flexflow.yml @@ -16,9 +16,9 @@ dependencies: - qualname>=0.1.0 - keras_preprocessing>=1.1.2 - numpy>=1.16.0 - - torch>=1.13.1 --index-url https://download.pytorch.org/whl/cpu - - torchaudio>=0.13.1 --index-url https://download.pytorch.org/whl/cpu - - torchvision>=0.14.1 --index-url https://download.pytorch.org/whl/cpu + - torch>=1.13.1 + - torchaudio>=0.13.1 + - torchvision>=0.14.1 - regex - onnx - transformers>=4.31.0 diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 52f67d8efb..6501b0658c 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -653,6 +653,7 @@ void flexflow_model_generate(flexflow_model_t handle_, char **output_texts, int *max_lengths, int *max_new_tokens_, + bool *add_special_tokens_, flexflow_peft_model_id_t *peft_model_ids, char const **dataset_filepaths, int *training_steps, @@ -1019,6 +1020,9 @@ void flexflow_request_manager_set_max_spec_tree_token_num( void flexflow_request_manager_set_max_sequence_length( flexflow_request_manager_t handle_, int max_seq_length); +int flexflow_request_manager_get_max_sequence_length( + flexflow_request_manager_t handle_); + void flexflow_request_manager_set_enable_peft_finetuning( flexflow_request_manager_t handle_, bool enable_peft_finetuning_); @@ -1026,7 +1030,8 @@ void flexflow_request_manager_register_tokenizer( flexflow_request_manager_t handle_, enum ModelType model_type, int bos_token_id, - int eos_token_id, + int num_eos_token_ids, + int *eos_token_ids, char const *tokenizer_filepath); void flexflow_request_manager_register_output_filepath( diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 94bfc74244..d62b610f3d 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -69,6 +69,7 @@ struct Request { 
PEFTModelID peft_model_id = PEFTModelID::NO_ID; int max_length = -1; int max_new_tokens = -1; + bool add_special_tokens = true; int initial_len; int ssm_cache_size = 0; int llm_cache_size = 0; @@ -146,7 +147,7 @@ class RequestManager { int register_ssm_model(FFModel *model); void register_tokenizer(ModelType model_type, int bos_token_id, - int eos_token_id, + std::vector eos_token_ids, std::string const &path); void register_output_filepath(std::string const &); void initBitMask(BatchConfig::BitMask &bitmask, int initLength); @@ -178,6 +179,7 @@ class RequestManager { bool is_request_completed(RequestGuid const &guid); void trigger_request_completion_future(RequestGuid const &guid); // Methods for preparing next batches + bool is_eos_token(int token_id); bool check_inf_req_completion(BatchConfig const &old_bc, int i); void check_batch(BatchConfig const &old_bc, BatchConfig const &new_bc); BatchConfig prepare_next_batch(BatchConfig const &bc, @@ -301,7 +303,7 @@ class RequestManager { bool verbose; ModelType model_type; int bos_token_id; - int eos_token_id; + std::vector eos_token_ids; bool old_llama_tokenizer = false; std::string output_filepath; std::queue pending_infr_request_queue; diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index f8e16f24fa..f148d440e2 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -199,9 +199,20 @@ void FlexFlow::top_level_task(Task const *task, int bos_token_id = model_config.find("bos_token_id") == model_config.end() ? -1 : (int)model_config.at("bos_token_id"); - int eos_token_id = model_config.find("eos_token_id") == model_config.end() - ? -1 - : (int)model_config.at("eos_token_id"); + // parse eos token id, which can be either a single integer or an array of + // integers. Convert to std::vector + std::vector eos_token_ids; + if (model_config.find("eos_token_id") != model_config.end()) { + if (model_config["eos_token_id"].is_array()) { + for (auto &eos_token_id : model_config["eos_token_id"]) { + eos_token_ids.push_back(eos_token_id); + } + } else { + eos_token_ids.push_back(model_config["eos_token_id"]); + } + } else { + eos_token_ids.push_back(-1); + } assert(model_type != ModelType::UNKNOWN && "Invalid LLM model type passed (or no type was passed)."); @@ -212,7 +223,7 @@ void FlexFlow::top_level_task(Task const *task, rm->set_max_tokens_per_batch(max_tokens_per_batch); rm->set_max_sequence_length(max_sequence_length); rm->register_tokenizer( - model_type, bos_token_id, eos_token_id, tokenizer_filepath); + model_type, bos_token_id, eos_token_ids, tokenizer_filepath); rm->register_output_filepath(file_paths.output_file_path); FFModel model(ffconfig, ffconfig.cpu_offload); diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index 14fc653eba..0ab0b62ee8 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -229,9 +229,20 @@ void FlexFlow::top_level_task(Task const *task, int bos_token_id = model_config.find("bos_token_id") == model_config.end() ? -1 : (int)model_config.at("bos_token_id"); - int eos_token_id = model_config.find("eos_token_id") == model_config.end() - ? -1 - : (int)model_config.at("eos_token_id"); + // parse eos token id, which can be either a single integer or an array of + // integers. 
Convert to std::vector + std::vector eos_token_ids; + if (model_config.find("eos_token_id") != model_config.end()) { + if (model_config["eos_token_id"].is_array()) { + for (auto &eos_token_id : model_config["eos_token_id"]) { + eos_token_ids.push_back(eos_token_id); + } + } else { + eos_token_ids.push_back(model_config["eos_token_id"]); + } + } else { + eos_token_ids.push_back(-1); + } assert(model_type != ModelType::UNKNOWN && "Invalid LLM model type passed (or no type was passed)."); @@ -267,7 +278,7 @@ void FlexFlow::top_level_task(Task const *task, rm->set_max_tokens_per_batch(max_tokens_per_batch); rm->set_max_sequence_length(max_sequence_length); rm->register_tokenizer( - model_type, bos_token_id, eos_token_id, tokenizer_filepath); + model_type, bos_token_id, eos_token_ids, tokenizer_filepath); rm->register_output_filepath(file_paths.output_file_path); rm->set_enable_peft_finetuning(enable_peft_finetuning); diff --git a/inference/peft/peft_bwd_benchmark.cc b/inference/peft/peft_bwd_benchmark.cc index df9a1e35db..85e97ec4e8 100644 --- a/inference/peft/peft_bwd_benchmark.cc +++ b/inference/peft/peft_bwd_benchmark.cc @@ -230,9 +230,20 @@ void FlexFlow::top_level_task(Task const *task, int bos_token_id = model_config.find("bos_token_id") == model_config.end() ? -1 : (int)model_config.at("bos_token_id"); - int eos_token_id = model_config.find("eos_token_id") == model_config.end() - ? -1 - : (int)model_config.at("eos_token_id"); + // parse eos token id, which can be either a single integer or an array of + // integers. Convert to std::vector + std::vector eos_token_ids; + if (model_config.find("eos_token_id") != model_config.end()) { + if (model_config["eos_token_id"].is_array()) { + for (auto &eos_token_id : model_config["eos_token_id"]) { + eos_token_ids.push_back(eos_token_id); + } + } else { + eos_token_ids.push_back(model_config["eos_token_id"]); + } + } else { + eos_token_ids.push_back(-1); + } assert(model_type != ModelType::UNKNOWN && "Invalid LLM model type passed (or no type was passed)."); @@ -251,7 +262,7 @@ void FlexFlow::top_level_task(Task const *task, rm->set_max_tokens_per_batch(max_tokens_per_batch); rm->set_max_sequence_length(max_sequence_length); rm->register_tokenizer( - model_type, bos_token_id, eos_token_id, tokenizer_filepath); + model_type, bos_token_id, eos_token_ids, tokenizer_filepath); rm->register_output_filepath(file_paths.output_file_path); rm->set_enable_peft_finetuning(enable_peft_finetuning); diff --git a/inference/peft/peft_fwd_benchmark.cc b/inference/peft/peft_fwd_benchmark.cc index 9b020f5954..87322a42dd 100644 --- a/inference/peft/peft_fwd_benchmark.cc +++ b/inference/peft/peft_fwd_benchmark.cc @@ -230,9 +230,20 @@ void FlexFlow::top_level_task(Task const *task, int bos_token_id = model_config.find("bos_token_id") == model_config.end() ? -1 : (int)model_config.at("bos_token_id"); - int eos_token_id = model_config.find("eos_token_id") == model_config.end() - ? -1 - : (int)model_config.at("eos_token_id"); + // parse eos token id, which can be either a single integer or an array of + // integers. 
Convert to std::vector + std::vector eos_token_ids; + if (model_config.find("eos_token_id") != model_config.end()) { + if (model_config["eos_token_id"].is_array()) { + for (auto &eos_token_id : model_config["eos_token_id"]) { + eos_token_ids.push_back(eos_token_id); + } + } else { + eos_token_ids.push_back(model_config["eos_token_id"]); + } + } else { + eos_token_ids.push_back(-1); + } assert(model_type != ModelType::UNKNOWN && "Invalid LLM model type passed (or no type was passed)."); @@ -251,7 +262,7 @@ void FlexFlow::top_level_task(Task const *task, rm->set_max_tokens_per_batch(max_tokens_per_batch); rm->set_max_sequence_length(max_sequence_length); rm->register_tokenizer( - model_type, bos_token_id, eos_token_id, tokenizer_filepath); + model_type, bos_token_id, eos_token_ids, tokenizer_filepath); rm->register_output_filepath(file_paths.output_file_path); rm->set_enable_peft_finetuning(enable_peft_finetuning); diff --git a/inference/peft/req_rate_benchmark.cc b/inference/peft/req_rate_benchmark.cc index cde3b1c02e..ffa77478e1 100644 --- a/inference/peft/req_rate_benchmark.cc +++ b/inference/peft/req_rate_benchmark.cc @@ -292,9 +292,20 @@ void FlexFlow::top_level_task(Task const *task, int bos_token_id = model_config.find("bos_token_id") == model_config.end() ? -1 : (int)model_config.at("bos_token_id"); - int eos_token_id = model_config.find("eos_token_id") == model_config.end() - ? -1 - : (int)model_config.at("eos_token_id"); + // parse eos token id, which can be either a single integer or an array of + // integers. Convert to std::vector + std::vector eos_token_ids; + if (model_config.find("eos_token_id") != model_config.end()) { + if (model_config["eos_token_id"].is_array()) { + for (auto &eos_token_id : model_config["eos_token_id"]) { + eos_token_ids.push_back(eos_token_id); + } + } else { + eos_token_ids.push_back(model_config["eos_token_id"]); + } + } else { + eos_token_ids.push_back(-1); + } assert(model_type != ModelType::UNKNOWN && "Invalid LLM model type passed (or no type was passed)."); @@ -313,7 +324,7 @@ void FlexFlow::top_level_task(Task const *task, rm->set_max_tokens_per_batch(max_tokens_per_batch); rm->set_max_sequence_length(max_sequence_length); rm->register_tokenizer( - model_type, bos_token_id, eos_token_id, tokenizer_filepath); + model_type, bos_token_id, eos_token_ids, tokenizer_filepath); rm->register_output_filepath(file_paths.output_file_path); rm->set_enable_peft_finetuning(enable_peft_finetuning); diff --git a/inference/python/chat.py b/inference/python/chat.py new file mode 100644 index 0000000000..13ece116a6 --- /dev/null +++ b/inference/python/chat.py @@ -0,0 +1,100 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
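+
+# Minimal chat-completion walkthrough: build the runtime configs, initialize the
+# FlexFlow runtime, load an instruction-tuned LLM, compile it for inference,
+# start the background server, and generate a reply to a list of role/content
+# chat messages.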
+ +import flexflow.serve as ff +import argparse, json, os +from types import SimpleNamespace + + +def get_configs(): + # Define sample configs + ff_init_configs = { + # required parameters + "num_gpus": 1, + "memory_per_gpu": 30000, + "zero_copy_memory_per_node": 60000, + # optional parameters + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 1, + "offload": False, + "offload_reserve_space_size": 8 * 1024, # 8GB + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "enable_peft": False, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB + "profiling": False, + "benchmarking": False, + "inference_debugging": False, + "fusion": True, + } + llm_configs = { + # required parameters + "llm_model": "meta-llama/Meta-Llama-3-8B-Instruct", + # optional parameters + "cache_path": os.environ.get("FF_CACHE_PATH", ""), + "refresh_cache": False, + "full_precision": False, + } + # Merge dictionaries + ff_init_configs.update(llm_configs) + return ff_init_configs + + +def main(): + configs_dict = get_configs() + configs = SimpleNamespace(**configs_dict) + + # Initialize the FlexFlow runtime. ff.init() takes a dictionary or the path to a JSON file with the configs + ff.init(configs_dict) + + # Create the FlexFlow LLM + ff_data_type = ( + ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + ) + llm = ff.LLM( + configs.llm_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + ) + + # Compile the LLM for inference and load the weights into memory + generation_config = ff.GenerationConfig( + do_sample=False, temperature=0.9, topp=0.8, topk=1 + ) + llm.compile( + generation_config, + max_requests_per_batch=1, + max_seq_length=2048, + max_tokens_per_batch=256, + ) + + llm.start_server() + + messages=[ + {"role": "system", "content": "You are a helpful an honest programming assistant."}, + {"role": "user", "content": "Is Rust better than Python?"}, + ] + llm.generate(messages, max_new_tokens=256) + + llm.stop_server() + + +if __name__ == "__main__": + print("flexflow inference example (incremental decoding)") + main() diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 134ae70c4a..7ec3cf61f5 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -47,7 +47,8 @@ struct ModelMeta { std::string llm_weights_path; std::string llm_model_config_path; - int bos_token_id, eos_token_id; + int bos_token_id; + std::vector eos_token_ids; std::vector ssm_model_types; std::vector ssm_model_config_paths; @@ -191,10 +192,20 @@ void get_model_meta(FilePaths &file_paths, llm_model_config.find("bos_token_id") == llm_model_config.end() ? -1 : (int)llm_model_config.at("bos_token_id"); - model_metadata.eos_token_id = - llm_model_config.find("eos_token_id") == llm_model_config.end() - ? -1 - : (int)llm_model_config.at("eos_token_id"); + // parse eos token id, which can be either a single integer or an array of + // integers. 
Convert to std::vector + std::vector eos_token_ids; + if (llm_model_config.find("eos_token_id") != llm_model_config.end()) { + if (llm_model_config["eos_token_id"].is_array()) { + for (auto &eos_token_id : llm_model_config["eos_token_id"]) { + model_metadata.eos_token_ids.push_back(eos_token_id); + } + } else { + model_metadata.eos_token_ids.push_back(llm_model_config["eos_token_id"]); + } + } else { + model_metadata.eos_token_ids.push_back(-1); + } for (auto ssm_model_name : model_metadata.model_names.ssm_model_names) { std::string ssm_config_path = join_path({file_paths.cache_folder_path, @@ -241,15 +252,15 @@ void get_model_meta(FilePaths &file_paths, ssm_model_config.find("bos_token_id") == ssm_model_config.end() ? -1 : (int)ssm_model_config.at("bos_token_id"); - int ssm_eos_id = - ssm_model_config.find("eos_token_id") == ssm_model_config.end() - ? -1 - : (int)ssm_model_config.at("eos_token_id"); - if (ssm_bos_id != model_metadata.bos_token_id || - ssm_eos_id != model_metadata.eos_token_id) { - printf("Warning: bos/eos token id mismatch between LLM and one of the " - "SSMs!\n"); - } + // int ssm_eos_id = + // ssm_model_config.find("eos_token_id") == ssm_model_config.end() + // ? -1 + // : (int)ssm_model_config.at("eos_token_id"); + // if (ssm_bos_id != model_metadata.bos_token_id || + // ssm_eos_id != model_metadata.eos_token_id) { + // printf("Warning: bos/eos token id mismatch between LLM and one of the " + // "SSMs!\n"); + // } model_metadata.ssm_model_types.push_back(ssm_model_type); model_metadata.ssm_model_config_paths.push_back(ssm_config_path); model_metadata.ssm_model_weights_paths.push_back(ssm_weights_path); @@ -310,7 +321,7 @@ void FlexFlow::top_level_task(Task const *task, rm->set_max_sequence_length(max_sequence_length); rm->register_tokenizer(model_metadata.llm_model_type, model_metadata.bos_token_id, - model_metadata.eos_token_id, + model_metadata.eos_token_ids, model_metadata.llm_tokenizer_path); rm->register_output_filepath(file_paths.output_file_path); diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index e2240f0b4f..59e62ea023 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -1588,7 +1588,12 @@ def register_tokenizer( c_model_type = enum_to_int(ModelType, model_type) c_tokenizer_filepath = get_c_name(tokenizer_filepath) return ffc().flexflow_request_manager_register_tokenizer( - self.handle, c_model_type, bos_token_id, eos_token_id, c_tokenizer_filepath + self.handle, + c_model_type, + bos_token_id, + len(eos_token_id), + eos_token_id, + c_tokenizer_filepath, ) def register_output_filepath(self, output_filepath): @@ -1622,6 +1627,9 @@ def set_max_sequence_length(self, max_length): self.handle, max_length ) + def get_max_sequence_length(self): + return ffc().flexflow_request_manager_get_max_sequence_length(self.handle) + def set_enable_peft_finetuning(self, enable_peft_finetuning): return ffc().flexflow_request_manager_set_enable_peft_finetuning( self.handle, enable_peft_finetuning @@ -2060,6 +2068,7 @@ class Request: prompt: Optional[str] = None max_length: int = -1 max_new_tokens: int = -1 + add_special_tokens: bool = True peft_model_id: Optional[PEFTModelID] = None dataset_filepath: Optional[str] = None max_training_steps: int = 1 @@ -4652,91 +4661,6 @@ def get_output_tensor(self, ffmodel, data_type): assert ret_val == True return np_array - def _estimate_max_num_tokens( - max_length: int, max_new_tokens: int, prompt: Optional[str] - ): - if prompt is None: - assert 
max_new_tokens == -1 - return ( - math.ceil(max_new_tokens + len(prompt.split()) * 1.5) - if max_new_tokens != -1 - else max_length - ) - - def _estimate_max_num_chars( - max_length: int, max_new_tokens: int, prompt: Optional[str] - ): - return ( - 5 * FFModel._estimate_max_num_tokens(max_length, max_new_tokens, prompt) - + 100 - ) - - # deprecated - def generate_inf_only( - self, - prompt_list: List[str], - max_length: int, - max_new_tokens: int, - ): - if max_length != -1 and max_new_tokens != -1: - raise ValueError( - f"Both `max_new_tokens` (={max_new_tokens}) and `max_length`(={max_length}) seem to have been set." - ) - if max_length == -1 and max_new_tokens == -1: - raise ValueError( - f"Both `max_new_tokens` (={max_new_tokens}) and `max_length`(={max_length}) were left unset." - ) - assert isinstance(prompt_list, list) - c_input_texts = [get_c_name(prompt) for prompt in prompt_list] - c_output_texts = [ - ffi.new( - "char[]", - FFModel._estimate_max_num_chars(max_length, max_new_tokens, prompt), - ) - for prompt in prompt_list - ] - c_output_length_and_tokens = [ - ffi.new( - "int[]", - FFModel._estimate_max_num_tokens(max_length, max_new_tokens, prompt) - + 100, - ) - for prompt in prompt_list - ] - c_request_types = [ - enum_to_int(RequestType, RequestType.REQ_INFERENCE) for _ in prompt_list - ] - max_lengths = [max_length for _ in prompt_list] - max_new_tokens_ = [max_new_tokens for _ in prompt_list] - peft_model_ids = [PEFTModelID.no_id_handle() for _ in prompt_list] - dataset_filepaths = [ffi.NULL for _ in prompt_list] - training_steps = [0 for _ in prompt_list] - num_finetuning_losses = ffi.new("int *") - c_finetuning_losses = ffi.new("float[]", 0) - ffc().flexflow_model_generate( - self.handle, - len(prompt_list), - c_request_types, - c_input_texts, - c_output_texts, - max_lengths, - max_new_tokens_, - peft_model_ids, - dataset_filepaths, - training_steps, - c_output_length_and_tokens, - num_finetuning_losses, - c_finetuning_losses, - ) - from flexflow.serve import GenerationResult - - return [ - GenerationResult( - text=ffi.string(c_output_text), tokens=[], finetuning_losses=[] - ) - for c_output_text in c_output_texts - ] - def generate(self, requests_list: List[Request]): assert isinstance(requests_list, list) for request in requests_list: @@ -4756,37 +4680,27 @@ def generate(self, requests_list: List[Request]): raise ValueError( f"Finetuning requests should not have `max_new_tokens` set." 
) + max_sequence_length = RequestManager().get_max_sequence_length() c_input_texts = [ get_c_name(request.prompt) for request in requests_list ] # entry will be None for finetuning requests c_output_texts = [ ( - ffi.new( - "char[]", - FFModel._estimate_max_num_chars( - request.max_length, request.max_new_tokens, request.prompt - ), - ) + ffi.new("char[]", max_sequence_length * 5) if request.req_type == RequestType.REQ_INFERENCE else ffi.NULL ) for request in requests_list ] c_output_length_and_tokens = [ - ffi.new( - "int[]", - FFModel._estimate_max_num_tokens( - request.max_length, request.max_new_tokens, request.prompt - ) - + 100, - ) - for request in requests_list + ffi.new("int[]", max_sequence_length + 100) for request in requests_list ] c_request_types = [ enum_to_int(RequestType, request.req_type) for request in requests_list ] max_lengths = [request.max_length for request in requests_list] max_new_tokens_ = [request.max_new_tokens for request in requests_list] + add_special_tokens_ = [request.add_special_tokens for request in requests_list] peft_model_ids = [ ( @@ -4813,6 +4727,7 @@ def generate(self, requests_list: List[Request]): c_output_texts, max_lengths, max_new_tokens_, + add_special_tokens_, peft_model_ids, dataset_filepaths, training_steps, diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index c8540a6ed3..e4248a2fc1 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -27,7 +27,7 @@ MPTConfig, ) from flexflow.core import * -from transformers import AutoConfig, AutoModelForCausalLM +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer from peft import PeftModel, PeftConfig, LoraConfig from huggingface_hub import HfApi import torch, shutil, hashlib, json, gc @@ -104,6 +104,7 @@ def __init__( self.output_file = output_file self.rm = None self.pefts = {} + self.tokenizer=None def __del__(self): # Stop the background server before deleting the object @@ -499,6 +500,10 @@ def compile( eos_token_id = ( -1 if self.hf_config.eos_token_id is None else self.hf_config.eos_token_id ) + if type(eos_token_id) == int: + eos_token_id = [eos_token_id] + elif type(eos_token_id) != list: + raise ValueError("eos_token_id must be an integer or a list of integers") self.rm.register_tokenizer( self.model_type, bos_token_id, eos_token_id, self.tokenizer_path ) @@ -548,9 +553,29 @@ def _generate(self, requests: List[Request]): ) return self.model.ffmodel.generate(requests) + def __chat2prompt(self, messages: List[dict]): + """Convert a list of messages to a single prompt string + + :param messages: The list of messages to convert + :type messages: List[dict] + :return: The prompt string + :rtype: str + """ + # ensure that each element is a dictionary, containing the "role" and "content" keys + for message in messages: + if type(message) != dict or "role" not in message or "content" not in message: + raise ValueError( + "Each element in the list must be a dictionary with the keys 'role' and 'content'" + ) + if self.tokenizer is None: + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) + if self.tokenizer.chat_template is None: + raise ValueError(f"Model {self.model_name} does not support chat completion") + return self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + def generate( self, - requests_or_prompts: Union[str, List[str], Request, List[Request]], + requests_or_prompts: Union[str, List[str], List[dict], Request, List[Request]], max_length: int = -1, 
max_new_tokens: int = -1, ): @@ -591,7 +616,30 @@ def generate( for req in requests_or_prompts ] return self._generate(requests) - else: + elif type(requests_or_prompts[0]) == dict: + prompt = self.__chat2prompt(requests_or_prompts) + request = Request( + req_type=RequestType.REQ_INFERENCE, + prompt=prompt, + max_length=max_length, + max_new_tokens=max_new_tokens, + add_special_tokens=False, + ) + return self._generate([request]) + elif type(requests_or_prompts[0]) == list: + prompts = [self.__chat2prompt(messages) for messages in requests_or_prompts] + requests = [ + Request( + req_type=RequestType.REQ_INFERENCE, + prompt=prompt, + max_length=max_length, + max_new_tokens=max_new_tokens, + add_special_tokens=False, + ) + for prompt in prompts + ] + return self._generate(requests) + elif type(requests_or_prompts[0]) == Request: print(requests_or_prompts) return self._generate(requests_or_prompts) else: diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index bfa60a6d54..da90c586e3 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -1685,6 +1685,7 @@ void flexflow_model_generate(flexflow_model_t handle_, char **output_texts, int *max_lengths, int *max_new_tokens_, + bool *add_special_tokens_, flexflow_peft_model_id_t *peft_model_ids, char const **dataset_filepaths, int *training_steps, @@ -1701,22 +1702,25 @@ void flexflow_model_generate(flexflow_model_t handle_, inference_req.prompt = text_str; inference_req.max_length = max_lengths[i]; inference_req.max_new_tokens = max_new_tokens_[i]; + inference_req.add_special_tokens = add_special_tokens_[i]; PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_ids[i]); if (peft_model_id != nullptr) { inference_req.peft_model_id = *peft_model_id; } requests.push_back(inference_req); - DEBUG_PRINT("[Model] generate[%d] %p %s %i %i", + DEBUG_PRINT("[Model] generate[%d] %p %s %i %i %i", i, handle, text_str.c_str(), max_lengths[i], - max_new_tokens_[i]); + max_new_tokens_[i], + add_special_tokens_[i]); } else if (request_types[i] == RequestType::REQ_FINETUNING) { Request fine_tuning_req; fine_tuning_req.req_type = RequestType::REQ_FINETUNING; fine_tuning_req.max_length = max_lengths[i]; fine_tuning_req.max_new_tokens = max_new_tokens_[i]; + fine_tuning_req.add_special_tokens = add_special_tokens_[i]; PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_ids[i]); if (peft_model_id != nullptr) { fine_tuning_req.peft_model_id = *peft_model_id; @@ -1725,12 +1729,13 @@ void flexflow_model_generate(flexflow_model_t handle_, fine_tuning_req.dataset_filepath = dataset_fp; fine_tuning_req.max_training_steps = training_steps[i]; requests.push_back(fine_tuning_req); - DEBUG_PRINT("[Model] finetune[%d] %p %s %i %i %i", + DEBUG_PRINT("[Model] finetune[%d] %p %s %i %i %i %i", i, handle, dataset_fp.c_str(), max_lengths[i], - max_new_tokens[i], + max_new_tokens_[i], + add_special_tokens_[i], training_steps[i]); } else { assert(false && "Unknown request type"); @@ -2754,6 +2759,12 @@ void flexflow_request_manager_set_max_sequence_length( DEBUG_PRINT("[RequestManager] set max_sequence_length %d", max_seq_length); } +int flexflow_request_manager_get_max_sequence_length( + flexflow_request_manager_t handle_) { + RequestManager *handle = FFCObjectWrapper::unwrap(handle_); + return handle->get_max_sequence_length(); +} + void flexflow_request_manager_set_enable_peft_finetuning( flexflow_request_manager_t handle_, bool enable_peft_finetuning_) { RequestManager *handle = FFCObjectWrapper::unwrap(handle_); @@ -2766,14 +2777,19 @@ void 
flexflow_request_manager_register_tokenizer( flexflow_request_manager_t handle_, enum ModelType model_type, int bos_token_id, - int eos_token_id, + int num_eos_token_ids, + int *eos_token_ids, char const *tokenizer_filepath) { RequestManager *handle = FFCObjectWrapper::unwrap(handle_); assert(tokenizer_filepath != nullptr && "Cannot convert nullptr char * to std::string"); std::string const tokenizer_filepath_str(tokenizer_filepath); + std::vector eos_token_ids_vec; + for (int i = 0; i < num_eos_token_ids; i++) { + eos_token_ids_vec.push_back(eos_token_ids[i]); + } handle->register_tokenizer( - model_type, bos_token_id, eos_token_id, tokenizer_filepath_str); + model_type, bos_token_id, eos_token_ids_vec, tokenizer_filepath_str); DEBUG_PRINT( "[RequestManager] register tokenizer %p %s", handle, tokenizer_filepath); } diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 5fbee65e6d..193abbb455 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -56,6 +56,7 @@ std::ostream &operator<<(std::ostream &os, Request const &req) { os << " peft_model_id: " << req.peft_model_id << "\n"; os << " max_length: " << req.max_length << "\n"; os << " max_new_tokens: " << req.max_new_tokens << "\n"; + os << " add_special_tokens: " << req.add_special_tokens << "\n"; os << " initial_len: " << req.initial_len << "\n"; os << " ssm_cache_size: " << req.ssm_cache_size << "\n"; os << " llm_cache_size: " << req.llm_cache_size << "\n"; @@ -178,11 +179,11 @@ void RequestManager::set_inference_finished(bool finished) { void RequestManager::register_tokenizer(ModelType type, int bos_token_id, - int eos_token_id, + std::vector eos_token_ids, std::string const &path) { this->model_type = type; this->bos_token_id = bos_token_id; - this->eos_token_id = eos_token_id; + this->eos_token_ids = eos_token_ids; std::filesystem::path tokenizer_folder(path); if (model_type == ModelType::LLAMA) { @@ -271,6 +272,7 @@ RequestManager::RequestGuid request.guid = next_available_guid++; request.max_length = request_.max_length; request.max_new_tokens = request_.max_new_tokens; + request.add_special_tokens = request_.add_special_tokens; // both unset if (request.max_length == -1 && request.max_new_tokens == -1) { request.max_length = get_max_sequence_length() - 1; @@ -285,7 +287,8 @@ RequestManager::RequestGuid } request.peft_model_id = request_.peft_model_id; request.warmup = request_.warmup; - if (bos_token_id >= 0 && model_type != ModelType::FALCON) { + if (bos_token_id >= 0 && model_type != ModelType::FALCON && + request.add_special_tokens) { request.tokens.push_back(bos_token_id); } if (request_.benchmarking_tokens >= 0) { @@ -378,6 +381,7 @@ RequestManager::RequestGuid request.initial_len = 0; request.max_length = request_.max_length; request.max_new_tokens = request_.max_new_tokens; + request.add_special_tokens = request_.add_special_tokens; if (request.max_new_tokens != -1) { std::cerr << "Error: max_new_tokens is not allowed for PEFT finetuning requests" @@ -402,7 +406,8 @@ RequestManager::RequestGuid request.benchmarking_tokens = request_.benchmarking_tokens; std::vector input_tokens; std::vector output_tokens; - bool bos_added = (bos_token_id >= 0 && model_type != ModelType::FALCON); + bool bos_added = (bos_token_id >= 0 && request.add_special_tokens && + model_type != ModelType::FALCON); if (bos_added) { input_tokens.push_back(bos_token_id); } @@ -424,7 +429,8 @@ RequestManager::RequestGuid std::string output_text(""); std::vector input_tokens; input_tokens = 
this->tokenizer_->Encode(text); - if (bos_token_id >= 0 && model_type != ModelType::FALCON) { + if (bos_token_id >= 0 && model_type != ModelType::FALCON && + request.add_special_tokens) { input_tokens.insert(input_tokens.begin(), bos_token_id); } std::vector output_tokens = @@ -557,6 +563,15 @@ BatchConfig RequestManager::prepare_next_batch_task( return rm->prepare_next_batch(*bc, result); } +bool RequestManager::is_eos_token(int token_id) { + for (int eos_token : eos_token_ids) { + if (token_id == eos_token) { + return true; + } + } + return false; +} + bool RequestManager::check_inf_req_completion(BatchConfig const &old_bc, int i) { Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; @@ -564,7 +579,7 @@ bool RequestManager::check_inf_req_completion(BatchConfig const &old_bc, // printf("model_type = %d\n", this->model_type); if (request.tokens.size() >= old_bc.requestsInfo[i].max_length) { request_completed = true; - } else if (request.tokens.back() == eos_token_id) { + } else if (is_eos_token(request.tokens.back())) { // Encounter EOS token id request_completed = true; } @@ -673,6 +688,7 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, // Unlike Huggingface, the sentencepiece C++ library automatically // removes the BOS token if (model_type == ModelType::LLAMA && old_llama_tokenizer && + request.add_special_tokens && request.tokens.at(0) == bos_token_id) { output = " " + output; } @@ -1134,6 +1150,7 @@ BeamSearchBatchConfig // Unlike Huggingface, the sentencepiece C++ library automatically // removes the BOS token if (model_type == ModelType::LLAMA && old_llama_tokenizer && + request.add_special_tokens && request.tokens.at(0) == bos_token_id) { output = " " + output; } @@ -1277,6 +1294,7 @@ BeamSearchBatchConfig // Unlike Huggingface, the sentencepiece C++ library automatically // removes the BOS token if (model_type == ModelType::LLAMA && old_llama_tokenizer && + request.add_special_tokens && request.tokens.at(0) == bos_token_id) { output = " " + output; } @@ -1325,7 +1343,7 @@ BeamSearchBatchConfig // Unlike Huggingface, the sentencepiece C++ library automatically removes // the BOS token if (model_type == ModelType::LLAMA && old_llama_tokenizer && - request.tokens.at(0) == bos_token_id) { + request.add_special_tokens && request.tokens.at(0) == bos_token_id) { output = " " + output; } log_req_mgr.print("Output: %s", output.c_str()); From fc884fec852f1d3e3cadd6da1ce1b8e1b8630252 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 4 Nov 2024 19:24:56 +0000 Subject: [PATCH 38/44] fix inference test --- tests/peft/hf_utils.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/peft/hf_utils.py b/tests/peft/hf_utils.py index 94fb96f029..3760f05055 100644 --- a/tests/peft/hf_utils.py +++ b/tests/peft/hf_utils.py @@ -223,15 +223,15 @@ def save_lora_weights(self, model, pre_finetuning=False): if not pre_finetuning: self.step_count += 1 - def on_step_end( - self, args, state, control, model, tokenizer, optimizer, lr_scheduler, **kwargs - ): - self.save_lora_weights(model, pre_finetuning=False) - - def on_step_begin( - self, args, state, control, model, tokenizer, optimizer, lr_scheduler, **kwargs - ): - self.save_lora_weights(model, pre_finetuning=True) + def on_step_end(self, args, state, control, **kwargs): + model_ = kwargs.get("model", None) + assert model_ is not None + self.save_lora_weights(model_, pre_finetuning=False) + + def on_step_begin(self, args, state, control, **kwargs): + model_ = 
kwargs.get("model", None) + assert model_ is not None + self.save_lora_weights(model_, pre_finetuning=True) def on_train_end(self, args, state, control, **kwargs): if verbose: From 2047bdd26c1cf89854c2666edfcbbaa8a799175a Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Wed, 6 Nov 2024 01:34:09 +0000 Subject: [PATCH 39/44] fix --- tests/fine_grained_alignment_test.sh | 2 +- tests/inference/inference_alignment_test.py | 66 +++++++++++---------- tests/peft/alignment/align_test_utils.py | 4 +- 3 files changed, 39 insertions(+), 33 deletions(-) diff --git a/tests/fine_grained_alignment_test.sh b/tests/fine_grained_alignment_test.sh index 9ad26318f9..0ef1341951 100755 --- a/tests/fine_grained_alignment_test.sh +++ b/tests/fine_grained_alignment_test.sh @@ -11,7 +11,7 @@ CACHE_PATH=${FF_CACHE_PATH:-"~/.cache/flexflow"} NUM_STEPS=${NUM_STEPS:-2} cleanup() { - rm -rf "${CACHE_PATH}"/debug ./fine_grained_alignment_config.json ./inference/output/fine_grained_alignment_test_ff.txt ./inference/output/fine_grained_alignment_test_hf.txt + eval rm -rf "${CACHE_PATH}/debug" ./fine_grained_alignment_config.json ./inference/output/fine_grained_alignment_test_ff.txt ./inference/output/fine_grained_alignment_test_hf.txt } # Cd into directory holding this script diff --git a/tests/inference/inference_alignment_test.py b/tests/inference/inference_alignment_test.py index 6fff4906f7..8dab7ff43b 100644 --- a/tests/inference/inference_alignment_test.py +++ b/tests/inference/inference_alignment_test.py @@ -17,7 +17,7 @@ def check_bwd_pass(self): def check_step(self, step_idx, learning_rate=0.001): raise NotImplementedError() -class LllamaAlignmentTest(AlignmentTest): +class LlamaAlignmentTest(AlignmentTest): def __init__(self, hf_config, tp_degree=1): self.hf_config = hf_config self.num_layers = self.hf_config.num_hidden_layers @@ -168,7 +168,10 @@ def get_ff_tensor(ff_tensor_name, tensor_comparison_idx, hf_shape, tp_type=TPTyp ff_tensor = np.loadtxt(ff_tensor_path, delimiter=',') self.ff_batch_size = ff_tensor.shape[0] - ff_shape = replace_value(ff_shape, self.num_tokens, self.ff_batch_size) + if "lm_head" in ff_tensor_path: + ff_shape = replace_value(ff_shape, 1, self.ff_batch_size) + else: + ff_shape = replace_value(ff_shape, self.num_tokens, self.ff_batch_size) ff_tensors = [load_ff_tensor(ff_tensor_path.replace("shard_0", f"shard_{tp_idx}"), ff_shape) for tp_idx in range(self.tp_degree)] if self.tp_degree > 1: # if replicate, check that they are identical @@ -356,11 +359,14 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) - ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE)[:,:,-1].squeeze() + hf_tensor = hf_tensor.squeeze() + print(hf_tensor.shape, ff_tensor.shape) compare(hf_tensor, ff_tensor, label="LM head input") output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) - ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, 
tp_type=TPType.PARTITION)[:,:,-1].squeeze() + hf_tensor = hf_tensor.squeeze() compare(hf_tensor, ff_tensor, label="LM head output") class OPTAlignmentTest(AlignmentTest): @@ -664,17 +670,17 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance assert torch.allclose(ff_qkv_tensor_out, ff_attn_tensor_in) # Compared scaled qproj - hf_tensor_name = f"layers.{i}.self_attn.scaled_qproj" - input_c = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) - output_c = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) - scaled_qproj_in = get_hf_tensor(hf_tensor_name, input_c) - scaled_qproj_out = get_hf_tensor(hf_tensor_name, output_c) - assert torch.allclose(scaled_qproj_in, scaled_qproj_out) - ff_tensor_name = f"layers.{i}.layers.{i}.self_attn.scaled_qkv_proj" - scaled_qkv_proj0 = load_ff_tensor(os.path.join(ff_fwd_folder, f"{ff_tensor_name}.output_0"), [64*6,3,9]) - scaled_qkv_proj1 = load_ff_tensor(os.path.join(ff_fwd_folder, f"{ff_tensor_name}.output_0").replace("shard_0", "shard_1"), [64*6,3,9]) - ff_scaled_qkv_proj = np.concatenate([scaled_qkv_proj0, scaled_qkv_proj1], axis=0) - ff_scaled_q_proj = torch.from_numpy(ff_scaled_qkv_proj[:, :1, :]).to(scaled_qproj_out.dtype) + # hf_tensor_name = f"layers.{i}.self_attn.scaled_qproj" + # input_c = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + # output_c = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + # scaled_qproj_in = get_hf_tensor(hf_tensor_name, input_c) + # scaled_qproj_out = get_hf_tensor(hf_tensor_name, output_c) + # assert torch.allclose(scaled_qproj_in, scaled_qproj_out) + # ff_tensor_name = f"layers.{i}.layers.{i}.self_attn.scaled_qkv_proj" + # scaled_qkv_proj0 = load_ff_tensor(os.path.join(ff_fwd_folder, f"{ff_tensor_name}.output_0"), [64*6,3,9]) + # scaled_qkv_proj1 = load_ff_tensor(os.path.join(ff_fwd_folder, f"{ff_tensor_name}.output_0").replace("shard_0", "shard_1"), [64*6,3,9]) + # ff_scaled_qkv_proj = np.concatenate([scaled_qkv_proj0, scaled_qkv_proj1], axis=0) + # ff_scaled_q_proj = torch.from_numpy(ff_scaled_qkv_proj[:, :1, :]).to(scaled_qproj_out.dtype) # print("HF scaled qproj:") # print(scaled_qproj_out.squeeze().T) # print("FF scaled q proj:") @@ -688,15 +694,15 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance # check that out_proj input, attn_scores out and input are identical on the hf side - hf_tensor_name = f"layers.{i}.self_attn.attn_scores" - input_c = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) - output_c = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) - attn_scores_in = get_hf_tensor(hf_tensor_name, input_c) - attn_scores_out = get_hf_tensor(hf_tensor_name, output_c) + # hf_tensor_name = f"layers.{i}.self_attn.attn_scores" + # input_c = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + # output_c = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + # attn_scores_in = get_hf_tensor(hf_tensor_name, input_c) + # attn_scores_out = get_hf_tensor(hf_tensor_name, output_c) hf_tensor_name = f"layers.{i}.self_attn.out_proj" - out_proj_in = get_hf_tensor(hf_tensor_name, input_c) - assert torch.allclose(attn_scores_in, 
attn_scores_out) - assert torch.allclose(attn_scores_in, out_proj_in) + # out_proj_in = get_hf_tensor(hf_tensor_name, input_c) + # assert torch.allclose(attn_scores_in, attn_scores_out) + # assert torch.allclose(attn_scores_in, out_proj_in) # Compare out proj input. This should be the output of the attention without any bias involved hf_tensor_name = f"layers.{i}.self_attn.out_proj" @@ -707,12 +713,12 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance print("comparing attention tensor: ", hf_tensor_name, " and ", ff_tensor_name) compare(hf_tensor, ff_tensor, label=f"Attention o-proj {i} input") - hf_tensor_name = f"layers.{i}.self_attn.attn_scores" - ff_tensor_name = f"layers.{i}.layers.{i}.self_attn" - output_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) - hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) - ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) - compare(hf_tensor, ff_tensor, label=f"Attention {i} output") + # hf_tensor_name = f"layers.{i}.self_attn.attn_scores" + # ff_tensor_name = f"layers.{i}.layers.{i}.self_attn" + # output_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + # hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + # ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + # compare(hf_tensor, ff_tensor, label=f"Attention {i} output") # hf_tensor_name = f"layers.{i}.final_layer_norm" # ff_tensor_name = f"layers.{i}.layers.{i}.add_bias_residual_layer_norm" @@ -808,7 +814,7 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance hf_config = AutoConfig.from_pretrained(args.model_name) alignment_class = None if hf_config.architectures[0] == "LlamaForCausalLM": - alignment_class = LllamaAlignmentTest(hf_config, tp_degree=args.tensor_parallelism_degree) + alignment_class = LlamaAlignmentTest(hf_config, tp_degree=args.tensor_parallelism_degree) elif hf_config.architectures[0] == "OPTForCausalLM": alignment_class = OPTAlignmentTest(hf_config, tp_degree=args.tensor_parallelism_degree) diff --git a/tests/peft/alignment/align_test_utils.py b/tests/peft/alignment/align_test_utils.py index 3085bbda56..f5ed8ae65b 100644 --- a/tests/peft/alignment/align_test_utils.py +++ b/tests/peft/alignment/align_test_utils.py @@ -472,9 +472,9 @@ class TensorComparisonIdxs: def replace_value(lst, old_value, new_value): occurrences = lst.count(old_value) if occurrences == 0: - raise ValueError(f"Value {old_value} not found in the list.") + raise ValueError(f"Value {old_value} not found in the list: {lst}") elif occurrences > 1: - warnings.warn(f"Multiple instances of {old_value} found in the list.") + warnings.warn(f"Multiple instances of {old_value} found in the list: {lst}") occurrence_idx=0 for i, value in enumerate(lst): if value == old_value: From 2fd529d48579d0f7165f2965cc538107bcaf951f Mon Sep 17 00:00:00 2001 From: Pinku Surana Date: Fri, 8 Nov 2024 11:50:00 -0500 Subject: [PATCH 40/44] Add support for OFI conduit in GASNet (#1538) GASNet's OFI conduit is used for the Slingshot network on Perlmutter and Frontier. It takes an additional configuration, GASNet_SYSTEM, configured for either slingshot10 or slingshot11. 
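
As a rough usage sketch (the exact entry point is an assumption; the environment
variables are the ones this patch wires through config/config.linux and
config/config.inc), a Slingshot-11 system would be configured with something like:

    FF_LEGION_NETWORKS=gasnet FF_GASNET_CONDUIT=ofi FF_GASNET_SYSTEM=slingshot11 ./config/config.linux

Older Slingshot-10 networks would set FF_GASNET_SYSTEM=slingshot10 instead.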
--- cmake/legion.cmake | 1 + config/config.inc | 7 +++++++ config/config.linux | 3 +++ 3 files changed, 11 insertions(+) diff --git a/cmake/legion.cmake b/cmake/legion.cmake index 2afb507d3b..adcf5618f8 100644 --- a/cmake/legion.cmake +++ b/cmake/legion.cmake @@ -132,6 +132,7 @@ else() set(Legion_EMBED_GASNet_VERSION "GASNet-2022.3.0" CACHE STRING "GASNet version") set(Legion_NETWORKS "gasnetex" CACHE STRING "GASNet conduit") set(GASNet_CONDUIT ${FF_GASNET_CONDUIT}) + set(GASNet_SYSTEM ${FF_GASNET_SYSTEM}) elseif("${FF_LEGION_NETWORKS}" STREQUAL "ucx") set(ucx_ROOT ${UCX_PATH}/lib/cmake) message(STATUS "Find ucx: ${UCX_PATH}") diff --git a/config/config.inc b/config/config.inc index 011fe890fb..b4356bf078 100644 --- a/config/config.inc +++ b/config/config.inc @@ -118,6 +118,13 @@ if [ "$FF_LEGION_NETWORKS" = "gasnet" ]; then SET_LEGION_NETWORKS+=" -DFF_GASNET_CONDUIT=mpi" elif [ "$FF_GASNET_CONDUIT" = "udp" ]; then SET_LEGION_NETWORKS+=" -DFF_GASNET_CONDUIT=udp" + elif [ "$FF_GASNET_CONDUIT" = "ofi" ]; then + SET_LEGION_NETWORKS+=" -DFF_GASNET_CONDUIT=ofi" + if [ "$FF_GASNET_SYSTEM" = "slingshot11" ]; then + SET_LEGION_NETWORKS+=" -DFF_GASNET_SYSTEM=slingshot11" + elif [ "$FF_GASNET_SYSTEM" = "slingshot10" ]; then + SET_LEGION_NETWORKS+=" -DFF_GASNET_SYSTEM=slingshot10" + fi fi elif [ "$FF_LEGION_NETWORKS" = "ucx" ]; then SET_LEGION_NETWORKS+=" -DFF_LEGION_NETWORKS=ucx" diff --git a/config/config.linux b/config/config.linux index 09976cfa03..aae7901494 100755 --- a/config/config.linux +++ b/config/config.linux @@ -61,6 +61,9 @@ FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS:-} # select GASNET conduit FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT:-ibv} +# select GASNET system (usually with OFI conduit) +FF_GASNET_SYSTEM=${FF_GASNET_SYSTEM:-slingshot11} + # set UCX dir if Legion networks is set to ucx UCX_DIR=${UCX_DIR:-""} From 1bef1a311d9d056acd207b26e0541a91d26125f7 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 18 Nov 2024 20:00:41 +0000 Subject: [PATCH 41/44] update --- docker/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/run.sh b/docker/run.sh index cdf9383052..46c63bab6f 100755 --- a/docker/run.sh +++ b/docker/run.sh @@ -120,4 +120,4 @@ if [ -f "$hf_token_path" ]; then hf_token_volume+="-v $hf_token_path:/root/.cache/huggingface/token" fi -eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "${hf_token_volume}" "${image}-${FF_GPU_BACKEND}${gpu_backend_version}:latest" +eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "--cap-add=SYS_PTRACE" "${hf_token_volume}" "${image}-${FF_GPU_BACKEND}${gpu_backend_version}:latest" From 7dcbd62f98061ea25938ecd8b4d13fbd3b8e638c Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Mon, 18 Nov 2024 15:12:37 -0500 Subject: [PATCH 42/44] FlexLLM server demo (#1510) * init * update * update * update * update * add max new tokens parameter * backup * update * backup * lora configs serialize / deserialize into single file * backup * . * . * . * . 
* frontend * bug fix * fixes * fix * updates * fix * fix * fix * small fix * fix * fix reset input grad for non-activated loras * fix * update * demo fixes & readme * load weights in parallel * cleanup * cleanup * load weights faster in inference test * fix * cleanup and fixes * linting * fix * cleanup * docker run update --- docker/flexflow-environment/Dockerfile | 3 +- docker/run.sh | 13 +- include/flexflow/batch_config.h | 6 +- include/flexflow/config.h | 4 - include/flexflow/fftype.h | 1 + include/flexflow/flexflow_c.h | 11 +- include/flexflow/model.h | 11 +- include/flexflow/operator.h | 2 +- include/flexflow/ops/kernels/linear_kernels.h | 2 + .../ops/kernels/lora_linear_kernels.h | 38 +- include/flexflow/ops/lora_linear.h | 19 +- include/flexflow/ops/lora_linear_params.h | 51 +- include/flexflow/request_manager.h | 14 + include/flexflow/utils/file_loader.h | 23 +- .../flexflow/utils/peft_weight_allocator.h | 163 ++-- inference/models/falcon.cc | 8 + inference/models/llama.cc | 11 +- inference/models/mpt.cc | 8 + inference/models/opt.cc | 10 +- inference/models/starcoder.cc | 7 + inference/peft/peft.cc | 17 +- inference/peft/peft_bwd_benchmark.cc | 8 +- inference/peft/peft_fwd_benchmark.cc | 8 +- inference/peft/req_rate_benchmark.cc | 6 +- inference/python/chat.py | 23 +- inference/python/ff_peft.py | 51 +- inference/python/incr_decoding.py | 1 - inference/python/peft_demo/INSTRUCTIONS.md | 2 +- inference/python/peft_demo/demo.ipynb | 6 +- inference/python/peft_demo/demo.py | 5 +- inference/python/spec_infer.py | 1 - inference/python/streamlit/README.md | 18 + inference/python/streamlit/app.py | 188 +++++ inference/python/streamlit/fastapi_incr.py | 207 +++++ inference/utils/download_peft_model.py | 32 +- python/flexflow/core/__init__.py | 1 - python/flexflow/core/flexflow_cffi.py | 18 +- python/flexflow/serve/__init__.py | 9 - python/flexflow/serve/models/falcon.py | 4 + python/flexflow/serve/models/llama.py | 4 + python/flexflow/serve/models/mpt.py | 4 + python/flexflow/serve/models/opt.py | 4 + python/flexflow/serve/models/starcoder.py | 4 + python/flexflow/serve/serve.py | 424 ++++++----- src/c/flexflow_c.cc | 46 +- src/mapper/mapper.cc | 4 + src/ops/fused.cu | 3 + src/ops/kernels/linear_kernels.cu | 45 ++ src/ops/kernels/lora_linear_kernels.cu | 291 +++---- src/ops/linear.cc | 1 + src/ops/lora_linear.cc | 719 +++++------------- src/ops/lora_linear_params.cc | 147 +++- src/runtime/fftype.cc | 4 + src/runtime/file_loader.cc | 68 +- src/runtime/inference_manager.cc | 4 +- src/runtime/model.cc | 90 +-- src/runtime/model.cu | 31 +- src/runtime/peft_weight_allocator.cc | 319 ++++++++ src/runtime/peft_weight_allocator.cu | 80 ++ src/runtime/request_manager.cc | 121 ++- .../inference/huggingface_inference_simple.py | 51 ++ tests/inference/huggingface_pipeline.py | 33 + tests/inference/inference_alignment_test.py | 2 +- .../python_test_configs/generate_configs.py | 12 +- tests/peft/alignment/align_test_utils.py | 2 +- tests/peft/hf_finetune.py | 2 +- tests/peft/peft_alignment_test.py | 73 +- tests/peft_test.sh | 12 +- 68 files changed, 2326 insertions(+), 1284 deletions(-) create mode 100644 inference/python/streamlit/README.md create mode 100644 inference/python/streamlit/app.py create mode 100644 inference/python/streamlit/fastapi_incr.py create mode 100644 src/runtime/peft_weight_allocator.cc create mode 100644 src/runtime/peft_weight_allocator.cu create mode 100644 tests/inference/huggingface_inference_simple.py create mode 100644 tests/inference/huggingface_pipeline.py diff --git 
a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index d571befdda..2af81de11f 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -7,7 +7,7 @@ LABEL org.opencontainers.image.description="FlexFlow environment container" SHELL ["/bin/bash", "-c"] # Install basic dependencies -RUN apt-get update && apt-get install -y --no-install-recommends wget sudo binutils git zlib1g-dev lsb-release nano gdb libhdf5-dev jq && \ +RUN apt-get update && apt-get install -y --no-install-recommends wget sudo binutils git zlib1g-dev lsb-release nano gdb libhdf5-dev jq openssh-client && \ rm -rf /var/lib/apt/lists/* /etc/apt/sources.list.d/cuda.list /etc/apt/sources.list.d/nvidia-ml.list && \ apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends software-properties-common && \ apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends build-essential apt-utils \ @@ -125,6 +125,7 @@ RUN pip3 install transformers>=4.31.0 sentencepiece einops RUN pip3 install tensorflow notebook # PEFT-related RUN pip3 install scipy bitsandbytes datasets accelerate loralib triton peft +RUN pip3 install streamlit # Install Rust RUN curl https://sh.rustup.rs -sSf | sh -s -- -y diff --git a/docker/run.sh b/docker/run.sh index 46c63bab6f..759da521aa 100755 --- a/docker/run.sh +++ b/docker/run.sh @@ -17,6 +17,11 @@ hip_version=${hip_version:-"empty"} ATTACH_GPUS=${ATTACH_GPUS:-true} gpu_arg="" if $ATTACH_GPUS ; then gpu_arg="--gpus all" ; fi +FORWARD_STREAMLIT_PORT=${FORWARD_STREAMLIT_PORT:-true} +port_forward_arg="" +if $FORWARD_STREAMLIT_PORT ; then + port_forward_arg+="-p 8501:8501" +fi # Amount of shared memory to give the Docker container access to @@ -120,4 +125,10 @@ if [ -f "$hf_token_path" ]; then hf_token_volume+="-v $hf_token_path:/root/.cache/huggingface/token" fi -eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "--cap-add=SYS_PTRACE" "${hf_token_volume}" "${image}-${FF_GPU_BACKEND}${gpu_backend_version}:latest" +ssh_key_volume="" +ssh_key_path="$HOME/.ssh/id_rsa" +if [ -f "$ssh_key_path" ]; then + # If the token exists, add the volume mount to the Docker command + ssh_key_volume+="-v $ssh_key_path:/root/.ssh/id_rsa" +fi +eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "--cap-add=SYS_PTRACE" "${ssh_key_volume}" "${hf_token_volume}" "${port_forward_arg}" "${image}-${FF_GPU_BACKEND}${gpu_backend_version}:latest" diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index a509af765c..bb8b4c67f6 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -20,6 +20,7 @@ #include "legion.h" #include #include +#include // #define MAX_SEQ_LEN 1024 // #define BATCH_SIZE 2 @@ -74,6 +75,7 @@ class BatchConfig { static int const MAX_NUM_REQUESTS = 65; static int const MAX_NUM_TOKENS = 1024; static int const MAX_SPEC_TREE_TOKEN_NUM = 64; + static int const MAX_PEFT_CONFIG_SIZE = 1024; // Set by update @@ -89,11 +91,12 @@ class BatchConfig { num_tokens_in_batch = 0; max_length = 0; request_guid = 0; + peft_model_id = PEFTModelID::NO_ID; prompt_phase = false; batch_config_request_id = -1; - peft_model_id = PEFTModelID::NO_ID; peft_bwd = false; optimizer_tasks = {true, false, false, false}; + std::memset(peft_model_config_str, 0, MAX_PEFT_CONFIG_SIZE); } int first_token_depth_in_request; int first_token_offset_in_batch; @@ -106,6 +109,7 @@ class BatchConfig { RequestGuid request_guid; // PEFT fields PEFTModelID peft_model_id; + 
char peft_model_config_str[MAX_PEFT_CONFIG_SIZE]; bool peft_bwd; OptimizerTasks optimizer_tasks; }; diff --git a/include/flexflow/config.h b/include/flexflow/config.h index dd9d657117..37afa0df27 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -104,8 +104,6 @@ struct FFHandler { // PEFT related fields MemoryAllocator *peft_activation_allocator; size_t peft_activation_reserve_space_size; - PEFTWeightAllocator *peft_weight_allocator; - size_t peft_weight_reserve_space_size; // Quantization fields DataType quantization_type; bool allowTensorOpMathConversion; @@ -118,7 +116,6 @@ struct FFInitInfo { size_t workSpaceSize; size_t offload_reserve_space_size; size_t peft_activation_reserve_space_size; - size_t peft_weight_reserve_space_size; DataType quantization_type; bool allowTensorOpMathConversion; // int myRank, allRanks; @@ -179,7 +176,6 @@ class FFConfig { // PEFT related fields bool enable_peft; size_t peft_activation_reserve_space_size; - size_t peft_weight_reserve_space_size; // Control parallelizable dimensions bool only_data_parallel; bool enable_sample_parallel; diff --git a/include/flexflow/fftype.h b/include/flexflow/fftype.h index 3e482b8d67..ebc811c262 100644 --- a/include/flexflow/fftype.h +++ b/include/flexflow/fftype.h @@ -27,6 +27,7 @@ class PEFTModelID { PEFTModelID(size_t id); bool is_valid_id() const; friend bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs); + friend bool operator!=(PEFTModelID const &lhs, PEFTModelID const &rhs); friend std::ostream &operator<<(std::ostream &os, PEFTModelID const &peft_model_id); diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 6501b0658c..677f9915cd 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -91,6 +91,8 @@ int flexflow_config_get_tensor_parallelism_degree(flexflow_config_t handle_); int flexflow_config_get_pipeline_parallelism_degree(flexflow_config_t handle_); +bool flexflow_config_get_enable_peft(flexflow_config_t handle_); + void flexflow_config_set_data_parallelism_degree(flexflow_config_t handle_, int value); @@ -622,7 +624,11 @@ flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, bool beam_search, char const *name); -flexflow_peft_model_id_t flexflow_model_add_lora_layer( +void flexflow_model_add_lora_layers(flexflow_model_t handle_, + int num_target_modules, + char const **target_modules_); + +flexflow_peft_model_id_t flexflow_model_register_peft_adapter( flexflow_model_t handle_, const flexflow_lora_linear_config_t peft_config_); void flexflow_model_set_sgd_optimizer(flexflow_model_t handle, @@ -1023,6 +1029,9 @@ void flexflow_request_manager_set_max_sequence_length( int flexflow_request_manager_get_max_sequence_length( flexflow_request_manager_t handle_); +void flexflow_request_manager_set_max_concurrent_adapters( + flexflow_request_manager_t handle_, int max_concurrent_adapters); + void flexflow_request_manager_set_enable_peft_finetuning( flexflow_request_manager_t handle_, bool enable_peft_finetuning_); diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 51b7950db8..e352159af0 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -278,6 +278,7 @@ enum TaskIDs { RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID, RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID, RM_BACKGROUND_SERVING_TASK_ID, + LOAD_WEIGHT_TASK_ID, // Custom tasks CUSTOM_GPU_TASK_ID_FIRST, CUSTOM_GPU_TASK_ID_1, @@ -835,7 +836,9 @@ class FFModel { // ======================================== // PEFT Layers // 
======================================== - PEFTModelID *add_lora_layer(LoraLinearConfig const peft_config); + // PEFTModelID *add_lora_layer(LoraLinearConfig const peft_config); + void add_lora_layers(std::vector target_modules); + PEFTModelID *register_peft_adapter(LoraLinearConfig const &peft_config); // ======================================== // Inference APIs // ======================================== @@ -1170,9 +1173,9 @@ class FFModel { std::vector parameters; // PEFT related std::unordered_map base_layer_to_peft_layer; - std::unordered_map> peft_layer_to_peft_id; - std::unordered_map peft_configs; - // std::vector peft_operators; + // std::unordered_map> + // peft_layer_to_peft_id; std::unordered_map + // peft_configs; std::vector peft_operators; FFHandler handlers[MAX_NUM_WORKERS]; Legion::Future current_metrics; diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 007314797a..c108740ef3 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -280,7 +280,7 @@ class Op { // get operator name and print it std::string op_name_without_uid = get_op_name_without_uid(m); std::cout << (fwd_pass ? "INF " : "BWD ") << op_name_without_uid - << std::endl; + << (before_kernel ? " (before kernel)" : "") << std::endl; // build the path to save the tensor fs::path dst_filepath; if (fwd_pass) { diff --git a/include/flexflow/ops/kernels/linear_kernels.h b/include/flexflow/ops/kernels/linear_kernels.h index 90e50a0c9a..aaa845db23 100644 --- a/include/flexflow/ops/kernels/linear_kernels.h +++ b/include/flexflow/ops/kernels/linear_kernels.h @@ -61,6 +61,7 @@ void inference_kernel_wrapper(LinearMeta *m, int out_dim, int batch_size); void peft_bwd_kernel_wrapper(LinearMeta const *m, + BatchConfig const *bc, void *input_grad_ptr, void *output_grad_ptr, void const *kernel_ptr, @@ -94,6 +95,7 @@ void forward_kernel(LinearMeta const *m, ffStream_t stream); template void peft_bwd_kernel(LinearMeta const *m, + BatchConfig const *bc, void *input_grad_ptr, void *output_grad_ptr, void const *kernel_ptr, diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h index eee9875d30..fd86dc68c0 100644 --- a/include/flexflow/ops/kernels/lora_linear_kernels.h +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -6,43 +6,27 @@ #include "flexflow/fftype.h" #include "flexflow/op_meta.h" #include "flexflow/ops/lora_linear.h" +#include "flexflow/utils/peft_weight_allocator.h" namespace FlexFlow { + using Legion::Context; using Legion::Runtime; -struct LoraLinearWeight { - // weights - void *w0_ptr, *w1_ptr; - // gradients - void *w0_grad_ptr, *w1_grad_ptr; - // v values for SGD optimizer (when using momentum) - void *w0_v_values_ptr, *w1_v_values_ptr; - int in_dim, out_dim, rank, num_shards; -}; - -struct LoraLinearModelState { - LoraLinearWeight weights; - LoraOptimizerConfig const *optimizer_config; - float lora_alpha; - std::string cache_folder; - // Huggingface model ID (for download and/or upload) - std::string peft_model_id; -}; class LoraLinearMeta : public OpMeta { public: LoraLinearMeta(FFHandler handle, LoraLinear const *li); ~LoraLinearMeta(void); - // PEFT related fields - void *low_rank_activation; - void *input_activation; - std::unordered_map model_state; - size_t allocated_peft_buffer_size1 = 0, allocated_peft_buffer_size2 = 0; + PEFTMemoryManager *peft_memory_manager; }; namespace Kernels { namespace LoraLinear { -void init_kernel_wrapper(LoraLinearMeta *m, int seed); + +bool 
lora_applies_to_this_layer(LoraLinearMeta *m, + LoraLinearConfig const &config); + +// void init_kernel_wrapper(LoraLinearMeta *m, int seed); void inference_kernel_wrapper(LoraLinearMeta *m, BatchConfig const *bc, GenericTensorAccessorR const &input, @@ -51,12 +35,13 @@ void peft_bwd_kernel_wrapper(Context ctx, Runtime *runtime, LoraLinearMeta *m, BatchConfig const *bc, + int shard_id, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output_grad); namespace Internal { -template -void init_kernel(LoraLinearMeta *m, int seed, ffStream_t stream); +// template +// void init_kernel(LoraLinearMeta *m, int seed, ffStream_t stream); template void inference_kernel(LoraLinearMeta *m, BatchConfig const *bc, @@ -70,6 +55,7 @@ void peft_bwd_kernel(Context ctx, Runtime *runtime, LoraLinearMeta *m, BatchConfig const *bc, + int shard_id, DT *input_grad_ptr, DT const *output_grad_ptr, int in_dim, diff --git a/include/flexflow/ops/lora_linear.h b/include/flexflow/ops/lora_linear.h index 9e83c3f90e..cc625cafc2 100644 --- a/include/flexflow/ops/lora_linear.h +++ b/include/flexflow/ops/lora_linear.h @@ -17,14 +17,13 @@ class LoraLinear : public Op { using Params = LoraLinearParams; using Input = std::pair; - LoraLinear( - FFModel &model, - LayerID const &layer_guid, - OperatorType type, - ParallelTensor const input, - ParallelTensor const output, - std::unordered_map const &_peft_configs, - char const *name = nullptr); + LoraLinear(FFModel &model, + LayerID const &layer_guid, + ParallelTensor const input, + ParallelTensor const output, + int max_rank, + int max_concurrent_adapters, + char const *name = nullptr); LoraLinear(FFModel &model, LoraLinear const &other, ParallelTensor const input, @@ -91,7 +90,9 @@ class LoraLinear : public Op { // size_t get_params_hash() const override; LoraLinearParams get_params() const; - std::unordered_map peft_configs; + // std::unordered_map peft_configs; + int max_rank; + int max_concurrent_adapters; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h index 70539271f2..46b88c9690 100644 --- a/include/flexflow/ops/lora_linear_params.h +++ b/include/flexflow/ops/lora_linear_params.h @@ -17,6 +17,9 @@ namespace FlexFlow { class LoraOptimizerConfig { public: LoraOptimizerConfig(); + virtual std::string getType() const = 0; + virtual nlohmann::json toJson() const = 0; + static LoraOptimizerConfig *fromJson(nlohmann::json const &j); virtual ~LoraOptimizerConfig() {} }; @@ -29,9 +32,11 @@ class LoraSGDOptimizerConfig : public LoraOptimizerConfig { bool weight_decay_ = 0.0f); friend std::ostream &operator<<(std::ostream &os, LoraSGDOptimizerConfig const &llc); - - NLOHMANN_DEFINE_TYPE_INTRUSIVE( - LoraSGDOptimizerConfig, lr, momentum, nesterov, weight_decay) + std::string getType() const override { + return "SGD"; + } + nlohmann::json toJson() const override; + static LoraSGDOptimizerConfig *fromJson(nlohmann::json const &j); public: double lr = 0.001f; @@ -51,8 +56,11 @@ class LoraAdamOptimizerConfig : public LoraOptimizerConfig { friend std::ostream &operator<<(std::ostream &os, LoraAdamOptimizerConfig const &llc); - NLOHMANN_DEFINE_TYPE_INTRUSIVE( - LoraAdamOptimizerConfig, alpha, beta1, beta2, weight_decay, epsilon) + std::string getType() const override { + return "Adam"; + } + nlohmann::json toJson() const override; + static LoraAdamOptimizerConfig *fromJson(nlohmann::json const &j); public: // Adam @@ -63,14 +71,6 @@ class LoraAdamOptimizerConfig : public LoraOptimizerConfig { 
double epsilon = 1e-8; }; -// Serialization helpers -template -void serialize_to_json_file(T const &obj, fs::path const &filepath); - -// Function to deserialize JSON from file and create object -template -std::unique_ptr deserialize_from_json_file(fs::path const &filepath); - class LoraLinearConfig { public: static const LoraLinearConfig EmptyConfig; @@ -92,17 +92,14 @@ class LoraLinearConfig { friend std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc); - NLOHMANN_DEFINE_TYPE_INTRUSIVE(LoraLinearConfig, - cache_folder, - peft_model_id, - rank, - lora_alpha, - lora_dropout, - target_modules, - trainable, - init_lora_weights, - base_model_name_or_path, - precision) + std::string serialize_to_json_string(int indent = -1) const; + void serialize_to_json_file(std::string const &filename) const; + // Deserialization method + static LoraLinearConfig + deserialize_from_json_string(std::string const &json_string); + // Deserialization method + static LoraLinearConfig + deserialize_from_json_file(std::string const &filename); std::string cache_folder; // Huggingface model ID (for download and/or upload) @@ -128,8 +125,8 @@ class LoraLinearConfig { class LoraLinearParams { public: LayerID layer_guid; - OperatorType type; - std::unordered_map peft_configs; + int max_rank; + int max_concurrent_adapters; char name[MAX_OPNAME]; bool is_valid(std::pair const @@ -147,4 +144,4 @@ struct hash { }; } // namespace std -#endif // _FLEXFLOW_LORA_LINEAR_PARAMS_H +#endif // _FLEXFLOW_LORA_LINEAR_PARAMS_H \ No newline at end of file diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index d62b610f3d..c15c0ff8b4 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -150,6 +150,13 @@ class RequestManager { std::vector eos_token_ids, std::string const &path); void register_output_filepath(std::string const &); + void set_peft_config(PEFTModelID const &peft_model_id, + LoraLinearConfig const &peft_config); + LoraLinearConfig const &get_peft_config(PEFTModelID const &peft_model_id); + void set_max_lora_rank(int max_lora_rank); + void set_max_concurrent_adapters(int max_concurrent_adapters); + int get_max_lora_rank(); + int get_max_concurrent_adapters(); void initBitMask(BatchConfig::BitMask &bitmask, int initLength); void appendPendingRequest(BatchConfig::BitMask &bitmask, int initLength); void appendBitMask(BatchConfig::BitMask &bitmask, @@ -182,6 +189,9 @@ class RequestManager { bool is_eos_token(int token_id); bool check_inf_req_completion(BatchConfig const &old_bc, int i); void check_batch(BatchConfig const &old_bc, BatchConfig const &new_bc); + void add_peft_config_to_request_info(BatchConfig &bc, + int req_idx, + LoraLinearConfig const &peft_config); BatchConfig prepare_next_batch(BatchConfig const &bc, InferenceResult const &result); BatchConfigFuture prepare_next_batch(BatchConfigFuture const &bc, @@ -291,6 +301,10 @@ class RequestManager { int max_sequence_length; Status request_manager_status; + // peft + std::unordered_map peft_configs; + int max_lora_rank = 32; + int max_concurrent_adapters = 0; // peft benchmarking bool enable_peft_finetuning = false; static bool inference_finished; diff --git a/include/flexflow/utils/file_loader.h b/include/flexflow/utils/file_loader.h index 646eb18da2..8735f23571 100644 --- a/include/flexflow/utils/file_loader.h +++ b/include/flexflow/utils/file_loader.h @@ -39,7 +39,13 @@ class FileDataLoader { void load_single_weight_tensor(FFModel *ff, Layer *l, int weight_idx); void 
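Editor's note: `LoraLinearConfig` now carries its own JSON round-trip (`serialize_to_json_string` / `deserialize_from_json_string`), which is what lets a per-request adapter config travel inside `BatchConfig::peft_model_config_str`. A hedged sketch of what such a payload could look like, using only the member names visible in the class above; the exact layout emitted by `serialize_to_json_string` is an assumption, and the sample values are the ones used elsewhere in this patch.

```
import json

# Illustrative only: keys mirror the LoraLinearConfig members listed above
# (cache_folder, peft_model_id, rank, lora_alpha, ...). The exact JSON layout
# produced by LoraLinearConfig::serialize_to_json_string is an assumption.
lora_config_payload = {
    "cache_folder": "~/.cache/flexflow",
    "peft_model_id": "goliaro/llama-2-7b-lora-full",
    "rank": 16,
    "lora_alpha": 16.0,
    "lora_dropout": 0.0,
    "target_modules": ["down_proj"],
    "trainable": False,
    "init_lora_weights": False,
    "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
    "precision": "fp16",
}

serialized = json.dumps(lora_config_payload)
# BatchConfig::MAX_PEFT_CONFIG_SIZE is 1024 bytes, so the string must fit.
assert len(serialized) < 1024
print(serialized)
```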
load_quantization_weight(FFModel *ff, Layer *l, int weight_idx); - void load_weights(FFModel *ff); + + static void + load_weight_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + void load_weights_parallel(FFModel *ff, Context ctx, Runtime *runtime); void load_positions(FFModel *ff, Tensor pt, @@ -54,3 +60,18 @@ class FileDataLoader { std::string weights_folder; bool use_full_precision; }; + +struct WeightLoadTaskArgs { + FFModel *ff; + FileDataLoader *loader; + Layer *layer; + int weight_idx; + DataType data_type; + WeightLoadTaskArgs(FFModel *_ff, + FileDataLoader *_loader, + Layer *_l, + int _idx, + DataType _data_type) + : ff(_ff), loader(_loader), layer(_l), weight_idx(_idx), + data_type(_data_type) {} +}; diff --git a/include/flexflow/utils/peft_weight_allocator.h b/include/flexflow/utils/peft_weight_allocator.h index dae46a8af1..21ac9bf426 100644 --- a/include/flexflow/utils/peft_weight_allocator.h +++ b/include/flexflow/utils/peft_weight_allocator.h @@ -17,76 +17,121 @@ #define _FLEXFLOW_UTILS_PEFT_WEIGHT_ALLOCATOR_H_ #include "flexflow/config.h" -#include +#include "flexflow/ffconst_utils.h" +#include "flexflow/ops/lora_linear_params.h" +// #include namespace FlexFlow { -class PEFTWeightAllocator { -public: - PEFTWeightAllocator(void *_base_ptr, size_t _total_size) - : base_ptr(_base_ptr), total_size(_total_size), sync_offset(0), - local_offset(_total_size) {} +struct LoraLinearWeight { + // weights + void *w0_ptr, *w1_ptr; + // gradients + void *w0_grad_ptr, *w1_grad_ptr; + // activations + void *input_activation; + void *low_rank_activation; + // v values for SGD optimizer (when using momentum) + void *w0_v_values_ptr, *w1_v_values_ptr; + LoraLinearWeight(void *w0 = nullptr, + void *w1 = nullptr, + void *w0_grad = nullptr, + void *w1_grad = nullptr, + void *w0_v_values = nullptr, + void *w1_v_values = nullptr, + void *low_rank_activation_ = nullptr, + void *input_activation_ = nullptr) + : w0_ptr(w0), w1_ptr(w1), w0_grad_ptr(w0_grad), w1_grad_ptr(w1_grad), + w0_v_values_ptr(w0_v_values), w1_v_values_ptr(w1_v_values), + low_rank_activation(low_rank_activation_), + input_activation(input_activation_) {} +}; - inline void *allocate_sync_weights_untyped(PEFTModelID const &peft_model_id, - size_t datalen) { - const std::lock_guard lock(peft_weight_allocator_mutex); - void *ptr = static_cast(base_ptr) + sync_offset; - off_t model_sync_weights_offset = sync_offset; - size_t model_sync_weights_size = datalen; - if (sync_weights.find(peft_model_id) != sync_weights.end()) { - // Assert that sync weights for each PEFT model is consecutive - std::pair offset_and_size = sync_weights[peft_model_id]; - assert(sync_offset == offset_and_size.first + offset_and_size.second); - model_sync_weights_offset = offset_and_size.first; - model_sync_weights_size = offset_and_size.second + datalen; - } - sync_offset += datalen; - assert(sync_offset < local_offset); - sync_weights[peft_model_id] = - std::make_pair(model_sync_weights_offset, model_sync_weights_size); - return ptr; - } +void init_peft_weight_wrapper(LoraLinearWeight const &weight, + int in_dim, + int out_dim, + int rank, + DataType dt, + int seed); - std::pair - get_sync_weights_ptr_and_size(PEFTModelID const &peft_model_id) { - const std::lock_guard lock(peft_weight_allocator_mutex); - assert(sync_weights.find(peft_model_id) != sync_weights.end()); - std::pair offset_and_size = sync_weights[peft_model_id]; - return std::make_pair(static_cast(base_ptr) + offset_and_size.first, - 
offset_and_size.second); +class PEFTMemoryManager { +public: + PEFTMemoryManager(Legion::Memory gpu_mem_, + int max_rank_, + int max_concurrent_adapters_, + int max_peft_tokens_, + int in_dim_, + int out_dim_, + int num_shards_, + int shard_id_, + std::string const &lora_layername_substr_, + DataType dt_) + : gpu_mem(gpu_mem_), max_concurrent_adapters(max_concurrent_adapters_), + max_rank(max_rank_), in_dim(in_dim_), out_dim(out_dim_), + num_shards(num_shards_), shard_id(shard_id_), + max_peft_tokens(max_peft_tokens_), + lora_layername_substr(lora_layername_substr_), dt(dt_), + base_ptr(nullptr), finetuning_ptr(nullptr), + finetuning_model_id(PEFTModelID::NO_ID) { + max_lora_size = + data_type_size(dt) * (max_rank * in_dim + max_rank * out_dim); + assert(max_concurrent_adapters > 0 && + "PEFT Memory Manager max_concurrent_adapters must be > 0"); + assert(max_lora_size > 0 && + "PEFT Memory Manager max_lora_size must be > 0"); + allocate_inference_memory(); + // finetuning memory is allocated upon the first finetuning request, so we + // can skip for inference-only workloads } - inline void *allocate_local_weights_untyped(PEFTModelID const &peft_model_id, - size_t datalen) { - const std::lock_guard lock(peft_weight_allocator_mutex); - local_offset -= datalen; - assert(sync_offset < local_offset); - void *ptr = static_cast(base_ptr) + local_offset; - return ptr; - } + // allocate memory for all the PEFT adapters for a given layer on a given + // shard + void allocate_inference_memory(); + // allocate memory for the PEFT adapter for a finetuning request for a given + // layer and shard + void allocate_finetuning_memory(); - template - inline DT *allocate_sync_weights(PEFTModelID const &peft_model_id, - size_t count) { - return static_cast
( - allocate_sync_weights_untyped(peft_model_id, sizeof(DT) * count)); - } + LoraLinearWeight get_peft(PEFTModelID const &model_id, + LoraLinearConfig const &lora_config); + void check_ft_model_id(PEFTModelID const &model_id); - template - inline DT *allocate_local_weights(PEFTModelID const &peft_model_id, - size_t count) { - return static_cast
( - allocate_local_weights_untyped(peft_model_id, sizeof(DT) * count)); - } +private: + // Check if the PEFT adapter for the given model is in memory. If not, sets + // the cache_miss flag to true. If this is the first finetuning request, + // allocate memory for the finetuning adapter. + void get_finetuning_slot(PEFTModelID const &model_id, bool *cache_miss); + // Returns the slot in memory where the peft model weights are/will be stored. + // If the model is not in memory (cache miss), set the cache_miss flag to + // true. + int get_inference_peft_slot(PEFTModelID const &model_id, bool *cache_miss); + void load_peft_model(LoraLinearWeight &weight, + LoraLinearConfig const &lora_config); + LoraLinearWeight get_inference_peft(PEFTModelID const &model_id, + LoraLinearConfig const &lora_config); + LoraLinearWeight get_finetuning_peft(PEFTModelID const &model_id, + LoraLinearConfig const &lora_config); -public: - void *base_ptr; - size_t total_size; - off_t sync_offset, local_offset; - std::unordered_map> sync_weights; - std::mutex peft_weight_allocator_mutex; + // Legion memory management apparatus + Legion::Memory gpu_mem; + Realm::RegionInstance peftLegionInst; + void *base_ptr, *finetuning_ptr; + // Size and shapes + int max_concurrent_adapters; + int max_rank; + int max_lora_size; + int in_dim, out_dim, num_shards, shard_id; + int max_peft_tokens; + // LRU cache apparatus + std::unordered_map lru_hashtable; + std::vector + lru_list; // head = least recently used, tail=most recently used + std::unordered_map peft2mem_slot; + // Miscellanea + std::string lora_layername_substr; + DataType dt; + PEFTModelID finetuning_model_id; }; -}; // namespace FlexFlow +} // namespace FlexFlow #endif // _FLEXFLOW_UTILS_PEFT_WEIGHT_ALLOCATOR_H_ diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index fd4da87b99..b4f961b006 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -269,6 +269,14 @@ void FALCON::create_falcon_model(FFModel &ff, output = ff.argmax(lm_head, /*beam_Search*/ false); } + // If PEFT is enabled, add LoRA layers + if (ff.config.enable_peft) { + // todo: add attention projections + std::vector target_modules = {"dense_h_to_4h", + "dense_4h_to_h"}; + ff.add_lora_layers(target_modules); + } + FileDataLoader *fileloader = new FileDataLoader("", weight_file_path, diff --git a/inference/models/llama.cc b/inference/models/llama.cc index bd5243bd4b..7b4a14b472 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -250,9 +250,6 @@ void LLAMA::create_llama_model(FFModel &ff, REG_MODE_NONE, 0.0f, std::string("layers." + std::to_string(i) + ".mlp.down_proj").c_str()); - // Low-Rank Adapter (LoRA) for the second linear layer - // ff.lora_linear(std::string("down_proj"), std::string("layers." 
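Editor's note: the `PEFTMemoryManager` introduced in `peft_weight_allocator.h` above replaces the old offset-based `PEFTWeightAllocator`: it pre-allocates `max_concurrent_adapters` fixed-size slots per LoRA layer and shard, and on a cache miss it reuses a free slot or evicts the least recently used adapter. A minimal Python sketch of that slot/LRU bookkeeping follows; it is illustrative only, the real logic lives in the C++ class above and the names here are hypothetical.

```
from collections import OrderedDict

class SlotLRU:
    """Illustrative sketch of the bookkeeping described by PEFTMemoryManager:
    a fixed pool of adapter slots, reused in least-recently-used order.
    Not the actual C++ implementation."""

    def __init__(self, max_concurrent_adapters: int):
        self.free_slots = list(range(max_concurrent_adapters))
        self.model_to_slot = OrderedDict()  # model_id -> slot, in LRU order

    def get_slot(self, model_id):
        """Return (slot, cache_miss). On a miss, take a free slot or evict the
        least recently used adapter; the caller then reloads the LoRA weights."""
        if model_id in self.model_to_slot:
            self.model_to_slot.move_to_end(model_id)  # mark most recently used
            return self.model_to_slot[model_id], False
        if self.free_slots:
            slot = self.free_slots.pop()
        else:
            _, slot = self.model_to_slot.popitem(last=False)  # evict LRU entry
        self.model_to_slot[model_id] = slot
        return slot, True

pool = SlotLRU(max_concurrent_adapters=2)
print(pool.get_slot("adapter-a"))  # cache miss: weights must be loaded
print(pool.get_slot("adapter-a"))  # hit: slot is reused as-is
```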
+ - // std::to_string(i) + ".mlp.down_proj.lora").c_str()); } // final normalization and linear Tensor final_rms_norm_output[2] = {nullptr, nullptr}; @@ -297,6 +294,14 @@ void LLAMA::create_llama_model(FFModel &ff, } } + // If PEFT is enabled, add LoRA layers + if (ff.config.enable_peft) { + // todo: add attention projections + std::vector target_modules = { + "gate_proj", "up_proj", "down_proj"}; + ff.add_lora_layers(target_modules); + } + FileDataLoader *fileloader = new FileDataLoader( "", weight_file_path, diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index d02c0f3b82..6807266ef4 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -272,6 +272,14 @@ void MPT::create_mpt_model(FFModel &ff, } else { output = ff.argmax(lm_head, /*beam_Search*/ false); } + + // If PEFT is enabled, add LoRA layers + if (ff.config.enable_peft) { + // todo: add attention projections + std::vector target_modules = {"up_proj", "down_proj"}; + ff.add_lora_layers(target_modules); + } + FileDataLoader *fileloader = new FileDataLoader("", weight_file_path, diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 34a6bb0f02..cb3d5290cf 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -243,9 +243,6 @@ void OPT::create_opt_model(FFModel &ff, REG_MODE_NONE, 0.0f, std::string("layers." + std::to_string(i) + ".fc2").c_str()); - // Low-Rank Adapter (LoRA) for the second linear layer - // ff.lora_linear(std::string("fc2"), std::string("layers." + - // std::to_string(i) + ".fc2.lora").c_str()); } // final @@ -286,6 +283,13 @@ void OPT::create_opt_model(FFModel &ff, output = ff.argmax(softmax, /*beam_Search*/ false); } + // If PEFT is enabled, add LoRA layers + if (ff.config.enable_peft) { + // todo: add attention projections + std::vector target_modules = {"fc1", "fc2"}; + ff.add_lora_layers(target_modules); + } + FileDataLoader *fileloader = new FileDataLoader( "", weight_file_path, diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index 2429b1ec1b..3dd61be983 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -253,6 +253,13 @@ void STARCODER::create_starcoder_model( } } + // If PEFT is enabled, add LoRA layers + if (ff.config.enable_peft) { + // todo: add attention projections + std::vector target_modules = {"c_fc", "c_proj"}; + ff.add_lora_layers(target_modules); + } + InferenceManager *im = InferenceManager::get_inference_manager(); FileDataLoader *fileloader = new FileDataLoader( "", diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index 0ab0b62ee8..4f2d47055a 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -256,7 +256,7 @@ void FlexFlow::top_level_task(Task const *task, LoraOptimizerConfig *optim_config = nullptr; if (enable_peft_finetuning) { // float sgd_learning_rate = 2e-1; - float sgd_learning_rate = 1.0f; + float sgd_learning_rate = 0.001f; optim_config = new LoraSGDOptimizerConfig(sgd_learning_rate); } LoraLinearConfig peft_config_finetuning = @@ -275,6 +275,8 @@ void FlexFlow::top_level_task(Task const *task, rm->set_max_requests_per_batch( max_requests_per_batch + (int)enable_peft_finetuning); // add one slot for finetuning if needed + rm->set_max_concurrent_adapters(max_requests_per_batch + + (int)enable_peft_finetuning); rm->set_max_tokens_per_batch(max_tokens_per_batch); rm->set_max_sequence_length(max_sequence_length); rm->register_tokenizer( @@ -320,18 +322,19 @@ void FlexFlow::top_level_task(Task const *task, assert(false && "unknow model type"); } - 
// Add PEFT layer + // Start background server + rm->start_background_server(&model); + + // Add PEFT adapter(s) PEFTModelID *peft_model_id = nullptr, *peft_model_id_finetuning = nullptr; if (!peft_model_name.empty()) { - peft_model_id = model.add_lora_layer(peft_config); + peft_model_id = model.register_peft_adapter(peft_config); if (enable_peft_finetuning) { - peft_model_id_finetuning = model.add_lora_layer(peft_config_finetuning); + peft_model_id_finetuning = + model.register_peft_adapter(peft_config_finetuning); } } - // Start background server - rm->start_background_server(&model); - // Run workload { std::vector requests; diff --git a/inference/peft/peft_bwd_benchmark.cc b/inference/peft/peft_bwd_benchmark.cc index 85e97ec4e8..9da4fa1994 100644 --- a/inference/peft/peft_bwd_benchmark.cc +++ b/inference/peft/peft_bwd_benchmark.cc @@ -304,15 +304,15 @@ void FlexFlow::top_level_task(Task const *task, assert(false && "unknow model type"); } + // Start background server + rm->start_background_server(&model); + // Add PEFT layer PEFTModelID *peft_model_id = nullptr; if (!peft_model_name.empty()) { - peft_model_id = model.add_lora_layer(peft_config); + peft_model_id = model.register_peft_adapter(peft_config); } - // Start background server - rm->start_background_server(&model); - // Warmup stage { std::vector requests; diff --git a/inference/peft/peft_fwd_benchmark.cc b/inference/peft/peft_fwd_benchmark.cc index 87322a42dd..3274f2e535 100644 --- a/inference/peft/peft_fwd_benchmark.cc +++ b/inference/peft/peft_fwd_benchmark.cc @@ -304,15 +304,15 @@ void FlexFlow::top_level_task(Task const *task, assert(false && "unknow model type"); } + // Start background server + rm->start_background_server(&model); + // Add PEFT layer PEFTModelID *peft_model_id = nullptr; if (!peft_model_name.empty()) { - peft_model_id = model.add_lora_layer(peft_config); + peft_model_id = model.register_peft_adapter(peft_config); } - // Start background server - rm->start_background_server(&model); - // Run workload { std::vector requests; diff --git a/inference/peft/req_rate_benchmark.cc b/inference/peft/req_rate_benchmark.cc index ffa77478e1..8a94f6e68b 100644 --- a/inference/peft/req_rate_benchmark.cc +++ b/inference/peft/req_rate_benchmark.cc @@ -366,14 +366,14 @@ void FlexFlow::top_level_task(Task const *task, assert(false && "unknow model type"); } + rm->start_background_server(&model); + // Add PEFT layer PEFTModelID *peft_model_id = nullptr; if (!peft_model_name.empty()) { - peft_model_id = model.add_lora_layer(peft_config); + peft_model_id = model.register_peft_adapter(peft_config); } - rm->start_background_server(&model); - // Warmup stage { std::vector requests; diff --git a/inference/python/chat.py b/inference/python/chat.py index 13ece116a6..95132443a2 100644 --- a/inference/python/chat.py +++ b/inference/python/chat.py @@ -21,14 +21,14 @@ def get_configs(): # Define sample configs ff_init_configs = { # required parameters - "num_gpus": 1, - "memory_per_gpu": 30000, - "zero_copy_memory_per_node": 60000, + "num_gpus": 8, + "memory_per_gpu": 34000, + "zero_copy_memory_per_node": 200000, # optional parameters - "num_cpus": 4, - "legion_utility_processors": 4, + "num_cpus": 16, + "legion_utility_processors": 16, "data_parallelism_degree": 1, - "tensor_parallelism_degree": 1, + "tensor_parallelism_degree": 8, "pipeline_parallelism_degree": 1, "offload": False, "offload_reserve_space_size": 8 * 1024, # 8GB @@ -36,7 +36,6 @@ def get_configs(): "use_8bit_quantization": False, "enable_peft": False, 
"peft_activation_reserve_space_size": 1024, # 1GB - "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "benchmarking": False, "inference_debugging": False, @@ -44,7 +43,7 @@ def get_configs(): } llm_configs = { # required parameters - "llm_model": "meta-llama/Meta-Llama-3-8B-Instruct", + "llm_model": "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF", # optional parameters "cache_path": os.environ.get("FF_CACHE_PATH", ""), "refresh_cache": False, @@ -86,11 +85,15 @@ def main(): llm.start_server() + nemotron_system = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Please ensure that your responses are positive in nature." + llama_generic_system = "You are a helpful an honest programming assistant." + + messages=[ - {"role": "system", "content": "You are a helpful an honest programming assistant."}, + {"role": "system", "content": nemotron_system}, {"role": "user", "content": "Is Rust better than Python?"}, ] - llm.generate(messages, max_new_tokens=256) + llm.generate(messages, max_new_tokens=1024) llm.stop_server() diff --git a/inference/python/ff_peft.py b/inference/python/ff_peft.py index 13da7aee20..0167cecebc 100644 --- a/inference/python/ff_peft.py +++ b/inference/python/ff_peft.py @@ -41,14 +41,14 @@ def get_configs(): # Define sample configs ff_init_configs = { # required parameters - "num_gpus": 2, + "num_gpus": 4, "memory_per_gpu": 14000, "zero_copy_memory_per_node": 10000, # optional parameters "num_cpus": 4, "legion_utility_processors": 4, "data_parallelism_degree": 1, - "tensor_parallelism_degree": 2, + "tensor_parallelism_degree": 4, "pipeline_parallelism_degree": 1, "offload": False, "offload_reserve_space_size": 8 * 1024, # 8GB @@ -56,7 +56,6 @@ def get_configs(): "use_8bit_quantization": False, "enable_peft": True, "peft_activation_reserve_space_size": 1024, # 1GB - "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "inference_debugging": True, "fusion": False, @@ -103,6 +102,23 @@ def main(): refresh_cache=configs.refresh_cache, output_file=configs.output_file, ) + + # Compile the LLM for inference and load the weights into memory + generation_config = ff.GenerationConfig( + do_sample=False, temperature=0.9, topp=0.8, topk=1 + ) + enable_peft_finetuning = len(configs.finetuning_dataset) > 0 + llm.compile( + generation_config, + max_requests_per_batch=1 if not enable_peft_finetuning else 2, + max_seq_length=256, + max_tokens_per_batch=128, + max_concurrent_adapters=1 if not enable_peft_finetuning else 2, + enable_peft_finetuning=enable_peft_finetuning, + ) + + llm.start_server() + # Add inference and/or finetuning lora lora_inference_config = None lora_finetuning_config = None @@ -112,18 +128,8 @@ def main(): configs.inference_peft_model_id, base_model_name_or_path=configs.base_model, ) - llm.add_peft(lora_inference_config) + llm.register_peft_adapter(lora_inference_config) if len(configs.finetuning_dataset) > 0: - # lora_finetuning_config = ff.LoraLinearConfig( - # llm.cache_path, - # configs.finetuning_peft_model_id, - # target_modules=["down_proj"], - # rank=16, - # lora_alpha=16, - # trainable=True, - # init_lora_weights=True, - # optimizer_type=ff.OptimizerType.OPTIMIZER_TYPE_SGD, - # ) lora_finetuning_config = ff.LoraLinearConfig( llm.cache_path, configs.inference_peft_model_id, @@ -137,22 +143,7 @@ def main(): "nesterov": False, }, ) - llm.add_peft(lora_finetuning_config) - - # Compile the LLM for inference and load the weights into memory - generation_config = ff.GenerationConfig( - 
do_sample=False, temperature=0.9, topp=0.8, topk=1 - ) - enable_peft_finetuning = len(configs.finetuning_dataset) > 0 - llm.compile( - generation_config, - enable_peft_finetuning=enable_peft_finetuning, - max_requests_per_batch=1 if not enable_peft_finetuning else 2, - max_seq_length=256, - max_tokens_per_batch=128, - ) - - llm.start_server() + llm.register_peft_adapter(lora_finetuning_config) requests = [] # Serving diff --git a/inference/python/incr_decoding.py b/inference/python/incr_decoding.py index 232ef1699c..4bb6892a6b 100644 --- a/inference/python/incr_decoding.py +++ b/inference/python/incr_decoding.py @@ -56,7 +56,6 @@ def get_configs(): "use_8bit_quantization": False, "enable_peft": False, "peft_activation_reserve_space_size": 1024, # 1GB - "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "benchmarking": False, "inference_debugging": False, diff --git a/inference/python/peft_demo/INSTRUCTIONS.md b/inference/python/peft_demo/INSTRUCTIONS.md index 9b2a7a53b2..0f78efdea9 100644 --- a/inference/python/peft_demo/INSTRUCTIONS.md +++ b/inference/python/peft_demo/INSTRUCTIONS.md @@ -13,7 +13,7 @@ * `export HUGGINGFACE_TOKEN="[Your token]"` * `huggingface-cli login --token "$HUGGINGFACE_TOKEN"` - * `python3 inference/utils/download_peft_model.py "goliaro/llama-2-7b-lora-full" --base_model_name "meta-llama/Llama-2-7b-hf"` + * `python3 inference/utils/download_peft_model.py "goliaro/llama-2-7b-lora-full"` * Run the demo ``` diff --git a/inference/python/peft_demo/demo.ipynb b/inference/python/peft_demo/demo.ipynb index dfb5193a1d..ea2b8417b6 100644 --- a/inference/python/peft_demo/demo.ipynb +++ b/inference/python/peft_demo/demo.ipynb @@ -91,7 +91,6 @@ " \"use_8bit_quantization\": False,\n", " \"enable_peft\": True,\n", " \"peft_activation_reserve_space_size\": 1024, # 1GB\n", - " \"peft_weight_reserve_space_size\": 1024, # 1GB\n", " \"profiling\": False,\n", " \"inference_debugging\": False,\n", " \"fusion\": False,\n", @@ -195,7 +194,7 @@ } ], "source": [ - "args = [configs.inference_peft_model_id, '--base_model_name', configs.base_model]\n", + "args = [configs.inference_peft_model_id]\n", "subprocess.run(['python', '../../utils/download_peft_model.py'] + args)" ] }, @@ -1773,7 +1772,6 @@ " \"use_8bit_quantization\": False,\n", " \"enable_peft\": True,\n", " \"peft_activation_reserve_space_size\": 1024, # 1GB\n", - " \"peft_weight_reserve_space_size\": 1024, # 1GB\n", " \"profiling\": False,\n", " \"inference_debugging\": False,\n", " \"fusion\": False,\n", @@ -1815,7 +1813,7 @@ "configs = SimpleNamespace(**configs_dict)\n", "\n", "\n", - "args = [configs.finetuning_peft_model_id+\"-dolly\", '--base_model_name', configs.base_model]\n", + "args = [configs.finetuning_peft_model_id+\"-dolly\"]\n", "subprocess.run(['python', '../../utils/download_peft_model.py'] + args)\n", "\n", "# Initialize the FlexFlow runtime. 
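Editor's note: as the `ff_peft.py` diff above shows, adapters are now registered after `llm.compile()` and `llm.start_server()`, and the adapter pool is sized at compile time via `max_concurrent_adapters`. A condensed sketch of that ordering; the model and adapter IDs are simply the ones used elsewhere in this patch, and the resource numbers are placeholders.

```
import flexflow.serve as ff

# Placeholder resource configs; enable_peft must be set for LoRA layers to be added.
ff.init({
    "num_gpus": 4,
    "memory_per_gpu": 14000,
    "zero_copy_memory_per_node": 10000,
    "tensor_parallelism_degree": 4,
    "enable_peft": True,
})

llm = ff.LLM("meta-llama/Llama-2-7b-hf")
llm.compile(
    ff.GenerationConfig(do_sample=False),
    max_requests_per_batch=1,
    max_seq_length=256,
    max_tokens_per_batch=128,
    max_concurrent_adapters=1,   # sizes the per-layer adapter pool
)
llm.start_server()

# Adapters are registered with the running model, no longer before compile()
lora_config = ff.LoraLinearConfig(llm.cache_path, "goliaro/llama-2-7b-lora-full")
llm.register_peft_adapter(lora_config)
```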
ff.init() takes a dictionary or the path to a JSON file with the configs\n", diff --git a/inference/python/peft_demo/demo.py b/inference/python/peft_demo/demo.py index 9e01b4645b..b70f3c8966 100644 --- a/inference/python/peft_demo/demo.py +++ b/inference/python/peft_demo/demo.py @@ -47,7 +47,6 @@ def create_datasets(finetune_dataset_size=2, inference_file_path='inference_data "use_8bit_quantization": False, "enable_peft": True, "peft_activation_reserve_space_size": 1024, # 1GB - "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "inference_debugging": False, "fusion": False, @@ -99,7 +98,7 @@ def create_datasets(finetune_dataset_size=2, inference_file_path='inference_data file.write('') # Download base and peft inference models -args = [configs.inference_peft_model_id, '--base_model_name', configs.base_model] +args = [configs.inference_peft_model_id] # hf_token = input("Please enter your HuggingFace personal access token: ") # subprocess.run(['huggingface-cli', 'login', '--token', hf_token]) subprocess.run(['python', '../../utils/download_peft_model.py'] + args) @@ -207,7 +206,7 @@ def create_datasets(finetune_dataset_size=2, inference_file_path='inference_data ) llm.add_peft(lora_inference_config) -args = [configs.finetuning_peft_model_id, '--base_model_name', configs.base_model] +args = [configs.finetuning_peft_model_id] #hf_token = input("Please enter your HuggingFace personal access token: ") # subprocess.run(['huggingface-cli', 'login', '--token', hf_token]) # subprocess.run(['python', '../../utils/download_peft_model.py'] + args) diff --git a/inference/python/spec_infer.py b/inference/python/spec_infer.py index 7ae752cffc..8cf96c1eba 100644 --- a/inference/python/spec_infer.py +++ b/inference/python/spec_infer.py @@ -56,7 +56,6 @@ def get_configs(): "use_8bit_quantization": False, "enable_peft": False, "peft_activation_reserve_space_size": 1024, # 1GB - "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "benchmarking": False, "inference_debugging": False, diff --git a/inference/python/streamlit/README.md b/inference/python/streamlit/README.md new file mode 100644 index 0000000000..86a15e2d6d --- /dev/null +++ b/inference/python/streamlit/README.md @@ -0,0 +1,18 @@ +# Streamlit demo + +## Instructions + +1. Build and install FlexFlow, or build and run `source ./set_python_envs.sh` from the build folder +2. Edit the FlexFlow/inference/python/streamlit/fastapi_incr.py to configure the model to run and the system configs (num gpus, amount of memory, etc) +3. In one terminal, launch the LLM engine with the commands below, and wait until the model's weights loading completes +``` +cd FlexFlow/inference/python/streamlit +python fastapi_incr.py +``` +4. In another terminal, launch the streamlit app: +``` +cd FlexFlow/inference/python/streamlit +streamlit run app.py +``` +5. Open the URL printed to the terminal, e.g. 
`http://localhost:8501` and interact with the app via browser + diff --git a/inference/python/streamlit/app.py b/inference/python/streamlit/app.py new file mode 100644 index 0000000000..9788765a3a --- /dev/null +++ b/inference/python/streamlit/app.py @@ -0,0 +1,188 @@ +import streamlit as st +import requests +import os, json +from huggingface_hub import model_info + + +# App title +st.set_page_config(page_title="🚀💻 FlexLLM Server", layout="wide") + +# FastAPI server URL +FASTAPI_URL = "http://localhost:8000/chat/completions" # Adjust the port if necessary +FINETUNE_URL = "http://localhost:8000/finetuning" + +# Initialize session state variables +if 'added_adapters' not in st.session_state: + st.session_state.added_adapters = [] + +# Store LLM generated responses +if "messages" not in st.session_state.keys(): + st.session_state.messages = [{"role": "assistant", "content": "How may I assist you today?"}] + +def check_model_availability(model_name): + try: + info = model_info(model_name) + return True + except Exception: + return False + +def clear_chat_history(): + st.session_state.messages = [{"role": "assistant", "content": "How may I assist you today?"}] + +# Function for generating LLaMA2 response +def generate_llama3_response(prompt_input): + system_prompt="You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Please ensure that your responses are positive in nature." + + # Send request to FastAPI server + response = requests.post(FASTAPI_URL, json={"max_new_tokens": 1024, "messages": [{"role": "system", "content": system_prompt}] + st.session_state.messages + [{"role": "user", "content": prompt_input}]}) + + if response.status_code == 200: + return response.json()["response"] + else: + return f"Error: {response.status_code} - {response.text}" + +# Sidebar +with st.sidebar: + st.title('🚀 FlexLLM Server') + page = st.radio("Choose a page", ["Chat", "Finetune"]) + if page == "Chat": + st.header('🦙 Llama Chatbot') + # st.success('Using local FastAPI server', icon='✅') + st.sidebar.button('Clear Chat History', on_click=clear_chat_history) + + st.subheader('Generation parameters') + max_length = st.sidebar.slider('Max generation length', min_value=64, max_value=2048, value=1024, step=8) + # selected_model = st.sidebar.selectbox('Choose a Llama2 model', ['Llama2-7B', 'Llama2-13B', 'Llama2-70B'], key='selected_model') + decoding_method = st.sidebar.selectbox('Decoding method', ['Greedy decoding (default)', 'Sampling'], key='decoding_method') + temperature = st.sidebar.slider('temperature', min_value=0.01, max_value=5.0, value=0.1, step=0.01, disabled=decoding_method == 'Greedy decoding (default)') + top_p = st.sidebar.slider('top_p', min_value=0.01, max_value=1.0, value=0.9, step=0.01, disabled=decoding_method == 'Greedy decoding (default)') + + # lora_adapter = st.sidebar.text_input('Lora adapter', placeholder='None') + st.subheader("LoRA Adapters (optional)") + # Text input for PEFT model ID + peft_id = st.text_input("Add a LoRA Adapter", placeholder="Enter the Huggingface PEFT model ID") + # Button to load the adapter + if st.button("Load Adapter"): + if peft_id: + with st.spinner("Checking PEFT availability..."): + is_available = check_model_availability(peft_id) + if is_available: + if peft_id not in st.session_state.added_adapters: + st.session_state.added_adapters.append(peft_id) + st.success(f"Successfully added PEFT: {peft_id}") + else: + st.warning(f"PEFT {peft_id} is already in the list.") + else: + st.error(f"PEFT {peft_id} is 
not available on Hugging Face. Please check the ID and try again.") + else: + st.warning("Please enter a PEFT Model ID.") + # Button to remove all adapters + if st.button("Remove All Adapters"): + st.session_state.added_adapters = [] + st.success("All adapters have been removed.") + # Display the list of added adapters + st.markdown("**Added Adapters:**") + if st.session_state.added_adapters: + for adapter in st.session_state.added_adapters: + st.write(f"- {adapter}") + else: + st.write("No adapters added yet.") + # st.markdown('📖 Learn how to build this app in this [blog](https://blog.streamlit.io/how-to-build-a-llama-2-chatbot/)!') + elif page == "Finetune": + st.header("🏋️‍♂️ LoRA Finetuning") + + # Hugging Face token input + # hf_token = st.text_input("Enter your Hugging Face token:", type="password") + if 'hf_token' in st.session_state.keys(): + st.success('HF token already provided!', icon='✅') + hf_token = st.session_state.hf_token + else: + hf_token = st.text_input('Enter your Hugging Face token:', type='password') + if not (hf_token.startswith('hf_') and len(hf_token)==37): + st.warning('please enter a valid token', icon='⚠️') + else: + st.success('Proceed to finetuning your model!', icon='👉') + st.session_state.hf_token = hf_token + + # PEFT model name + peft_model_name = st.text_input("Enter the PEFT model name:", help="The name of the PEFT model should start with the username associated with the provided HF token, followed by '/'ß. E.g. 'username/peft-base-uncased'") + + # Dataset selection + dataset_option = st.radio("Choose dataset source:", ["Upload JSON", "Hugging Face Dataset"]) + + if dataset_option == "Upload JSON": + uploaded_file = st.file_uploader("Upload JSON dataset", type="json") + if uploaded_file is not None: + dataset = json.load(uploaded_file) + st.success("Dataset uploaded successfully!") + else: + dataset_name = st.text_input("Enter Hugging Face dataset name:") + + # Finetuning parameters + st.subheader("Finetuning parameters") + lora_rank = st.number_input("LoRA rank", min_value=2, max_value=64, value=16, step=2) + lora_alpha = st.number_input("LoRA alpha", min_value=2, max_value=64, value=16, step=2) + target_modules = st.multiselect("Target modules", ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj", "lm_head"], default=["down_proj"]) + learning_rate = st.number_input("Learning rate", min_value=1e-6, max_value=1e-3, value=1e-5, step=1e-6) + optimizer_type = st.selectbox("Optimizer type", ["SGD", "Adam", "AdamW", "Adagrad", "Adadelta", "Adamax", "RMSprop"]) + momentum = st.number_input("Momentum", min_value=0.0, max_value=1.0, value=0.0, step=0.01) + weight_decay = st.number_input("Weight decay", min_value=0.0, max_value=1.0, value=0.0, step=0.01) + nesterov = st.checkbox("Nesterov") + max_steps = st.number_input("Max steps", min_value=1000, max_value=100000, value=10000, step=1000) + + # Start finetuning button + if st.button("Start Finetuning"): + if not hf_token: + st.error("Please enter your Hugging Face token.") + elif dataset_option == "Upload JSON" and uploaded_file is None: + st.error("Please upload a JSON dataset.") + elif dataset_option == "Hugging Face Dataset" and not dataset_name: + st.error("Please enter a Hugging Face dataset name.") + else: + # Prepare the request data + request_data = { + "token": hf_token, + "dataset_source": dataset_option, + } + + if dataset_option == "Upload JSON": + request_data["dataset"] = dataset + else: + request_data["dataset_name"] = dataset_name + + # Send finetuning request to FastAPI 
server + with st.spinner("Finetuning in progress..."): + response = requests.post(FINETUNE_URL, json=request_data) + + if response.status_code == 200: + st.success("Finetuning completed successfully!") + else: + st.error(f"Finetuning failed. Error: {response.status_code} - {response.text}") + +if page == "Chat": + # Display or clear chat messages + for message in st.session_state.messages: + with st.chat_message(message["role"]): + st.write(message["content"]) + + # User-provided prompt + if prompt := st.chat_input(): + st.session_state.messages.append({"role": "user", "content": prompt}) + with st.chat_message("user"): + st.write(prompt) + + # Generate a new response if last message is not from assistant + if st.session_state.messages[-1]["role"] != "assistant": + with st.chat_message("assistant"): + with st.spinner("Running..."): + response = generate_llama3_response(prompt) + placeholder = st.empty() + full_response = '' + for item in response: + full_response += item + placeholder.markdown(full_response) + placeholder.markdown(full_response) + message = {"role": "assistant", "content": full_response} + st.session_state.messages.append(message) +elif page == "Finetune": + st.write("Use the sidebar to configure and start finetuning.") \ No newline at end of file diff --git a/inference/python/streamlit/fastapi_incr.py b/inference/python/streamlit/fastapi_incr.py new file mode 100644 index 0000000000..6ac7f4149a --- /dev/null +++ b/inference/python/streamlit/fastapi_incr.py @@ -0,0 +1,207 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +Running Instructions: +- To run this FastAPI application, make sure you have FastAPI and Uvicorn installed. +- Save this script as 'fastapi_incr.py'. +- Run the application using the command: `uvicorn fastapi_incr:app --reload --port PORT_NUMBER` +- The server will start on `http://localhost:PORT_NUMBER`. Use this base URL to make API requests. +- Go to `http://localhost:PORT_NUMBER/docs` for API documentation. 
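Editor's note: once `fastapi_incr.py` is running (see the instructions above), the Streamlit app talks to it via `POST /chat/completions`; the same endpoint can be exercised directly, mirroring the request `app.py` sends. A small example, assuming the default port 8000 used in this patch.

```
import requests

# Mirrors the payload app.py posts to the /chat/completions endpoint
# defined in fastapi_incr.py; the response JSON carries a "response" field.
resp = requests.post(
    "http://localhost:8000/chat/completions",
    json={
        "max_new_tokens": 256,
        "messages": [
            {"role": "system", "content": "You are a helpful and honest assistant."},
            {"role": "user", "content": "Is Rust better than Python?"},
        ],
    },
)
resp.raise_for_status()
print(resp.json()["response"])
```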
+""" + + +from fastapi import FastAPI, HTTPException +from pydantic import BaseModel, Field +import flexflow.serve as ff +import uvicorn +import json, os, argparse +from types import SimpleNamespace +from typing import Optional, List +import time + + +# Initialize FastAPI application +app = FastAPI() + +# Define the request model +class PromptRequest(BaseModel): + prompt: str + +# data models +class Message(BaseModel): + role: str + content: str + + +# class ChatCompletionRequest(BaseModel): +# model: Optional[str] = "mock-gpt-model" +# messages: List[Message] +# max_tokens: Optional[int] = 512 +# temperature: Optional[float] = 0.1 +# stream: Optional[bool] = False + +class ChatCompletionRequest(BaseModel): + max_new_tokens: Optional[int] = 1024 + messages: List[Message] + +# Global variable to store the LLM model +llm = None + + +def get_configs(): + + # Fetch configuration file path from environment variable + config_file = os.getenv("CONFIG_FILE", "") + + # Load configs from JSON file (if specified) + if config_file: + if not os.path.isfile(config_file): + raise FileNotFoundError(f"Config file {config_file} not found.") + try: + with open(config_file) as f: + return json.load(f) + except json.JSONDecodeError as e: + print("JSON format error:") + print(e) + else: + # Define sample configs + ff_init_configs = { + # required parameters + "num_gpus": 8, + "memory_per_gpu": 20000, + "zero_copy_memory_per_node": 40000, + # optional parameters + "num_cpus": 4, + "legion_utility_processors": 8, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 4, + "pipeline_parallelism_degree": 1, + "offload": False, + "offload_reserve_space_size": 8 * 1024, # 8GB + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "enable_peft": False, + "peft_activation_reserve_space_size": 1024, # 1GB + "profiling": False, + "benchmarking": False, + "inference_debugging": False, + "fusion": True, + } + llm_configs = { + # required parameters + "llm_model": "meta-llama/Llama-3.1-8B-Instruct", + # optional parameters + "cache_path": os.environ.get("FF_CACHE_PATH", ""), + "refresh_cache": False, + "full_precision": False, + "prompt": "", + "output_file": "", + } + # Merge dictionaries + ff_init_configs.update(llm_configs) + return ff_init_configs + + +# Initialize model on startup +@app.on_event("startup") +async def startup_event(): + global llm + + # Initialize your LLM model configuration here + configs_dict = get_configs() + configs = SimpleNamespace(**configs_dict) + ff.init(configs_dict) + + ff_data_type = ( + ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + ) + llm = ff.LLM( + configs.llm_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + output_file=configs.output_file, + ) + + generation_config = ff.GenerationConfig( + do_sample=False, temperature=0.9, topp=0.8, topk=1 + ) + llm.compile( + generation_config, + max_requests_per_batch=16, + max_seq_length=2048, + max_tokens_per_batch=1024, + ) + llm.start_server() + +# API endpoint to generate response +@app.post("/generate/") +async def generate(prompt_request: PromptRequest): + if llm is None: + raise HTTPException(status_code=503, detail="LLM model is not initialized.") + + # Call the model to generate a response + full_output = llm.generate([prompt_request.prompt])[0].output_text.decode('utf-8') + + # Separate the prompt and response + split_output = full_output.split('\n', 1) + if len(split_output) > 1: + response_text = split_output[1] + else: + 
response_text = "" + + # Return the prompt and the response in JSON format + return { + "prompt": prompt_request.prompt, + "response": response_text + } + +@app.post("/chat/completions") +async def chat_completions(request: ChatCompletionRequest): + + if llm is None: + raise HTTPException(status_code=503, detail="LLM model is not initialized.") + + print("received request:", request) + result = llm.generate([message.dict() for message in request.messages], max_new_tokens=request.max_new_tokens)[0].output_text.decode('utf-8') + print("returning response:", result) + return { + "response": result + } + return { + "id": "1337", + "object": "chat.completion", + "created": time.time(), + "model": request.model, + "choices": [{"message": Message(role="assistant", content=resp_content)}], + } + +# Shutdown event to stop the model server +@app.on_event("shutdown") +async def shutdown_event(): + global llm + if llm is not None: + llm.stop_server() + +# Main function to run Uvicorn server +if __name__ == "__main__": + uvicorn.run(app, host="0.0.0.0", port=8000) + +# Running within the entrypoint folder: +# uvicorn fastapi_incr:app --reload --port + +# Running within the python folder: +# uvicorn entrypoint.fastapi_incr:app --reload --port 3000 diff --git a/inference/utils/download_peft_model.py b/inference/utils/download_peft_model.py index 38dd577574..2ee63b10bc 100644 --- a/inference/utils/download_peft_model.py +++ b/inference/utils/download_peft_model.py @@ -1,13 +1,11 @@ #!/usr/bin/env python import flexflow.serve as ff import argparse, os +from peft import PeftConfig def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument( - "--base_model_name", type=str, help="Name of the model to download" - ) parser.add_argument( "peft_model_ids", type=str, @@ -48,19 +46,21 @@ def main(args): else: data_types = (ff.DataType.DT_FLOAT, ff.DataType.DT_HALF) - for data_type in data_types: - llm = ff.LLM( - args.base_model_name, - data_type=data_type, - cache_path=args.cache_folder, - refresh_cache=args.refresh_cache, - ) - for peft_model_id in args.peft_model_ids: - lora_config = ff.LoraLinearConfig(llm.cache_path, peft_model_id) - llm.add_peft(lora_config) - llm.download_hf_weights_if_needed() - llm.download_hf_config() - llm.download_hf_tokenizer_if_needed() + for peft_model_id in args.peft_model_ids: + hf_config = PeftConfig.from_pretrained(peft_model_id) + for data_type in data_types: + llm = ff.LLM( + hf_config.base_model_name_or_path, + data_type=data_type, + cache_path=args.cache_folder, + refresh_cache=args.refresh_cache, + ) + # Download base model config, weights and tokenizer + llm.download_hf_config() + llm.download_hf_weights_if_needed() + llm.download_hf_tokenizer_if_needed() + # Download PEFT adapter + llm.download_peft_adapter_if_needed(peft_model_id) if __name__ == "__main__": diff --git a/python/flexflow/core/__init__.py b/python/flexflow/core/__init__.py index b8ed15eaea..52fe331bf3 100644 --- a/python/flexflow/core/__init__.py +++ b/python/flexflow/core/__init__.py @@ -91,7 +91,6 @@ "use_8bit_quantization": "--8bit-quantization", "enable_peft": "-enable-peft", "peft_activation_reserve_space_size": "-peft-activation-reserve-space-size", - "peft_weight_reserve_space_size": "-peft-weight-reserve-space-size", } diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 59e62ea023..02eff0ca76 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -811,6 +811,10 @@ def 
pipeline_parallelism_degree(self, value): @property def python_data_loader_type(self): return ffc().flexflow_config_get_python_data_loader_type(self.handle) + + @property + def enable_peft(self): + return ffc().flexflow_config_get_enable_peft(self.handle) @property def cpu_offload(self): @@ -1629,6 +1633,11 @@ def set_max_sequence_length(self, max_length): def get_max_sequence_length(self): return ffc().flexflow_request_manager_get_max_sequence_length(self.handle) + + def set_max_concurrent_adapters(self, max_adapters): + return ffc().flexflow_request_manager_set_max_concurrent_adapters( + self.handle, max_adapters + ) def set_enable_peft_finetuning(self, enable_peft_finetuning): return ffc().flexflow_request_manager_set_enable_peft_finetuning( @@ -4288,8 +4297,12 @@ def argmax(self, input, beam_search, name=None): self.add_layer(OpType.ARGMAX, name) return Tensor(handle, owner_op_type=OpType.ARGMAX) - def add_lora_layer(self, peft_config): - return ffc().flexflow_model_add_lora_layer(self.handle, peft_config.handle) + def add_lora_layers(self, target_modules: List[str]): + c_target_modules = [get_c_name(module) for module in target_modules] + return ffc().flexflow_model_add_lora_layers(self.handle, len(target_modules), c_target_modules) + + def register_peft_adapter(self, peft_config): + return ffc().flexflow_model_register_peft_adapter(self.handle, peft_config.handle) def reset_metrics(self): """Reset performance metrics. @@ -4751,6 +4764,7 @@ def generate(self, requests_list: List[Request]): finetuning_losses=finetuning_losses, ) ) + return results def set_position_offset(self, offset): ffc().flexflow_model_set_position_offset(self.handle, offset) diff --git a/python/flexflow/serve/__init__.py b/python/flexflow/serve/__init__.py index fd29080a6a..55044d1838 100644 --- a/python/flexflow/serve/__init__.py +++ b/python/flexflow/serve/__init__.py @@ -55,7 +55,6 @@ def init( use_8bit_quantization: Optional[bool] = None, enable_peft: Optional[bool] = None, peft_activation_reserve_space_size: Optional[int] = None, - peft_weight_reserve_space_size: Optional[int] = None, profiling: Optional[bool] = None, benchmarking: Optional[bool] = None, inference_debugging: Optional[bool] = None, @@ -86,7 +85,6 @@ def init( - use_8bit_quantization: whether to use 8-bit quantization, defaults to False - enable_peft: whether to enable the use of PEFT, defaults to False - peft_activation_reserve_space_size: the space (in MB) to reserve on GPU for PEFT activations, default to 1 GB - - peft_weight_reserve_space_size: the space (in MB) to reserve on GPU for PEFT weights, default to 1 GB - profiling: whether to enable the FlexFlow profiling mode, defaults to False - benchmarking: whether to run benchmaking only, without loading real weights, defaults to False - inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False @@ -125,8 +123,6 @@ def init( :type enable_peft: Optional[bool], optional :param peft_activation_reserve_space_size: the space (in MB) to reserve on GPU for PEFT activations, default to 1 GB :type peft_activation_reserve_space_size: Optional[int], optional - :param peft_weight_reserve_space_size: the space (in MB) to reserve on GPU for PEFT weights, default to 1 GB - :type peft_weight_reserve_space_size: Optional[int], optional :param profiling: whether to enable the FlexFlow profiling mode, defaults to False :type profiling: Optional[bool], optional :param benchmarking: whether to run benchmaking only, without loading real weights, 
defaults to False @@ -158,7 +154,6 @@ def init( use_8bit_quantization is not None, enable_peft is not None, peft_activation_reserve_space_size is not None, - peft_weight_reserve_space_size is not None, profiling is not None, benchmarking is not None, inference_debugging is not None, @@ -187,7 +182,6 @@ def init( "use_8bit_quantization": use_8bit_quantization, "enable_peft": enable_peft, "peft_activation_reserve_space_size": peft_activation_reserve_space_size, - "peft_weight_reserve_space_size": peft_weight_reserve_space_size, "profiling": profiling, "benchmarking": benchmarking, "inference_debugging": inference_debugging, @@ -210,7 +204,6 @@ def init( "pipeline_parallelism_degree", "offload_reserve_space_size", "peft_activation_reserve_space_size", - "peft_weight_reserve_space_size", ] for param in positive_int_params: __check_positive_int(configs_dict, param) @@ -238,8 +231,6 @@ def init( configs_dict["enable_peft"] = False if configs_dict.get("peft_activation_reserve_space_size", None) is None: configs_dict["peft_activation_reserve_space_size"] = 8 * 1024**3 - if configs_dict.get("peft_weight_reserve_space_size", None) is None: - configs_dict["peft_weight_reserve_space_size"] = 1024**3 if configs_dict.get("profiling", None) is None: configs_dict["profiling"] = False if configs_dict.get("benchmarking", None) is None: diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index 0c6102406f..60aa3c27e9 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -257,6 +257,10 @@ def build_model(self, max_tokens_per_batch): # output = ffmodel.arg_top_k(lm_head, 1, False) softmax = ffmodel.softmax(lm_head, -1) output = ffmodel.argmax(softmax, False) + + if self.ffconfig.enable_peft: + # TODO: add attention projections + ffmodel.add_lora_layers(["dense_h_to_4h", "dense_4h_to_h"]) self.ffmodel = ffmodel diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index e149834603..ceea9e96b0 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -264,6 +264,10 @@ def build_model(self, max_tokens_per_batch): # output = ffmodel.arg_top_k(dense, 1, False) softmax = ffmodel.softmax(dense, -1) output = ffmodel.argmax(softmax, False) + + if self.ffconfig.enable_peft: + # TODO: add attention projections + ffmodel.add_lora_layers(["gate_proj", "up_proj", "down_proj"]) self.ffmodel = ffmodel diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py index a0e70b381a..d927a1fbb3 100644 --- a/python/flexflow/serve/models/mpt.py +++ b/python/flexflow/serve/models/mpt.py @@ -258,6 +258,10 @@ def build_model(self, max_tokens_per_batch): softmax = ffmodel.softmax(lm_head, -1) output = ffmodel.argmax(softmax, False) + if self.ffconfig.enable_peft: + # TODO: add attention projections + ffmodel.add_lora_layers(["up_proj", "down_proj"]) + self.ffmodel = ffmodel # TODO: finish this diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index ba2e21b690..e8d6fec9af 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -287,6 +287,10 @@ def build_model(self, max_tokens_per_batch): softmax = ffmodel.softmax(lm_head, -1) output = ffmodel.argmax(softmax, False) + if self.ffconfig.enable_peft: + # TODO: add attention projections + ffmodel.add_lora_layers(["fc1", "fc2"]) + self.ffmodel = ffmodel def convert_hf_weight_name(name): diff --git 
a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py index dc5faf175f..107614e9dd 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -228,6 +228,10 @@ def build_model(self, max_tokens_per_batch): softmax = ffmodel.softmax(lm_head, -1) output = ffmodel.argmax(softmax, False) + if self.ffconfig.enable_peft: + # TODO: add attention projections + ffmodel.add_lora_layers(["c_fc", "c_proj"]) + self.ffmodel = ffmodel def convert_hf_model(model, dst_folder): diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index e4248a2fc1..c2804b6966 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -31,9 +31,17 @@ from peft import PeftModel, PeftConfig, LoraConfig from huggingface_hub import HfApi import torch, shutil, hashlib, json, gc -from typing import Union, List +from typing import Union, List, Tuple +from safetensors import safe_open from huggingface_hub import snapshot_download +from enum import Enum + + +class CachedResourceType(Enum): + TOKENIZER = "tokenizer" + WEIGHTS = "weights" + class _SupportedModels: def __init__( @@ -104,14 +112,14 @@ def __init__( self.output_file = output_file self.rm = None self.pefts = {} - self.tokenizer=None + self.tokenizer = None def __del__(self): # Stop the background server before deleting the object if type(self) == LLM and self.rm is not None: self.rm.stop_server() - def add_peft(self, lora_config: LoraLinearConfig): + def register_peft_adapter(self, lora_config: LoraLinearConfig): """Add a PEFT adapter to the LLM""" if lora_config is None: raise ValueError("lora_config cannot be None") @@ -145,9 +153,12 @@ def add_peft(self, lora_config: LoraLinearConfig): f"Attempting to add PEFT with base model name {peft_config.base_model_name_or_path} to LLM {self.model_name}" ) + lora_config.ff_compile() + self.pefts[lora_config] = { "peft_config": peft_config, "peft_type": peft_config.peft_type, + "ff_peft_model_id": self.model.ffmodel.register_peft_adapter(lora_config), } def get_ff_peft_id(self, lora_config: LoraLinearConfig) -> PEFTModelID: @@ -175,34 +186,33 @@ def download_hf_config(self): os.makedirs(config_dir, exist_ok=True) print(f"Creating directory {config_dir} (if it doesn't exist)...") print(f"Saving {self.model_name} configs to file {config_path}...") - self.hf_config.to_json_file(config_path) - - # Save PEFT configs if the LLM has any registered PEFTs - for ff_peft_config, peft_dict in self.pefts.items(): - peft_config = peft_dict["peft_config"] - peft_model_id = ff_peft_config.peft_model_id - peft_config_dir = os.path.join( - os.path.expanduser(self.cache_path), "configs", peft_model_id.lower() - ) - os.makedirs(peft_config_dir, exist_ok=True) - peft_config_path = os.path.join(peft_config_dir, "config.json") - print(f"Saving {peft_model_id} configs to file {peft_config_path}...") - with open(peft_config_path, "w") as json_file: - - class SetEncoder(json.JSONEncoder): - def default(self, obj): - if isinstance(obj, set): - return list(obj) - return super().default(obj) - - json.dump(peft_config.to_dict(), json_file, indent=2, cls=SetEncoder) - - def __get_revision_hashes(self, model_name: str, folder: str): + # self.hf_config.to_json_file(config_path) + src_folder = snapshot_download( + repo_id=self.model_name, allow_patterns="config.json" + ) + src_path = os.path.join(src_folder, "config.json") + if os.path.exists(src_path): + shutil.copy(src_path, config_path) + + def __get_revision_hashes( + self, model_name: 
str, folder: str + ) -> Tuple[Union[str, None], str, str]: + """Return the commit hash of the object (weight, tokenizer, etc) cached by FlexFlow and the latest commit hash of the object from HuggingFace (or other source) + + Args: + model_name (str): Name of the model cached by FlexFlow + folder (str): Folder where the cached object is stored + + Returns: + ff_revision: Commit hash of the object cached by FlexFlow + ff_revision_filepath: Path to the file containing the commit hash of the object cached by FlexFlow + latest_revision: Latest commit hash of the object from HuggingFace (or other source) + """ ff_revision = None - ff_revision_file = os.path.join(folder, "rev_sha.txt") + ff_revision_filepath = os.path.join(folder, "rev_sha.txt") - if os.path.exists(ff_revision_file): - ff_revision = "".join(open(ff_revision_file).read().split()) + if os.path.exists(ff_revision_filepath): + ff_revision = "".join(open(ff_revision_filepath).read().split()) if os.path.exists(model_name) and os.path.isdir(model_name): # Local model @@ -215,16 +225,21 @@ def __get_revision_hashes(self, model_name: str, folder: str): # Remote HuggingFace model hf_api = HfApi() latest_revision = hf_api.model_info(self.model_name).sha - return ff_revision, ff_revision_file, latest_revision + return ff_revision, latest_revision - def download_hf_weights_if_needed(self): - """Check in the folder specified by the cache_path whether the LLM's model weights are available and up to date. - If not, or if the refresh_cache parameter is set to True, download new weights. + def __get_resource_path( + self, model_name: str, resource_type: CachedResourceType + ) -> str: + """Returns the path to the folder where the model weights or tokenizer files are stored - If any PEFT adapter is registered, perform the same operation for PEFT. - """ + Args: + model_name (str): Name of the model + resource_type (CachedResourceType): Whether to get the path to the weights or the tokenizer - def get_weights_path(model_name): + Returns: + str: Path to the folder where the model weights or tokenizer files are stored + """ + if resource_type == CachedResourceType.WEIGHTS: return os.path.join( os.path.expanduser(self.cache_path), "weights", @@ -235,19 +250,49 @@ def get_weights_path(model_name): else "half-precision" ), ) + elif resource_type == CachedResourceType.TOKENIZER: + return os.path.join( + os.path.expanduser(self.cache_path), "tokenizers", model_name.lower() + ) + else: + raise ValueError(f"Invalid resource type {resource_type}") - def refresh_cache_if_needed(model_name): - weights_path = get_weights_path(model_name) - if self.refresh_cache: - print( - f"Refreshing weights in cache for model {model_name} at path {weights_path} ..." - ) - if os.path.exists(weights_path): - shutil.rmtree(weights_path) - os.makedirs(weights_path, exist_ok=True) + def __need_cache_refresh( + self, model_name: str, resource_type: CachedResourceType + ) -> bool: + """Check whether the model weights or tokenizer files are available and up to date. + If they need a refresh, create the folder for the resource, save the new commit hash to the rev_sha.txt file, delete any existing files, and return true. 
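The helpers above implement a simple SHA-based cache invalidation: a rev_sha.txt file stored alongside the cached weights or tokenizer records the commit that was last downloaded, and it is compared against the latest commit reported by the Hugging Face Hub. A minimal standalone sketch of the same idea (illustrative only, with hypothetical helper names; not the code added by this patch):

    import os
    from huggingface_hub import HfApi

    def needs_refresh(model_name: str, resource_folder: str) -> bool:
        """Compare the cached commit SHA against the latest SHA on the Hub."""
        sha_file = os.path.join(resource_folder, "rev_sha.txt")
        cached_sha = None
        if os.path.exists(sha_file):
            cached_sha = open(sha_file).read().strip()
        latest_sha = HfApi().model_info(model_name).sha
        return cached_sha != latest_sha

    def record_revision(resource_folder: str, latest_sha: str) -> None:
        """Store the commit SHA next to the cached files as rev_sha.txt."""
        os.makedirs(resource_folder, exist_ok=True)
        with open(os.path.join(resource_folder, "rev_sha.txt"), "w") as f:
            f.write(latest_sha)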
- def get_hf_llm(model_name): - return AutoModelForCausalLM.from_pretrained( + Args: + model_name (str): Name of the model to check + resource_type (CachedResourceType): Whether to check the weights or the tokenizer + + Returns: + bool: True if the weights or tokenizer need a refresh, False otherwise + """ + resource_path = self.__get_resource_path(model_name, resource_type) + ff_revision, latest_revision = self.__get_revision_hashes(self.model_name, resource_path) + if self.refresh_cache or not os.path.exists(resource_path) or ff_revision != latest_revision: + print( + f"Refreshing {resource_type} in cache for model {model_name} at path {resource_path} ..." + ) + if os.path.exists(resource_path): + shutil.rmtree(resource_path) + os.makedirs(resource_path, exist_ok=True) + ff_revision_file = os.path.join(resource_path, "rev_sha.txt") + with open(ff_revision_file, "w+") as f: + f.write(latest_revision) + return True + return False + + def download_hf_weights_if_needed(self) -> None: + """Check in the folder specified by the cache_path whether the LLM's model weights are available and up to date. + If not, or if the refresh_cache parameter is set to True, download new weights and convert them. + """ + + # TODO: edit this to download the weights using snapshot_download and convert them to FlexFlow format without loading them to GPU + def download_and_convert_llm_weights(model_name): + hf_model = AutoModelForCausalLM.from_pretrained( model_name, trust_remote_code=True, torch_dtype=( @@ -256,73 +301,26 @@ def get_hf_llm(model_name): else torch.float16 ), ) - - def download_llm_weights(): - refresh_cache_if_needed(self.model_name) - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - self.model_name, self.weights_path + # Convert the model to FlexFlow format + weights_path = self.__get_resource_path( + model_name, CachedResourceType.WEIGHTS ) - if ff_revision != latest_revision: - print( - f"'{self.model_name}' local model weights need updating! Downloading/converting new weights now..." - ) - hf_model = get_hf_llm(self.model_name) - # Convert the model to FlexFlow format - self.model_class.convert_hf_model(hf_model, self.weights_path) - # Save new revision hash to file - with open(ff_revision_file, "w+") as f: - f.write(latest_revision) - print(f"Done converting the weights for model {self.model_name}") - # Deallocate hf model - del hf_model - gc.collect() - torch.cuda.empty_cache() - - def convert_peft_model(hf_peft_model, peft_type, weights_path): - for name, params in hf_peft_model.named_parameters(): - if peft_type.lower() in name: - name = name.replace("base_model.model.model.", "").replace( - ".default", "" - ) - name = self.model_class.convert_hf_weight_name(name) - params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") - - def download_peft_weights(): - for ff_peft_config, peft_dict in self.pefts.items(): - if not ff_peft_config.init_lora_weights: - peft_config = peft_dict["peft_config"] - peft_type = peft_dict["peft_type"] - peft_model_id = ff_peft_config.peft_model_id - - weights_path = get_weights_path(peft_model_id) - refresh_cache_if_needed(peft_model_id) - ff_revision, ff_revision_file, latest_revision = ( - self.__get_revision_hashes(peft_model_id, weights_path) - ) - - if ff_revision != latest_revision: - print( - f"'{peft_model_id}' local model weights need updating! Downloading/converting new weights now..." 
- ) - hf_model = get_hf_llm(peft_model_id) - hf_peft_model = PeftModel.from_pretrained( - hf_model, peft_model_id, config=peft_config - ) - # Convert the model to FlexFlow format - convert_peft_model(hf_peft_model, peft_type, weights_path) - # Save new revision hash to file - with open(ff_revision_file, "w+") as f: - f.write(latest_revision) - print(f"Done converting the weights for model {peft_model_id}") - # Deallocate hf model - del hf_peft_model - del hf_model - gc.collect() - torch.cuda.empty_cache() - - self.weights_path = get_weights_path(self.model_name) - download_llm_weights() - download_peft_weights() + self.model_class.convert_hf_model(hf_model, weights_path) + # Save new revision hash to file + print(f"Done converting the weights for model {self.model_name}") + # Deallocate hf model + del hf_model + gc.collect() + torch.cuda.empty_cache() + + need_refresh = self.__need_cache_refresh( + self.model_name, CachedResourceType.WEIGHTS + ) + if need_refresh: + print( + f"'{self.model_name}' local model weights need updating! Downloading/converting new weights now..." + ) + download_and_convert_llm_weights(self.model_name) def download_hf_tokenizer_if_needed(self): """Check in the folder specified by the cache_path whether the LLM's tokenizer files are available and up to date. @@ -331,25 +329,10 @@ def download_hf_tokenizer_if_needed(self): print("Loading tokenizer...") # Use local cache, or download new version - self.tokenizer_path = os.path.join( - os.path.expanduser(self.cache_path), "tokenizers", self.model_name.lower() + need_refresh = self.__need_cache_refresh( + self.model_name, CachedResourceType.TOKENIZER ) - if self.refresh_cache: - print( - f"Refreshing cached tokenizer for model {self.model_name} at path {self.tokenizer_path} ..." - ) - if os.path.exists(self.tokenizer_path): - shutil.rmtree(self.tokenizer_path) - if not os.path.exists(self.tokenizer_path): - print(f"Creating directory {self.tokenizer_path} (if it doesn't exist)...") - os.makedirs(self.tokenizer_path, exist_ok=True) - - # Get local revision SHA, check if it matches latest one on huggingface - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - self.model_name, self.tokenizer_path - ) - - if ff_revision != latest_revision: + if need_refresh: print( f"'{self.model_name}' tokenizer needs updating! Downloading tokenizer now..." ) @@ -367,15 +350,76 @@ def download_hf_tokenizer_if_needed(self): hf_tokenizer_path = snapshot_download( repo_id=self.model_name, allow_patterns=target_tokenizer_files ) + tokenizer_path = self.__get_resource_path( + self.model_name, CachedResourceType.TOKENIZER + ) for file in target_tokenizer_files: src_path = os.path.join(hf_tokenizer_path, file) - dst_path = os.path.join(self.tokenizer_path, file) + dst_path = os.path.join(tokenizer_path, file) if os.path.exists(src_path): shutil.copy(src_path, dst_path) print("Done updating HF tokenizer.") - # Save new revision hash to file - with open(ff_revision_file, "w+") as f: - f.write(latest_revision) + + def download_peft_adapter_if_needed(self, hf_peft_model_id: str): + """Check in the folder specified by the cache_path whether the PEFT model weights are available and up to date. + If not, or if the refresh_cache parameter is set to True, download new weights and convert them. 
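Together with register_peft_adapter() and the max_concurrent_adapters setting introduced below, this gives roughly the following user-facing flow. This is a sketch only: the model and adapter names, the ff.init() resource arguments, and the exact call ordering are assumptions for illustration, not part of this patch.

    import flexflow.serve as ff

    # Initialize the FlexFlow serving runtime with PEFT support enabled.
    # (GPU/memory arguments are placeholders; adjust for your machine.)
    ff.init(num_gpus=1, memory_per_gpu=14000, zero_copy_memory_per_node=10000,
            tensor_parallelism_degree=1, pipeline_parallelism_degree=1,
            enable_peft=True)

    llm = ff.LLM("meta-llama/Llama-2-7b-hf", data_type=ff.DataType.DT_HALF)

    # Download and convert the LoRA adapter into the local weight cache.
    peft_model_id = "some-user/llama-2-7b-lora"  # hypothetical adapter repo
    llm.download_peft_adapter_if_needed(peft_model_id)

    # Compile with room for one concurrent adapter; base weights and tokenizer
    # are fetched automatically during compile if the cache is stale.
    llm.compile(ff.GenerationConfig(), max_requests_per_batch=1,
                max_seq_length=256, max_tokens_per_batch=64,
                max_concurrent_adapters=1)

    # Register the cached adapter with the compiled model.
    llm.register_peft_adapter(ff.LoraLinearConfig(llm.cache_path, peft_model_id))

    llm.start_server()
    print(llm.generate("Tell me something about PEFT.", max_new_tokens=64))
    llm.stop_server()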
+ """ + + def download_and_convert_peft_model(hf_peft_model_id: str): + if ( + self.data_type != DataType.DT_FLOAT + and self.data_type != DataType.DT_HALF + ): + raise ValueError( + "data_type must be either DataType.DT_FLOAT or DataType.DT_HALF" + ) + + # Save peft config to file + peft_config_dir = os.path.join( + os.path.expanduser(self.cache_path), "configs", hf_peft_model_id.lower() + ) + dst_path = os.path.join(peft_config_dir, "config.json") + os.makedirs(peft_config_dir, exist_ok=True) + print(f"Saving {hf_peft_model_id} configs to file {dst_path}...") + config_path = snapshot_download( + repo_id=hf_peft_model_id, allow_patterns="adapter_config.json" + ) + src_path = os.path.join(config_path, "adapter_config.json") + if os.path.exists(src_path): + shutil.copy(src_path, dst_path) + + # Save peft weights to file + adapter_path = snapshot_download( + repo_id=hf_peft_model_id, allow_patterns="adapter_model.safetensors" + ) + weights_path = self.__get_resource_path( + hf_peft_model_id.lower(), CachedResourceType.WEIGHTS + ) + with safe_open(adapter_path, framework="pt", device="cpu") as f: + for tensor_name in f.keys(): + tensor = f.get_tensor(tensor_name) + if self.data_type == DataType.DT_HALF: + tensor = tensor.half() + else: + tensor = tensor.float() + tensor_name = tensor_name.replace( + "base_model.model.model.", "" + ).replace(".default", "") + print(tensor_name) + + tensor_name = self.model_class.convert_hf_weight_name(tensor_name) + tensor.detach().cpu().numpy().tofile( + f"{weights_path}/{tensor_name}" + ) + + need_refresh = self.__need_cache_refresh( + hf_peft_model_id, CachedResourceType.WEIGHTS + ) + if need_refresh: + print( + f"'{hf_peft_model_id}' local model weights need updating! Downloading/converting new weights now..." + ) + download_and_convert_peft_model(hf_peft_model_id) def compile( self, @@ -383,10 +427,8 @@ def compile( max_requests_per_batch: int = 1, max_seq_length: int = 256, max_tokens_per_batch: int = 64, + max_concurrent_adapters: int = 1, enable_peft_finetuning: bool = False, - model_specific_data_parallelism_degree: int = None, - model_specific_tensor_parallelism_degree: int = None, - model_specific_pipeline_parallelism_degree: int = None, ssms: list = [], ): """Compile the LLM for inference and load the weights into memory @@ -399,14 +441,10 @@ def compile( :type max_seq_length: int, optional :param max_tokens_per_batch: The maximum number of tokens (across requests) to allow per batch, defaults to 64 :type max_tokens_per_batch: int, optional + :param max_concurrent_adapters: The maximum number of concurrent LoRA adapters, defaults to 1 + :type max_concurrent_adapters: int, optional :param enable_peft_finetuning: Whether to enable support for PEFT fine-tuning, defaults to False :type enable_peft_finetuning: bool, optional - :param model_specific_data_parallelism_degree: Use this parameter if you want to give the LLM a different data parallelism degree than the one used to initialize the runtime, defaults to None - :type model_specific_data_parallelism_degree: int, optional - :param model_specific_tensor_parallelism_degree: Use this parameter if you want to give the LLM a different tensor parallelism degree than the one used to initialize the runtime, defaults to None - :type model_specific_tensor_parallelism_degree: int, optional - :param model_specific_pipeline_parallelism_degree: Use this parameter if you want to give the LLM a different pipeline parallelism degree than the one used to initialize the runtime, defaults to None - :type 
model_specific_pipeline_parallelism_degree: int, optional :param ssms: The SSMs to use when operating in speculative inference mode, defaults to [] :type ssms: list, optional """ @@ -418,24 +456,13 @@ def compile( mode = InferenceMode.TREE_VERIFY_MODE elif type(self) == SSM: mode = InferenceMode.BEAM_SEARCH_MODE + self.ffconfig.data_parallelism_degree = 1 + self.ffconfig.tensor_parallelism_degree = 1 + self.ffconfig.pipeline_parallelism_degree = 1 else: assert type(self) == LLM mode = InferenceMode.INC_DECODING_MODE - # Apply model-specific parallelism degrees, if needed - if model_specific_data_parallelism_degree: - self.ffconfig.data_parallelism_degree = ( - model_specific_data_parallelism_degree - ) - if model_specific_tensor_parallelism_degree: - self.ffconfig.tensor_parallelism_degree = ( - model_specific_tensor_parallelism_degree - ) - if model_specific_pipeline_parallelism_degree: - self.ffconfig.pipeline_parallelism_degree = ( - model_specific_pipeline_parallelism_degree - ) - self.max_seq_length = max_seq_length # Create request manager and set serving configuration @@ -443,6 +470,7 @@ def compile( self.rm.set_max_requests_per_batch(max_requests_per_batch) self.rm.set_max_tokens_per_batch(max_tokens_per_batch) self.rm.set_max_sequence_length(max_seq_length) + self.rm.set_max_concurrent_adapters(max_concurrent_adapters) self.rm.set_enable_peft_finetuning(enable_peft_finetuning) # Instantiate the relevant model @@ -464,12 +492,6 @@ def compile( # Download the weights from huggingface (if needed) self.download_hf_weights_if_needed() - # Add PEFT layer if registered - for ff_peft_config, peft_dict in self.pefts.items(): - ff_peft_config.ff_compile() - ff_peft_model_id = self.model.ffmodel.add_lora_layer(ff_peft_config) - peft_dict["ff_peft_model_id"] = ff_peft_model_id - # Create file data loader, load weights into tensors model_configs = self.config_class(self.hf_config) @@ -479,8 +501,11 @@ def compile( else 20 ) + weights_path = self.__get_resource_path( + self.model_name, CachedResourceType.WEIGHTS + ) self.fileloader = FileDataLoader( - self.weights_path, + weights_path, model_configs.num_attention_heads, model_configs.num_key_value_heads, model_configs.hidden_size, @@ -504,8 +529,11 @@ def compile( eos_token_id = [eos_token_id] elif type(eos_token_id) != list: raise ValueError("eos_token_id must be an integer or a list of integers") + tokenizer_path = self.__get_resource_path( + self.model_name, CachedResourceType.TOKENIZER + ) self.rm.register_tokenizer( - self.model_type, bos_token_id, eos_token_id, self.tokenizer_path + self.model_type, bos_token_id, eos_token_id, tokenizer_path ) self.rm.register_output_filepath(self.output_file) @@ -520,14 +548,14 @@ def compile( atexit.register(self.rm.stop_server) - def _generate(self, requests: List[Request]): + def _generate(self, requests: List[Request]) -> List[GenerationResult]: if len(requests) == 0: return [] for req in requests: if req.req_type == RequestType.REQ_INFERENCE: # check max_length and max_new_tokens parameters if req.max_length == -1 and req.max_new_tokens == -1: - req.max_length = self.max_seq_length -1 + req.max_length = self.max_seq_length - 1 elif req.max_length != -1 and req.max_new_tokens != -1: warnings.warn( f"Both `max_new_tokens` (={req.max_new_tokens}) and `max_length`(={req.max_length}) seem to have been set. `max_new_tokens` will take precedence." @@ -546,14 +574,14 @@ def _generate(self, requests: List[Request]): f"max_new_tokens ({req.max_new_tokens}) is not allowed for finetuning requests." 
) if req.max_length == -1: - req.max_length = self.max_seq_length -1 + req.max_length = self.max_seq_length - 1 if req.max_length >= self.max_seq_length: raise ValueError( f"max_length ({req.max_length}) exceeds the maximum sequence length ({self.max_seq_length})" ) return self.model.ffmodel.generate(requests) - def __chat2prompt(self, messages: List[dict]): + def __chat2prompt(self, messages: List[dict]) -> str: """Convert a list of messages to a single prompt string :param messages: The list of messages to convert @@ -563,15 +591,31 @@ def __chat2prompt(self, messages: List[dict]): """ # ensure that each element is a dictionary, containing the "role" and "content" keys for message in messages: - if type(message) != dict or "role" not in message or "content" not in message: + if ( + type(message) != dict + or "role" not in message + or "content" not in message + ): raise ValueError( "Each element in the list must be a dictionary with the keys 'role' and 'content'" ) if self.tokenizer is None: self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) if self.tokenizer.chat_template is None: - raise ValueError(f"Model {self.model_name} does not support chat completion") - return self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + raise ValueError( + f"Model {self.model_name} does not support chat completion" + ) + return self.tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + def __output2chat_response( + self, requests: List[Request], outputs: List[GenerationResult] + ) -> List[GenerationResult]: + assert len(requests) == len(outputs) + for i in range(len(outputs)): + outputs[i].output_text = outputs[i].output_text[len(requests[i].prompt) :] + return outputs def generate( self, @@ -625,9 +669,12 @@ def generate( max_new_tokens=max_new_tokens, add_special_tokens=False, ) - return self._generate([request]) + outputs = self._generate([request]) + return self.__output2chat_response([request], outputs) elif type(requests_or_prompts[0]) == list: - prompts = [self.__chat2prompt(messages) for messages in requests_or_prompts] + prompts = [ + self.__chat2prompt(messages) for messages in requests_or_prompts + ] requests = [ Request( req_type=RequestType.REQ_INFERENCE, @@ -638,12 +685,15 @@ def generate( ) for prompt in prompts ] - return self._generate(requests) + outputs = self._generate(requests) + return self.__output2chat_response(requests, outputs) elif type(requests_or_prompts[0]) == Request: print(requests_or_prompts) return self._generate(requests_or_prompts) else: - assert False, "Please pass a string, list of strings, Request, or list of Requests" + assert ( + False + ), "Please pass a string, list of strings, Request, or list of Requests" def start_server(self): self.rm.start_server(self.model.ffmodel) @@ -685,11 +735,9 @@ def compile( generation_config: GenerationConfig = GenerationConfig(), max_requests_per_batch: int = 16, max_seq_length: int = 256, - max_tokens_per_batch: int = 128, + max_tokens_per_batch: int = 2048, + max_concurrent_adapters: int = 1, enable_peft_finetuning: bool = False, - model_specific_data_parallelism_degree: int = 1, - model_specific_tensor_parallelism_degree: int = 1, - model_specific_pipeline_parallelism_degree: int = 1, ssms: list = [], ): """Compile the SSM for inference and load the weights into memory @@ -699,16 +747,12 @@ def compile( :type max_requests_per_batch: int, optional :param max_seq_length: The maximum sequence length to allow per batch, defaults to 256 :type 
max_seq_length: int, optional - :param max_tokens_per_batch: The maximum number of tokens (across requests) to allow per batch, defaults to 128 + :param max_tokens_per_batch: The maximum number of tokens (across requests) to allow per batch, defaults to 2048 :type max_tokens_per_batch: int, optional + :param max_concurrent_adapters: The maximum number of concurrent LoRA adapters, defaults to 1 + :type max_concurrent_adapters: int, optional :param enable_peft_finetuning: Whether to enable support for PEFT fine-tuning, defaults to False :type enable_peft_finetuning: bool, optional - :param model_specific_data_parallelism_degree: Use this parameter if you want to give the SSM a different data parallelism degree than the default one, defaults to 1 - :type model_specific_data_parallelism_degree: int, optional - :param model_specific_tensor_parallelism_degree: Use this parameter if you want to give the SSM a different tensor parallelism degree than the default one, defaults to 1 - :type model_specific_tensor_parallelism_degree: int, optional - :param model_specific_pipeline_parallelism_degree: Use this parameter if you want to give the SSM a different pipeline parallelism degree than the default one, defaults to 1 - :type model_specific_pipeline_parallelism_degree: int, optional :param ssms: The SSMs to use when operating in speculative inference mode, defaults to [] :type ssms: list, optional """ @@ -717,9 +761,7 @@ def compile( max_requests_per_batch, max_seq_length, max_tokens_per_batch, + max_concurrent_adapters, enable_peft_finetuning, - model_specific_data_parallelism_degree, - model_specific_tensor_parallelism_degree, - model_specific_pipeline_parallelism_degree, ssms, ) diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index da90c586e3..e16b0e87bd 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -177,6 +177,11 @@ void flexflow_config_set_pipeline_parallelism_degree(flexflow_config_t handle_, handle->pipeline_parallelism_degree = value; } +bool flexflow_config_get_enable_peft(flexflow_config_t handle_) { + FFConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->enable_peft; +} + int flexflow_config_get_python_data_loader_type(flexflow_config_t handle_) { FFConfig *handle = FFCObjectWrapper::unwrap(handle_); return handle->python_data_loader_type; @@ -1608,18 +1613,33 @@ flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, } #ifdef FF_BUILD_INFERENCE -flexflow_peft_model_id_t flexflow_model_add_lora_layer( +void flexflow_model_add_lora_layers(flexflow_model_t handle_, + int num_target_modules, + char const **target_modules_) { + FFModel *handle = FFCObjectWrapper::unwrap(handle_); + std::vector target_modules; + for (int i = 0; i < num_target_modules; i++) { + target_modules.push_back(target_modules_[i]); + } + DEBUG_PRINT("[Add Lora Layers] model handle: %p, num_target_modules %d", + handle, + num_target_modules); + handle->add_lora_layers(target_modules); +} + +flexflow_peft_model_id_t flexflow_model_register_peft_adapter( flexflow_model_t handle_, const flexflow_lora_linear_config_t peft_config_) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); LoraLinearConfig const *peft_config = FFCObjectWrapper::unwrap(peft_config_); - PEFTModelID *peft_model_id = handle->add_lora_layer(*peft_config); + PEFTModelID *peft_model_id = handle->register_peft_adapter(*peft_config); - DEBUG_PRINT("[Add Lora Layer] model handle: %p, peft_config handle %p, " - "peft_model_id: %p", - handle, - peft_config, - peft_model_id); + DEBUG_PRINT( + "[Register 
PEFT Adapter] model handle: %p, peft_config handle %p, " + "peft_model_id: %p", + handle, + peft_config, + peft_model_id); return FFCObjectWrapper::wrap(peft_model_id); } #endif @@ -2765,6 +2785,14 @@ int flexflow_request_manager_get_max_sequence_length( return handle->get_max_sequence_length(); } +void flexflow_request_manager_set_max_concurrent_adapters( + flexflow_request_manager_t handle_, int max_concurrent_adapters) { + RequestManager *handle = FFCObjectWrapper::unwrap(handle_); + handle->set_max_concurrent_adapters(max_concurrent_adapters); + DEBUG_PRINT("[RequestManager] set max_concurrent_adapters %d", + max_concurrent_adapters); +} + void flexflow_request_manager_set_enable_peft_finetuning( flexflow_request_manager_t handle_, bool enable_peft_finetuning_) { RequestManager *handle = FFCObjectWrapper::unwrap(handle_); @@ -2909,7 +2937,9 @@ void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_, flexflow_model_t model_handle_) { FileDataLoader *handle = FFCObjectWrapper::unwrap(handle_); FFModel *model = FFCObjectWrapper::unwrap(model_handle_); - handle->load_weights(model); + Context ctx = model->config.lg_ctx; + Runtime *runtime = model->config.lg_hlr; + handle->load_weights_parallel(model, ctx, runtime); } // // ----------------------------------------------------------------------- diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc index d7b9a5e99d..c02f70f752 100644 --- a/src/mapper/mapper.cc +++ b/src/mapper/mapper.cc @@ -288,6 +288,10 @@ void FFMapper::select_task_options(const MapperContext ctx, output.initial_proc = all_cpus[0]; return; } + if (task.task_id == LOAD_WEIGHT_TASK_ID) { + output.initial_proc = all_cpus[0]; + return; + } if (task.task_id == TOP_LEVEL_TASK_ID) { output.initial_proc = all_cpus[0]; // control replicate top level task diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 62845c0f8e..8635fd6a87 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -862,6 +862,7 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, int num_infr_tokens = bc->num_active_infr_tokens(); int num_peft_tokens = bc->num_active_peft_tokens(); Kernels::Linear::peft_bwd_kernel_wrapper(m, + bc, my_input_grad_accessor[0].ptr, my_output_grad_accessor[0].ptr, my_weight_accessor[0].ptr, @@ -889,11 +890,13 @@ __host__ void FusedOp::peft_bwd_task(Task const *task, // Assert that the output and the second input are at the same place // since we ``inplace'' the output for LoRA assert(my_input_grad_accessor[1].ptr == my_output_grad_accessor[0].ptr); + int shard_id = task->index_point.point_data[0]; Kernels::LoraLinear::peft_bwd_kernel_wrapper( ctx, runtime, m, bc, + shard_id, my_input_grad_accessor[0], my_output_grad_accessor[0]); break; diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 3832428c64..51954597d7 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -16,6 +16,7 @@ #include "flexflow/ffconst_utils.h" #include "flexflow/ops/kernels/decompress_kernels.h" #include "flexflow/ops/kernels/linear_kernels.h" +#include "flexflow/ops/lora_linear_params.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { @@ -73,6 +74,17 @@ LinearMeta::~LinearMeta(void) { } } +bool lora_applies_to_this_layer(LinearMeta const *m, + LoraLinearConfig const &config) { + for (std::string s : config.target_modules) { + std::string n(m->op_name); + if (n.find(s) != std::string::npos) { + return true; + } + } + return false; +} + namespace Kernels { namespace Linear { @@ -285,6 +297,7 @@ 
void inference_kernel_wrapper(LinearMeta *m, } void peft_bwd_kernel_wrapper(LinearMeta const *m, + BatchConfig const *bc, void *input_grad_ptr, void *output_grad_ptr, void const *weight_ptr, @@ -302,6 +315,7 @@ void peft_bwd_kernel_wrapper(LinearMeta const *m, } if (m->input_type[0] == DT_FLOAT) { Internal::peft_bwd_kernel(m, + bc, input_grad_ptr, output_grad_ptr, weight_ptr, @@ -312,6 +326,7 @@ void peft_bwd_kernel_wrapper(LinearMeta const *m, stream); } else if (m->input_type[0] == DT_HALF) { Internal::peft_bwd_kernel(m, + bc, input_grad_ptr, output_grad_ptr, weight_ptr, @@ -568,6 +583,7 @@ void forward_kernel(LinearMeta const *m, template void peft_bwd_kernel(LinearMeta const *m, + BatchConfig const *bc, void *input_grad_ptr, void *output_grad_ptr, void const *kernel_ptr, @@ -611,6 +627,35 @@ void peft_bwd_kernel(LinearMeta const *m, // NOTE: we use beta=1 for input_grad to accumulate gradients when needed DT alpha = 1.0f; DT beta = m->reset_input_grads[0] ? 0.0f : 1.0f; + + // ensure that we only have one finetuning request, with a single lora + int num_peft_requests = 0; + bool lora_applies = false; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i] || + bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || + !bc->requestsInfo[i].peft_bwd) { + continue; + } + num_peft_requests++; + std::string peft_model_config_str = + std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = + LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); + if (!lora_applies_to_this_layer(m, lora_config)) { + continue; + } + lora_applies = true; + } + assert(num_peft_requests == 1 && + "Exactly one PEFT finetuning request is required"); + // if the request does not have any active lora in the current layer, reset + // beta to 0 std::cout << m->op_name << " original beta: " << (float)beta << " + // lora_applies: " << lora_applies << std::endl; + if (lora_applies) { + beta = 1.0f; + } + if (input_grad_ptr != NULL) { checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 638cee8cae..40095484b5 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -23,29 +23,32 @@ namespace FlexFlow { LoraLinearMeta::LoraLinearMeta(FFHandler handler, LoraLinear const *li) - : OpMeta(handler, li) { - allocated_peft_buffer_size1 = 0; - allocated_peft_buffer_size2 = 0; -} + : OpMeta(handler, li) {} LoraLinearMeta::~LoraLinearMeta(void) {} -namespace Kernels { -namespace LoraLinear { - -void init_kernel_wrapper(LoraLinearMeta *m, int seed) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - - if (m->input_type[0] == DT_FLOAT) { - Internal::init_kernel(m, seed, stream); - } else if (m->input_type[0] == DT_HALF) { - Internal::init_kernel(m, seed, stream); +std::string + get_peft_dbg_folder(LoraLinearMeta const *m, int shard_id, bool is_fwd) { + std::string op_name_without_uid = LoraLinear::get_op_name_without_uid(m); + fs::path dst_filepath; + if (is_fwd) { + dst_filepath = get_dst_folder("fwd", m->decoding_step, shard_id); } else { - assert(false && "Unsupported data type"); + dst_filepath = get_dst_folder("bwd", m->bwd_step, shard_id); } + if (m->layer_guid.model_id > 0) { + assert(false && "Model ID > 0 not supported yet"); + } + std::string layername = "layers." + + std::to_string(m->layer_guid.transformer_layer_id) + + "." 
+ op_name_without_uid; + dst_filepath /= layername; + return dst_filepath.string(); } +namespace Kernels { +namespace LoraLinear { + void inference_kernel_wrapper(LoraLinearMeta *m, BatchConfig const *bc, GenericTensorAccessorR const &input, @@ -100,6 +103,7 @@ void peft_bwd_kernel_wrapper(Context ctx, Runtime *runtime, LoraLinearMeta *m, BatchConfig const *bc, + int shard_id, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output_grad) { cudaStream_t stream; @@ -117,6 +121,7 @@ void peft_bwd_kernel_wrapper(Context ctx, runtime, m, bc, + shard_id, input_grad.get_float_ptr(), output_grad.get_float_ptr(), in_dim, @@ -127,6 +132,7 @@ void peft_bwd_kernel_wrapper(Context ctx, runtime, m, bc, + shard_id, input_grad.get_half_ptr(), output_grad.get_half_ptr(), in_dim, @@ -151,58 +157,19 @@ void peft_bwd_kernel_wrapper(Context ctx, } } -namespace Internal { - -template -void init_kernel(LoraLinearMeta *m, int seed, cudaStream_t stream) { - // Initialize generator - std::mt19937 gen(seed); - - // Get handle to weights by iterating over m->model_state to get each - // LoraLinearWeight object - for (auto &model_state : m->model_state) { - LoraLinearWeight weight = model_state.second.weights; - int w0_num_elements = weight.rank * weight.in_dim; - int w1_num_elements = weight.rank * weight.out_dim; - - // LoRA_A weight: [in_dim, rank] - float stdv_lora_a = 1.0f / sqrt(weight.in_dim); - std::uniform_real_distribution dis_lora_a(-stdv_lora_a, stdv_lora_a); - std::vector
lora_a_random_init(w0_num_elements); - for (auto &num : lora_a_random_init) { - float num_float = dis_lora_a(gen); - if (std::is_same::value) { - num = __float2half(num_float); - } else { - num = num_float; - } - } - checkCUDA(cudaMemcpyAsync(static_cast
(weight.w0_ptr), - lora_a_random_init.data(), - w0_num_elements * sizeof(DT), - cudaMemcpyHostToDevice, - stream)); - - // LoRA_B weight: [rank, out_dim] - float stdv_lora_b = 1.0f / sqrt(weight.rank); - std::uniform_real_distribution dis_lora_b(-stdv_lora_b, stdv_lora_b); - std::vector lora_b_random_init(w1_num_elements); - for (auto &num : lora_b_random_init) { - float num_float = dis_lora_b(gen); - if (std::is_same::value) { - num = __float2half(num_float); - } else { - num = num_float; - } +bool lora_applies_to_this_layer(LoraLinearMeta *m, + LoraLinearConfig const &config) { + for (std::string s : config.target_modules) { + std::string n(m->op_name); + if (n.find(s) != std::string::npos) { + return true; } - checkCUDA(cudaMemcpyAsync(static_cast
(weight.w1_ptr), - lora_b_random_init.data(), - w1_num_elements * sizeof(DT), - cudaMemcpyHostToDevice, - stream)); } + return false; } +namespace Internal { + template void inference_kernel(LoraLinearMeta *m, BatchConfig const *bc, @@ -213,91 +180,60 @@ void inference_kernel(LoraLinearMeta *m, ffStream_t stream) { checkCUDA(cublasSetStream(m->handle.blas, stream)); checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - DT alpha = 1.0f, beta = 0.0f; cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->input_type[1]); cudaDataType_t lr_actv_type = output_type; assert(input_type == output_type); cudaDataType_t weight_type = output_type; cudaDataType_t compute_type = output_type; - // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - // cudaDataType_t compute_type = output_type; - // #else - // // For best performance, set the default cublas compute type to - // // CUBLAS_COMPUTE_16F for half precision and to - // // CUBLAS_COMPUTE_32F_FAST_16F for full precision - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - // if (m->input_type[0] == DT_FLOAT) { - // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - // } - // #endif + int num_peft_requests = 0; for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } - if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + if (bc->request_completed[i] || + bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { continue; } if (bc->requestsInfo[i].peft_bwd) { num_peft_requests++; } - } - // Assert that we have at most one request that requires peft_bwd - assert(num_peft_requests <= 1); - for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } - // Skip non-PEFT requests - if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + std::string peft_model_config_str = + std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = + LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); + if (!lora_applies_to_this_layer(m, lora_config)) { continue; } + // std::cout << "Lora layer activated!" << std::endl; + // std::cout << "Lora Config: " << peft_model_config_str << std::endl; + assert(lora_config.trainable == bc->requestsInfo[i].peft_bwd && + "Trainable flag mismatch"); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - int max_peft_tokens = bc->requestsInfo[i].max_length; + // int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; - assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != - m->model_state.end()); - LoraLinearWeight weight = - m->model_state[bc->requestsInfo[i].peft_model_id].weights; - int rank = weight.rank; - void *intermediate_result_ptr = nullptr; + LoraLinearWeight weight = m->peft_memory_manager->get_peft( + bc->requestsInfo[i].peft_model_id, lora_config); + void *intermediate_result_ptr = (bc->requestsInfo[i].peft_bwd) + ? 
weight.low_rank_activation + : m->handle.workSpace; if (bc->requestsInfo[i].peft_bwd) { - size_t activation_size_needed1 = - data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; - size_t activation_size_needed2 = - data_type_size(m->input_type[1]) * max_peft_tokens * rank; - MemoryAllocator *allocator = m->handle.peft_activation_allocator; - if (activation_size_needed1 > m->allocated_peft_buffer_size1) { - m->input_activation = - allocator->allocate_instance_untyped(activation_size_needed1); - m->allocated_peft_buffer_size1 = activation_size_needed1; - } - if (activation_size_needed2 > m->allocated_peft_buffer_size2) { - m->low_rank_activation = - allocator->allocate_instance_untyped(activation_size_needed2); - m->allocated_peft_buffer_size2 = activation_size_needed2; - } - // copy input activation - checkCUDA(cudaMemcpyAsync(m->input_activation, + checkCUDA(cudaMemcpyAsync(weight.input_activation, input_ptr + first_token_offset * in_dim, data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, cudaMemcpyDeviceToDevice, stream)); - intermediate_result_ptr = m->low_rank_activation; } else { // use workspace to save intermediate result - assert(m->handle.workSpaceSize >= - data_type_size(m->input_type[1]) * num_peft_tokens * rank); - intermediate_result_ptr = m->handle.workSpace; + assert(m->handle.workSpaceSize >= data_type_size(m->input_type[1]) * + num_peft_tokens * lora_config.rank); } + DT alpha = 1.0f, beta = 0.0f; // buffer = weight_first * input // [rank, num_peft_tokens] = [in_dim, rank].T * [in_dim, num_peft_tokens] checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, - rank, + lora_config.rank, num_peft_tokens, in_dim, &alpha, @@ -310,29 +246,27 @@ void inference_kernel(LoraLinearMeta *m, &beta, intermediate_result_ptr, lr_actv_type, - rank, + lora_config.rank, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); // output = weight_second * buffer // [out_dim, num_peft_tokens] = [rank, out_dim].T * [rank, num_peft_tokens] // Note that we use alpha in both places since we do // an in-place update for LoraLinear - float lora_alpha = - m->model_state[bc->requestsInfo[i].peft_model_id].lora_alpha; - DT scaling_constant = (DT)(lora_alpha / rank); + DT scaling_constant = (DT)(lora_config.lora_alpha / lora_config.rank); checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, out_dim, num_peft_tokens, - rank, + lora_config.rank, &scaling_constant, weight.w1_ptr, weight_type, - rank, + lora_config.rank, intermediate_result_ptr, lr_actv_type, - rank, + lora_config.rank, &alpha, output_ptr + first_token_offset * out_dim, output_type, @@ -340,6 +274,7 @@ void inference_kernel(LoraLinearMeta *m, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } + assert(num_peft_requests <= 1); } template @@ -371,6 +306,7 @@ void peft_bwd_kernel(Context ctx, Runtime *runtime, LoraLinearMeta *m, BatchConfig const *bc, + int shard_id, DT *input_grad_ptr, DT const *output_grad_ptr, int in_dim, @@ -384,39 +320,33 @@ void peft_bwd_kernel(Context ctx, cudaDataType_t weight_type = output_type; cudaDataType_t lr_actv_type = output_type; cudaDataType_t compute_type = output_type; - // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - // cudaDataType_t compute_type = output_type; - // #else - // // For best performance, set the default cublas compute type to - // // CUBLAS_COMPUTE_16F for half precision and to - // // CUBLAS_COMPUTE_32F_FAST_16F for full precision - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - // if (m->output_type[0] == DT_FLOAT) { - // compute_type = 
CUBLAS_COMPUTE_32F_FAST_16F; - // } - // #endif + for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } - // Skip non-PEFT requests - if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + // Skip completed, non-PEFT and PEFT forward-only requests + if (bc->request_completed[i] || + bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || + !bc->requestsInfo[i].peft_bwd) { continue; } - // Skip PEFT forward-only requests - if (!bc->requestsInfo[i].peft_bwd) { + std::string peft_model_config_str = + std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = + LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); + if (!lora_applies_to_this_layer(m, lora_config)) { continue; } + // std::cout << "Lora layer activated!" << std::endl; + // std::cout << "Lora Config: " << peft_model_config_str << std::endl; + assert(lora_config.trainable == bc->requestsInfo[i].peft_bwd && + "Trainable flag mismatch"); + m->peft_memory_manager->check_ft_model_id( + bc->requestsInfo[i].peft_model_id); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + // int max_peft_tokens = bc->requestsInfo[i].max_length; // int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; - assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != - m->model_state.end()); - LoraLinearWeight weight = - m->model_state[bc->requestsInfo[i].peft_model_id].weights; - int rank = weight.rank; - float lora_alpha = - m->model_state[bc->requestsInfo[i].peft_model_id].lora_alpha; - DT scaling_constant = (DT)(lora_alpha / rank); + LoraLinearWeight weight = m->peft_memory_manager->get_peft( + bc->requestsInfo[i].peft_model_id, lora_config); + DT scaling_constant = (DT)(lora_config.lora_alpha / lora_config.rank); // Compute LORA_B weight's gradient if (bc->requestsInfo[i].optimizer_tasks.compute_gradients) { @@ -424,23 +354,35 @@ void peft_bwd_kernel(Context ctx, DT beta = (bc->requestsInfo[i].optimizer_tasks.reset_gradients_to_zero) ? 
0.0f : 1.0f; + // std::cout << "Lora B gradient computation, beta = " << (float) beta << + // std::endl; + if (m->inference_debugging) { + // save result to file for checking + std::string filename = + get_peft_dbg_folder(m, shard_id, false) + ".low_rank_activation"; + std::cout << "Save low_rank_activation (" << lora_config.rank << ", " + << num_peft_tokens << ") to " << filename << std::endl; + save_tensor(static_cast(weight.low_rank_activation), + lora_config.rank * num_peft_tokens, + filename.c_str()); + } checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, CUBLAS_OP_T, - rank, + lora_config.rank, out_dim, num_peft_tokens, &scaling_constant, - m->low_rank_activation, + weight.low_rank_activation, lr_actv_type, - rank, + lora_config.rank, output_grad_ptr, output_type, out_dim, &beta, weight.w1_grad_ptr, weight_type, - rank, + lora_config.rank, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } @@ -452,20 +394,20 @@ void peft_bwd_kernel(Context ctx, checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, CUBLAS_OP_N, - rank, + lora_config.rank, num_peft_tokens, out_dim, &scaling_constant, weight.w1_ptr, weight_type, - rank, + lora_config.rank, output_grad_ptr, output_type, out_dim, &beta, - m->low_rank_activation, + weight.low_rank_activation, lr_actv_type, - rank, + lora_config.rank, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } @@ -480,15 +422,15 @@ void peft_bwd_kernel(Context ctx, CUBLAS_OP_N, CUBLAS_OP_T, in_dim, - rank, + lora_config.rank, num_peft_tokens, &alpha, - m->input_activation, + weight.input_activation, input_type, in_dim, - m->low_rank_activation, + weight.low_rank_activation, lr_actv_type, - rank, + lora_config.rank, &beta, weight.w0_grad_ptr, weight_type, @@ -506,14 +448,14 @@ void peft_bwd_kernel(Context ctx, CUBLAS_OP_N, in_dim, num_peft_tokens, - rank, + lora_config.rank, &alpha, weight.w0_ptr, weight_type, in_dim, - m->low_rank_activation, + weight.low_rank_activation, lr_actv_type, - rank, + lora_config.rank, &beta, input_grad_ptr, input_type, @@ -523,17 +465,16 @@ void peft_bwd_kernel(Context ctx, } if (bc->requestsInfo[i].optimizer_tasks.update_weights) { - LoraOptimizerConfig const *optimizer_config = - m->model_state[bc->requestsInfo[i].peft_model_id].optimizer_config; - assert(optimizer_config != nullptr); - assert(typeid(*optimizer_config) != typeid(LoraOptimizerConfig)); - int w0_num_elements = rank * in_dim; - int w1_num_elements = rank * out_dim; + assert(lora_config.optimizer_config != nullptr); + int w0_num_elements = lora_config.rank * in_dim; + int w1_num_elements = lora_config.rank * out_dim; // Get optimizer config - if (typeid(*optimizer_config) == typeid(LoraSGDOptimizerConfig)) { + + if (lora_config.optimizer_config->getType() == "SGD") { LoraSGDOptimizerConfig const *sgd_config = - (LoraSGDOptimizerConfig const *)optimizer_config; + static_cast( + lora_config.optimizer_config); // LoRA_A weight is split in tensor parallelism, so no need to apply // all-reduce sgd_update<<(weight.w1_grad_ptr), static_cast
(weight.w1_v_values_ptr), static_cast
(weight.w1_ptr)); - } else if (typeid(*optimizer_config) == typeid(LoraAdamOptimizerConfig)) { + } else if (lora_config.optimizer_config->getType() == "Adam") { assert(false && "Adam optimizer type not implemented yet"); } else { assert(false && "Unsupported optimizer type"); diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 09170d3c28..8c2120e283 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -769,6 +769,7 @@ void Linear::peft_bwd_task(Task const *task, num_peft_tokens); } peft_bwd_kernel_wrapper(m, + bc, input_grad.ptr, output_grad.ptr, weight.ptr, diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc index 3749cce994..68605160a5 100644 --- a/src/ops/lora_linear.cc +++ b/src/ops/lora_linear.cc @@ -3,6 +3,7 @@ #include "flexflow/layer.h" #include "flexflow/model.h" #include "flexflow/ops/kernels/lora_linear_kernels.h" +#include "flexflow/request_manager.h" #include "flexflow/utils/hash_utils.h" #include "flexflow/utils/peft_weight_allocator.h" #include "legion/legion_utilities.h" @@ -51,18 +52,18 @@ bool check_lora_layer_match(Layer *potential_target, return false; } -PEFTModelID *FFModel::add_lora_layer(LoraLinearConfig const peft_config) { +void FFModel::add_lora_layers(std::vector target_modules) { assert(config.enable_peft && "Cannot add a LoRA layer if PEFT mode is not enabled"); - if (peft_config.target_modules.size() == 0) { - printf("PEFT config does not contain any target module\n"); - std::cout << peft_config << std::endl; - assert(false); - } - PEFTModelID *peft_model_id = new PEFTModelID(peft_model_global_guid++); - peft_configs[*peft_model_id] = peft_config; - - for (std::string target_module_name : peft_config.target_modules) { + assert(target_modules.size() > 0 && "LoRA target module name is empty"); + RequestManager *rm = RequestManager::get_request_manager(); + int max_lora_rank = rm->get_max_lora_rank(); + int max_concurrent_adapters = rm->get_max_concurrent_adapters(); + assert(max_lora_rank > 1 && max_lora_rank <= 32 && "Invalid max LoRA rank"); + assert(max_concurrent_adapters > 0 && + "Invalid number of LoRA concurrent adapters"); + + for (std::string target_module_name : target_modules) { assert(target_module_name.length() > 0 && "LoRA target module name is empty"); // find target layer @@ -72,127 +73,84 @@ PEFTModelID *FFModel::add_lora_layer(LoraLinearConfig const peft_config) { if (!match) { continue; } - - if (base_layer_to_peft_layer.find(target_module) != - base_layer_to_peft_layer.end()) { - // lora linear layer already added, no need to add again - Layer *peft_layer = base_layer_to_peft_layer[target_module]; - peft_layer_to_peft_id[peft_layer].push_back(*peft_model_id); - } else { - Tensor const input = target_module->inputs[0]; - Tensor const output = target_module->outputs[0]; - assert(input->data_type == output->data_type); - std::string name_ = target_module->name - ? 
std::string(target_module->name) - : std::string(""); - size_t last_underscore = name_.length() - 1; - for (int i = name_.length() - 1; i > 0; i--) { - if (!(std::isdigit(target_module->name[i]) || - target_module->name[i] == '_')) { - break; - } else if (target_module->name[i] == '_') { - last_underscore = i; - } + assert(base_layer_to_peft_layer.find(target_module) == + base_layer_to_peft_layer.end() && + "LoRA layer already added, attempting to add again"); + // Get input and output tensors from target module + Tensor const input = target_module->inputs[0]; + Tensor const output = target_module->outputs[0]; + assert(input->data_type == output->data_type); + // Compute OP_LORA layer name, based on target module name + std::string name_ = target_module->name ? std::string(target_module->name) + : std::string(""); + size_t last_underscore = name_.length() - 1; + for (int i = name_.length() - 1; i > 0; i--) { + if (!(std::isdigit(target_module->name[i]) || + target_module->name[i] == '_')) { + break; + } else if (target_module->name[i] == '_') { + last_underscore = i; } - name_.erase(last_underscore); - - name_ += ".lora"; - std::cout << "Adding layer " << name_ << std::endl; - Layer *peft_layer = new Layer(this, - OP_LORA, - output->data_type, - name_.c_str(), - 2 /*inputs*/, - 0 /*weights*/, - 1 /*outputs*/, - input, - output); - // fix LoRA layer's transformer layer ID and model ID - peft_layer->layer_guid.transformer_layer_id = - target_module->layer_guid.transformer_layer_id; - peft_layer->layer_guid.model_id = target_module->layer_guid.model_id; - { - int numdims = output->num_dims; - int dims[MAX_TENSOR_DIM]; - for (int i = 0; i < numdims; i++) { - dims[i] = output->dims[i]; - } - peft_layer->outputs[0] = - create_tensor_legion_ordering(numdims, - dims, - output->data_type, - peft_layer, - 0, - true /*create_grad*/); + } + name_.erase(last_underscore); + name_ += ".lora"; + std::cout << "Adding layer " << name_ << std::endl; + // Create OP_LORA layer given input, output and name + Layer *peft_layer = new Layer(this, + OP_LORA, + output->data_type, + name_.c_str(), + 2 /*inputs*/, + 0 /*weights*/, + 1 /*outputs*/, + input, + output); + // fix LoRA layer's transformer layer ID and model ID (to be the same as + // target module) + peft_layer->layer_guid.transformer_layer_id = + target_module->layer_guid.transformer_layer_id; + peft_layer->layer_guid.model_id = target_module->layer_guid.model_id; + // set up output tensor for OP_LORA layer + { + int numdims = output->num_dims; + int dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdims; i++) { + dims[i] = output->dims[i]; } - it = layers.insert(it + 1, peft_layer); - ++it; - base_layer_to_peft_layer[target_module] = peft_layer; - peft_layer_to_peft_id[peft_layer] = std::vector(); - peft_layer_to_peft_id[peft_layer].push_back(*peft_model_id); + peft_layer->outputs[0] = + create_tensor_legion_ordering(numdims, + dims, + output->data_type, + peft_layer, + 0, + true /*create_grad*/); } + // pass max_rank and max_concurrent_adapters to OP_LORA layer + peft_layer->add_int_property("max_rank", max_lora_rank); + peft_layer->add_int_property("max_concurrent_adapters", + max_concurrent_adapters); + it = layers.insert(it + 1, peft_layer); + ++it; + base_layer_to_peft_layer[target_module] = peft_layer; } } - - // save finetuned lora model configs to file - if (peft_config.trainable) { - std::string finetuned_model_folder = join_path({ - peft_config.cache_folder, - "finetuned_models", - peft_config.peft_model_id, - }); - 
fs::remove_all(finetuned_model_folder); - std::string finetuned_model_config_folder = join_path({ - finetuned_model_folder, - "config", - }); - fs::create_directories(finetuned_model_config_folder); - std::string lora_linear_config_filepath = join_path({ - finetuned_model_config_folder, - "ff_config.json", - }); - serialize_to_json_file(peft_config, lora_linear_config_filepath); - std::string optimizer_config_filepath = join_path({ - finetuned_model_config_folder, - "ff_optimizer_config.json", - }); - if (typeid(*peft_config.optimizer_config) == - typeid(LoraSGDOptimizerConfig)) { - LoraSGDOptimizerConfig const *sgd_config = - static_cast( - peft_config.optimizer_config); - serialize_to_json_file(*sgd_config, optimizer_config_filepath); - } else if (typeid(*peft_config.optimizer_config) == - typeid(LoraAdamOptimizerConfig)) { - LoraAdamOptimizerConfig const *adam_config = - static_cast( - peft_config.optimizer_config); - serialize_to_json_file(*adam_config, optimizer_config_filepath); - } else { - assert(false && "Optimizer not supported"); - } - } - - return peft_model_id; } Op *LoraLinear::create_operator_from_layer( FFModel &model, Layer const *layer, std::vector const &inputs) { - std::unordered_map _peft_configs; - std::vector const &peft_ids = - model.peft_layer_to_peft_id[(Layer *)layer]; - for (int i = 0; i < peft_ids.size(); i++) { - _peft_configs.emplace( - std::make_pair(peft_ids[i], model.peft_configs[peft_ids[i]])); - } + long long value; + layer->get_int_property("max_rank", value); + int max_rank = value; + layer->get_int_property("max_concurrent_adapters", value); + int max_concurrent_adapters = value; return new LoraLinear(model, layer->layer_guid, - layer->op_type, inputs[0], inputs[1], - _peft_configs, + max_rank, + max_concurrent_adapters, layer->name); } @@ -202,10 +160,10 @@ LoraLinear::LoraLinear(FFModel &model, ParallelTensor const output) : LoraLinear(model, other.layer_guid, - other.op_type, input, output, - other.peft_configs, + other.max_rank, + other.max_concurrent_adapters, other.name) {} LoraLinear::LoraLinear(FFModel &model, @@ -214,22 +172,23 @@ LoraLinear::LoraLinear(FFModel &model, char const *name) : LoraLinear(model, params.layer_guid, - params.type, inputs.first, inputs.second, - params.peft_configs, + params.max_rank, + params.max_concurrent_adapters, params.name) {} LoraLinear::LoraLinear( FFModel &model, LayerID const &_layer_guid, - OperatorType _op_type, ParallelTensor const _input, ParallelTensor const _output, - std::unordered_map const &_peft_configs, + int _max_rank, + int _max_concurrent_adapters, + // std::unordered_map const &_peft_configs, char const *name) : Op(model, - _op_type, + OP_LORA, _output->data_type, name, 2 /*inputs*/, @@ -256,9 +215,11 @@ LoraLinear::LoraLinear( outputs[0] = model.create_parallel_tensor_legion_ordering( numdim, dims, inputs[1]->data_type, this); } - for (auto const &kv : _peft_configs) { - peft_configs.insert(kv); - } + // for (auto const &kv : _peft_configs) { + // peft_configs.insert(kv); + // } + max_rank = _max_rank; + max_concurrent_adapters = _max_concurrent_adapters; // assert(check_output_input_weight_parallel_dims(allocate_weights)); } @@ -313,56 +274,6 @@ void LoraLinear::init_inference( set_opmeta_from_futuremap_inference(ff, fm, output_tensor); } -template -void load_peft_from_file(DT *ptr, - size_t num_rows, - size_t num_columns, - int num_shards, - int shard_id, - std::string filepath) { - std::ifstream in(filepath, std::ios::in | std::ios::binary); - if (!in.good()) { - printf("Could not open 
file: %s\n", filepath.c_str());
-  }
-  assert(in.good() && "incorrect weight file path");
-
-  // HuggingFace dims (serialized in row-major order)
-  // lora_A: [rank, intermediate_dim]
-  // lora_B: [hidden_dim, rank]
-  // FlexFlow dims (serialized in column-major order)
-  // lora_A: [intermediate_dim, rank]
-  // lora_B: [rank, out_dim]
-  // Tensor parallelism: shard lora_A along intermediate_dim, replicate lora_B
-  assert(num_rows % num_shards == 0);
-  size_t chunk_size = num_rows / num_shards;
-  size_t offset = (num_shards > 1) ? shard_id * chunk_size : 0;
-
-  // Allocate memory for the weight shard
-  std::vector<DT>
host_array(chunk_size * num_columns); - // Read the chunk - size_t total_size_read = 0; - for (int i = 0; i < num_columns; ++i) { - in.seekg((i * num_rows + offset) * sizeof(DT)); - in.read(reinterpret_cast(host_array.data() + i * chunk_size), - chunk_size * sizeof(DT)); - total_size_read += in.gcount(); - } - // Check weight shard size - size_t expected_data_size = chunk_size * num_columns * sizeof(DT); - if (total_size_read != expected_data_size) { - printf("load weight data error: expected %lu bytes, got: %lu bytes, data " - "size: %lu\n", - expected_data_size, - total_size_read, - sizeof(DT)); - assert(false); - } - assert(host_array.size() == chunk_size * num_columns); - // Copy weight to device memory - copy_tensor_host_to_dev(ptr, host_array.data(), chunk_size * num_columns); - in.close(); -} - /* regions[0](O): output regions[1](I): kernel @@ -428,162 +339,20 @@ OpMeta *LoraLinear::init_task(Task const *task, std::string lora_layername_substr = lora_layername.substr(0, found + searchString.length()); - for (auto const &kv : lora->peft_configs) { - PEFTModelID const &model_id = kv.first; - LoraLinearConfig const &lora_config = kv.second; - - int rank = lora_config.rank; - - int w0_num_elements = rank * in_dim; - int w1_num_elements = rank * out_dim; - // values below represent total weight sizes before sharding. Lora B is not - // sharded. - int lora_A_num_rows = in_dim * num_shards; - int lora_A_num_cols = rank; - int lora_B_num_rows = rank; - int lora_B_num_cols = out_dim; - int lora_A_num_shards = num_shards; - int lora_B_num_shards = 1; - - LoraLinearWeight weight; - weight.in_dim = in_dim; - weight.out_dim = out_dim; - weight.rank = rank; - weight.num_shards = num_shards; - PEFTWeightAllocator *allocator = m->handle.peft_weight_allocator; - weight.w0_ptr = allocator->allocate_local_weights_untyped( - model_id, w0_num_elements * data_type_size(dt)); - weight.w1_ptr = allocator->allocate_local_weights_untyped( - model_id, w1_num_elements * data_type_size(dt)); - - if (!lora_config.init_lora_weights) { - // load weights from file - std::string weights_folder_filepath = join_path({ - lora_config.cache_folder, - "weights", - lora_config.peft_model_id, - dt == DT_FLOAT ? 
"full-precision" : "half-precision", - }); - std::string w0_filepath = join_path( - {weights_folder_filepath, lora_layername_substr + "_A.weight"}); - std::string w1_filepath = join_path( - {weights_folder_filepath, lora_layername_substr + "_B.weight"}); - if (dt == DT_FLOAT) { - std::cout << "Loading LORA weight " - << lora_layername_substr + "_A.weight" - << ", num_rows: " << lora_A_num_rows - << ", num_cols: " << lora_A_num_cols - << ", num_shards: " << lora_A_num_shards - << ", shard_id: " << shard_id << std::endl; - load_peft_from_file((float *)weight.w0_ptr, - lora_A_num_rows, - lora_A_num_cols, - lora_A_num_shards, - shard_id, - w0_filepath); - std::cout << "Loading LORA weight " - << lora_layername_substr + "_B.weight" - << ", num_rows: " << lora_B_num_rows - << ", num_cols: " << lora_B_num_cols - << ", num_shards: " << lora_B_num_shards - << ", shard_id: " << shard_id << std::endl; - load_peft_from_file((float *)weight.w1_ptr, - lora_B_num_rows, - lora_B_num_cols, - lora_B_num_shards, - shard_id, - w1_filepath); - } else if (dt == DT_HALF) { - std::cout << "Loading LORA weight " - << lora_layername_substr + "_A.weight" - << ", num_rows: " << lora_A_num_rows - << ", num_cols: " << lora_A_num_cols - << ", num_shards: " << lora_A_num_shards - << ", shard_id: " << shard_id << std::endl; - load_peft_from_file((half *)weight.w0_ptr, - lora_A_num_rows, - lora_A_num_cols, - lora_A_num_shards, - shard_id, - w0_filepath); - std::cout << "Loading LORA weight " - << lora_layername_substr + "_B.weight" - << ", num_rows: " << lora_B_num_rows - << ", num_cols: " << lora_B_num_cols - << ", num_shards: " << lora_B_num_shards - << ", shard_id: " << shard_id << std::endl; - load_peft_from_file((half *)weight.w1_ptr, - lora_B_num_rows, - lora_B_num_cols, - lora_B_num_shards, + // allocate space for lora weights + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); + m->peft_memory_manager = + new PEFTMemoryManager(gpu_mem, + lora->max_rank, + lora->max_concurrent_adapters, + BatchConfig::max_sequence_length(), + in_dim, + out_dim, + num_shards, shard_id, - w1_filepath); - } else { - assert(false && "Data type not supported"); - } - } else { - // initialize weights - int seed = 0; - init_kernel_wrapper(m, seed); - } - - // allocate space for gradients if the LoRA layer is trainable - if (lora_config.trainable) { - // Ensure we have an optimizer - assert(lora_config.optimizer_config != nullptr && "Optimizer not set"); - assert(typeid(*lora_config.optimizer_config) != - typeid(LoraOptimizerConfig) && - "Optimizer config is not a subclass of LoraOptimizerConfig"); - if (lora->inputs[0]->dims[num_dims - 1].degree == 1) { - // Input is partitioned (no replication) - // w0_grad is local weight gradients - weight.w0_grad_ptr = allocator->allocate_local_weights_untyped( - model_id, w0_num_elements * data_type_size(dt)); - // w1_grad is sync weight gradients - weight.w1_grad_ptr = allocator->allocate_sync_weights_untyped( - model_id, w1_num_elements * data_type_size(dt)); - } else { - // Input is replicated - // w0_grad is sync weight gradients - weight.w0_grad_ptr = allocator->allocate_sync_weights_untyped( - model_id, w0_num_elements * data_type_size(dt)); - // w1_grad is local weight gradients - weight.w1_grad_ptr = allocator->allocate_local_weights_untyped( - model_id, w1_num_elements * data_type_size(dt)); - } - // allocate space for v_values if needed by optimizer - if (typeid(*lora_config.optimizer_config) == - typeid(LoraSGDOptimizerConfig)) { - LoraSGDOptimizerConfig const 
*sgd_config = - static_cast( - lora_config.optimizer_config); - if (sgd_config->momentum > 0.0f) { - if (lora->inputs[0]->dims[num_dims - 1].degree == 1) { - weight.w0_v_values_ptr = allocator->allocate_local_weights_untyped( - model_id, w0_num_elements * data_type_size(dt)); - weight.w1_v_values_ptr = allocator->allocate_sync_weights_untyped( - model_id, w1_num_elements * data_type_size(dt)); - } else { - weight.w0_v_values_ptr = allocator->allocate_sync_weights_untyped( - model_id, w0_num_elements * data_type_size(dt)); - weight.w1_v_values_ptr = allocator->allocate_local_weights_untyped( - model_id, w1_num_elements * data_type_size(dt)); - } - } - } else if (typeid(*lora_config.optimizer_config) == - typeid(LoraAdamOptimizerConfig)) { - assert(false && "Adam optim not yet implemented"); - } else { - assert(false && "Optimizer not supported"); - } - } - assert(m->model_state.find(model_id) == m->model_state.end()); - m->model_state[model_id].weights = weight; - m->model_state[model_id].optimizer_config = lora_config.optimizer_config; - m->model_state[model_id].lora_alpha = lora_config.lora_alpha; - m->model_state[model_id].cache_folder = lora_config.cache_folder; - m->model_state[model_id].peft_model_id = lora_config.peft_model_id; - } + lora_layername_substr, + dt); + m->peft_memory_manager->allocate_inference_memory(); return m; } @@ -655,8 +424,8 @@ void LoraLinear::inference_task(Task const *task, m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorRW( m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); - // int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; - // int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; // int num_infr_tokens = bc->num_active_infr_tokens(); // int num_peft_tokens = bc->num_active_peft_tokens(); @@ -707,12 +476,20 @@ void LoraLinear::inference_task(Task const *task, assert(false); } - int rank, num_tokens; - for (auto it = m->model_state.begin(); it != m->model_state.end(); ++it) { - PEFTModelID peft_model_id = it->first; - LoraLinearWeight weight = m->model_state[peft_model_id].weights; - rank = weight.rank; - num_tokens = input.domain.get_volume() / weight.in_dim; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i] || + bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + std::string peft_model_config_str = + std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = + LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); + if (!lora_applies_to_this_layer(m, lora_config)) { + continue; + } + LoraLinearWeight weight = m->peft_memory_manager->get_peft( + bc->requestsInfo[i].peft_model_id, lora_config); fs::path dst_filepath_weights = get_dst_folder("weights", m->decoding_step, shard_id) / layername; std::string filenameA = @@ -721,21 +498,38 @@ void LoraLinear::inference_task(Task const *task, dst_filepath_weights.string() + ".weight_B.original"; if (m->input_type[0] == DT_FLOAT) { save_tensor((float *)weight.w0_ptr, - weight.rank * weight.in_dim, + lora_config.rank * in_dim, filenameA.c_str()); save_tensor((float *)weight.w1_ptr, - weight.rank * weight.out_dim, + lora_config.rank * out_dim, filenameB.c_str()); } else if (m->input_type[0] == DT_HALF) { save_tensor((half *)weight.w0_ptr, - 
weight.rank * weight.in_dim, + lora_config.rank * in_dim, filenameA.c_str()); save_tensor((half *)weight.w1_ptr, - weight.rank * weight.out_dim, + lora_config.rank * out_dim, filenameB.c_str()); } else { assert(false && "Data type not supported"); } + + if (bc->requestsInfo[i].peft_bwd) { + int num_tokens = input.domain.get_volume() / in_dim; + // input activation (intermediate) + filename = dst_filepath.string() + ".low_rank_activation"; + if (output.data_type == DT_FLOAT) { + save_tensor((float *)weight.low_rank_activation, + lora_config.rank * num_tokens, + filename.c_str()); + } else if (output.data_type == DT_HALF) { + save_tensor((half *)weight.low_rank_activation, + lora_config.rank * num_tokens, + filename.c_str()); + } else { + assert(false); + } + } } filename = dst_filepath.string() + ".output_0"; @@ -749,21 +543,6 @@ void LoraLinear::inference_task(Task const *task, assert(false); } - if (bc->num_active_peft_tokens() > 0) { - // input activation (intermediate) - filename = dst_filepath.string() + ".low_rank_activation"; - if (output.data_type == DT_FLOAT) { - save_tensor((float *)m->low_rank_activation, - rank * num_tokens, - filename.c_str()); - } else if (output.data_type == DT_HALF) { - save_tensor((half *)m->low_rank_activation, - rank * num_tokens, - filename.c_str()); - } else { - assert(false); - } - } m->decoding_step++; } } @@ -819,6 +598,8 @@ void lora_inference_debugging(LoraLinearMeta *m, GenericTensorAccessorW input_grad, GenericTensorAccessorR output_grad, int shard_id) { + int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; // get layer name std::string lora_layername = std::string(m->op_name); std::string searchString = "lora"; @@ -852,10 +633,22 @@ void lora_inference_debugging(LoraLinearMeta *m, // weights, weights gradients fs::path dst_filepath_weights = get_dst_folder("weights", m->bwd_step, shard_id) / layername; - assert(m->model_state.size() >= 1 && "Model state empty!"); - for (auto it = m->model_state.begin(); it != m->model_state.end(); ++it) { - PEFTModelID peft_model_id = it->first; - LoraLinearWeight weight = m->model_state[peft_model_id].weights; + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i] || + bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || + !bc->requestsInfo[i].peft_bwd) { + continue; + } + std::string peft_model_config_str = + std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = + LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); + if (!lora_applies_to_this_layer(m, lora_config)) { + continue; + } + LoraLinearWeight weight = m->peft_memory_manager->get_peft( + bc->requestsInfo[i].peft_model_id, lora_config); std::string filename_weight_A = dst_filepath_weights.string() + ".weight_A.finetuned"; std::string filename_weight_B = @@ -867,36 +660,36 @@ void lora_inference_debugging(LoraLinearMeta *m, if (m->input_type[0] == DT_FLOAT) { // weight A save_tensor((float *)weight.w0_ptr, - weight.rank * weight.in_dim, + lora_config.rank * in_dim, filename_weight_A.c_str()); // weight grad A save_tensor((float *)weight.w0_grad_ptr, - weight.rank * weight.in_dim, + lora_config.rank * in_dim, filename_grad_A.c_str()); // weight B save_tensor((float *)weight.w1_ptr, - weight.rank * weight.out_dim, + lora_config.rank * out_dim, filename_weight_B.c_str()); // weight grad B save_tensor((float *)weight.w1_grad_ptr, - weight.rank * weight.out_dim, + 
lora_config.rank * out_dim, filename_grad_B.c_str()); } else if (m->input_type[0] == DT_HALF) { // weight A save_tensor((half *)weight.w0_ptr, - weight.rank * weight.in_dim, + lora_config.rank * in_dim, filename_weight_A.c_str()); // weight grad A save_tensor((half *)weight.w0_grad_ptr, - weight.rank * weight.in_dim, + lora_config.rank * in_dim, filename_grad_A.c_str()); // weight B save_tensor((half *)weight.w1_ptr, - weight.rank * weight.out_dim, + lora_config.rank * out_dim, filename_weight_B.c_str()); // weight grad B save_tensor((half *)weight.w1_grad_ptr, - weight.rank * weight.out_dim, + lora_config.rank * out_dim, filename_grad_B.c_str()); } else { assert(false && "Data type not supported"); @@ -975,62 +768,50 @@ void save_peft_weights_if_needed(LoraLinearMeta *m, } std::string lora_layername_substr = lora_layername.substr(0, found + searchString.length()); + for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { - continue; - } - // Skip non-PEFT requests - if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + if (bc->request_completed[i] || + bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || + !bc->requestsInfo[i].peft_bwd) { continue; } - // Skip PEFT forward-only requests - if (!bc->requestsInfo[i].peft_bwd) { + std::string peft_model_config_str = + std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = + LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); + if (!lora_applies_to_this_layer(m, lora_config)) { continue; } if (bc->requestsInfo[i].optimizer_tasks.save_updated_weights) { - assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != - m->model_state.end()); std::string weight_export_folder = join_path({ - m->model_state[bc->requestsInfo[i].peft_model_id].cache_folder, + lora_config.cache_folder, "finetuned_models", - m->model_state[bc->requestsInfo[i].peft_model_id].peft_model_id, + lora_config.peft_model_id, "weights", "shard_" + std::to_string(shard_id), }); fs::create_directories(weight_export_folder); - int rank = m->model_state[bc->requestsInfo[i].peft_model_id].weights.rank; + int rank = lora_config.rank; int w0_num_elements = rank * in_dim; int w1_num_elements = rank * out_dim; std::string w0_filepath = join_path( {weight_export_folder, lora_layername_substr + "_A.weight"}); std::string w1_filepath = join_path( {weight_export_folder, lora_layername_substr + "_B.weight"}); + LoraLinearWeight weight = m->peft_memory_manager->get_peft( + bc->requestsInfo[i].peft_model_id, lora_config); if (m->input_type[0] == DT_FLOAT) { - save_peft_to_file( - (float *)m->model_state[bc->requestsInfo[i].peft_model_id] - .weights.w0_ptr, - w0_num_elements, - w0_filepath); + save_peft_to_file((float *)weight.w0_ptr, w0_num_elements, w0_filepath); if (shard_id == 0) { save_peft_to_file( - (float *)m->model_state[bc->requestsInfo[i].peft_model_id] - .weights.w1_ptr, - w1_num_elements, - w1_filepath); + (float *)weight.w1_ptr, w1_num_elements, w1_filepath); } } else if (m->input_type[0] == DT_HALF) { - save_peft_to_file( - (half *)m->model_state[bc->requestsInfo[i].peft_model_id] - .weights.w0_ptr, - w0_num_elements, - w0_filepath); + save_peft_to_file((half *)weight.w0_ptr, w0_num_elements, w0_filepath); if (shard_id == 0) { save_peft_to_file( - (half *)m->model_state[bc->requestsInfo[i].peft_model_id] - .weights.w1_ptr, - w1_num_elements, - w1_filepath); + (half *)weight.w1_ptr, w1_num_elements, w1_filepath); } } else { assert(false && "Data type not supported"); @@ 
-1065,7 +846,8 @@ void LoraLinear::peft_bwd_task(Task const *task, int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; // int num_infr_tokens = bc->num_active_infr_tokens(); // int num_peft_tokens = bc->num_active_peft_tokens(); - peft_bwd_kernel_wrapper(ctx, runtime, m, bc, input_grad, output_grad); + peft_bwd_kernel_wrapper( + ctx, runtime, m, bc, shard_id, input_grad, output_grad); save_peft_weights_if_needed(m, bc, in_dim, out_dim, shard_id); @@ -1098,14 +880,9 @@ bool LoraLinear::measure_operator_cost(Simulator *sim, } bool operator==(LoraLinearParams const &lhs, LoraLinearParams const &rhs) { - if (lhs.layer_guid == rhs.layer_guid && lhs.type == rhs.type && - lhs.peft_configs.size() == rhs.peft_configs.size()) { - for (auto const &kv : lhs.peft_configs) { - auto it = rhs.peft_configs.find(kv.first); - if (it == rhs.peft_configs.end() || !(it->second == kv.second)) { - return false; - } - } + if (lhs.layer_guid == rhs.layer_guid && lhs.max_rank == rhs.max_rank && + lhs.max_concurrent_adapters == rhs.max_concurrent_adapters && + strcmp(lhs.name, rhs.name) == 0) { return true; } return false; @@ -1144,48 +921,8 @@ void LoraLinear::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); sez.serialize(this->layer_guid.model_id); - sez.serialize(this->op_type); - sez.serialize(this->peft_configs.size()); - for (auto const &kv : this->peft_configs) { - // Serialize PEFTModelID - sez.serialize(kv.first.id); - - // Serialize LoraLinearConfig and OptimizerConfig to tmp folder - // 1. Create tmp dir and serialize it - fs::path unique_temp_dir = create_unique_temp_directory(); - serialize_string(sez, unique_temp_dir.string()); - // 2. Dump LoraLinearConfig to json file in tmp dir - std::string lora_config_filename = std::string("lora_linear_config_") + - std::to_string(kv.first.id) + - std::string(".json"); - fs::path lora_config_json_filepath = unique_temp_dir / lora_config_filename; - serialize_to_json_file(kv.second, lora_config_json_filepath); - // 3. 
Dump optimizer to json file in tmp dir, and serialize optimizer type - std::string optimizer_filename = std::string("optimizer_config_") + - std::to_string(kv.first.id) + - std::string(".json"); - fs::path optim_config_filepath = unique_temp_dir / optimizer_filename; - assert((kv.second.trainable) == (kv.second.optimizer_config != nullptr)); - if (kv.second.trainable) { - if (typeid(*kv.second.optimizer_config) == - typeid(LoraSGDOptimizerConfig)) { - sez.serialize(OPTIMIZER_TYPE_SGD); - LoraSGDOptimizerConfig const *sgd_config = - static_cast( - kv.second.optimizer_config); - serialize_to_json_file(*sgd_config, optim_config_filepath); - } else if (typeid(*kv.second.optimizer_config) == - typeid(LoraAdamOptimizerConfig)) { - sez.serialize(OPTIMIZER_TYPE_ADAM); - LoraAdamOptimizerConfig const *adam_config = - static_cast( - kv.second.optimizer_config); - serialize_to_json_file(*adam_config, optim_config_filepath); - } else { - assert(false && "Optimizer type not yet supported"); - } - } - } + sez.serialize(this->max_rank); + sez.serialize(this->max_concurrent_adapters); sez.serialize(strlen(this->name)); sez.serialize(this->name, strlen(this->name)); } @@ -1198,8 +935,9 @@ Node LoraLinear::deserialize(FFModel &ff, int num_inputs) { assert(num_inputs == 2); size_t id, transformer_layer_id, deserialized_model_id; - OperatorType op_type; - size_t num_pefts; + int max_rank, max_concurrent_adapters; + // OperatorType op_type; + // size_t num_pefts; size_t name_len; char name[MAX_OPNAME] = {0}; @@ -1208,62 +946,16 @@ Node LoraLinear::deserialize(FFModel &ff, dez.deserialize(id); dez.deserialize(transformer_layer_id); dez.deserialize(deserialized_model_id); - dez.deserialize(op_type); - dez.deserialize(num_pefts); - for (int i = 0; i < num_pefts; i++) { - // Deserialize PEFTModelID - size_t pid; - dez.deserialize(pid); - PEFTModelID peft_model_id(pid); - // Deserialize tmp folder containing LoraLinearConfig and optimizer config - fs::path unique_temp_dir = fs::path(deserialize_string(dez)); - // 1. Deserialize LoraLinearConfig - std::string lora_config_filename = std::string("lora_linear_config_") + - std::to_string(pid) + - std::string(".json"); - fs::path lora_config_json_filepath = unique_temp_dir / lora_config_filename; - std::unique_ptr lora_linear_config = - deserialize_from_json_file(lora_config_json_filepath); - // 2. 
Deserialize optimizer if needed - if (lora_linear_config->trainable) { - std::string optimizer_filename = std::string("optimizer_config_") + - std::to_string(pid) + - std::string(".json"); - fs::path optim_config_filepath = unique_temp_dir / optimizer_filename; - OptimizerType type_; - dez.deserialize(type_); - if (type_ == OPTIMIZER_TYPE_SGD) { - std::unique_ptr sgd_optimizer_config = - deserialize_from_json_file( - optim_config_filepath); - lora_linear_config->optimizer_config = - dynamic_cast(sgd_optimizer_config.release()); - } else if (type_ == OPTIMIZER_TYPE_ADAM) { - std::unique_ptr adam_optimizer_config = - deserialize_from_json_file( - optim_config_filepath); - lora_linear_config->optimizer_config = - dynamic_cast( - adam_optimizer_config.release()); - } else { - printf("Optimizer type: %d\n", type_); - assert(false && "Optimizer type not yet supported"); - } - } - try { - fs::remove_all(unique_temp_dir); - } catch (fs::filesystem_error const &e) { - std::cerr << "Error removing tmp directory: " << e.what() << std::endl; - } - params.peft_configs.emplace( - std::make_pair(peft_model_id, *lora_linear_config)); - } + dez.deserialize(max_rank); + dez.deserialize(max_concurrent_adapters); dez.deserialize(name_len); dez.deserialize(name, name_len); LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); params.layer_guid = layer_guid; - params.type = op_type; + // params.type = op_type; + params.max_rank = max_rank; + params.max_concurrent_adapters = max_concurrent_adapters; strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); } @@ -1278,11 +970,13 @@ Op *LoraLinear::materialize(FFModel &ff, LoraLinearParams LoraLinear::get_params() const { LoraLinearParams params; params.layer_guid = this->layer_guid; - params.type = this->op_type; + params.max_rank = this->max_rank; + params.max_concurrent_adapters = this->max_concurrent_adapters; + // params.type = this->op_type; if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } - params.peft_configs = this->peft_configs; + // params.peft_configs = this->peft_configs; return params; } @@ -1301,17 +995,8 @@ size_t hash::operator()( hash_combine(key, params.layer_guid.id); hash_combine(key, params.layer_guid.transformer_layer_id); hash_combine(key, params.layer_guid.model_id); - for (auto const &kv : params.peft_configs) { - hash_combine(key, kv.first.id); - hash_combine(key, kv.second.rank); - hash_combine(key, kv.second.trainable); - hash_combine(key, kv.second.cache_folder); - hash_combine(key, kv.second.peft_model_id); - hash_combine(key, kv.second.lora_alpha); - hash_combine(key, kv.second.lora_dropout); - hash_combine(key, kv.second.target_modules); - hash_combine(key, kv.second.init_lora_weights); - } + hash_combine(key, params.max_rank); + hash_combine(key, params.max_concurrent_adapters); return key; } }; // namespace std diff --git a/src/ops/lora_linear_params.cc b/src/ops/lora_linear_params.cc index 6e0c60e057..69c0081ec9 100644 --- a/src/ops/lora_linear_params.cc +++ b/src/ops/lora_linear_params.cc @@ -12,6 +12,17 @@ namespace FlexFlow { // empty optimizer LoraOptimizerConfig::LoraOptimizerConfig() {} +LoraOptimizerConfig *LoraOptimizerConfig::fromJson(nlohmann::json const &j) { + std::string type = j["type"]; + if (type == "SGD") { + return LoraSGDOptimizerConfig::fromJson(j); + } + if (type == "Adam") { + return LoraAdamOptimizerConfig::fromJson(j); + } + throw std::runtime_error("Unknown optimizer type"); +} + // SGD optimizer 
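// Illustrative sketch (not part of this diff): the optimizer configs in this file
// now round-trip through nlohmann::json, with LoraOptimizerConfig::fromJson
// dispatching on the "type" field. Assuming the defaults shown below:
//
//   LoraSGDOptimizerConfig sgd;                      // lr = 0.001, momentum = 0.0
//   nlohmann::json j = sgd.toJson();                 // {"type":"SGD","lr":0.001,...}
//   LoraOptimizerConfig *restored = LoraOptimizerConfig::fromJson(j);
//   // restored points to a heap-allocated LoraSGDOptimizerConfig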
LoraSGDOptimizerConfig::LoraSGDOptimizerConfig() : lr(0.001f), momentum(0.0f), nesterov(false), weight_decay(0.0f) {} @@ -30,6 +41,24 @@ std::ostream &operator<<(std::ostream &os, LoraSGDOptimizerConfig const &llc) { return os; } +nlohmann::json LoraSGDOptimizerConfig::toJson() const { + return {{"type", "SGD"}, + {"lr", lr}, + {"momentum", momentum}, + {"nesterov", nesterov}, + {"weight_decay", weight_decay}}; +} + +LoraSGDOptimizerConfig * + LoraSGDOptimizerConfig::fromJson(nlohmann::json const &j) { + LoraSGDOptimizerConfig *sgd = new LoraSGDOptimizerConfig(); + sgd->lr = j["lr"]; + sgd->momentum = j["momentum"]; + sgd->nesterov = j["nesterov"]; + sgd->weight_decay = j["weight_decay"]; + return sgd; +} + // Adam optimizer LoraAdamOptimizerConfig::LoraAdamOptimizerConfig() : alpha(0.001f), beta1(0.9f), beta2(0.999f), weight_decay(0.0f), @@ -50,38 +79,26 @@ std::ostream &operator<<(std::ostream &os, LoraAdamOptimizerConfig const &llc) { return os; } -// Serialization helpers -template -void serialize_to_json_file(T const &obj, fs::path const &filepath) { - json j = obj; - std::ofstream file(filepath); - file << j.dump(4); +nlohmann::json LoraAdamOptimizerConfig::toJson() const { + return {{"type", "Adam"}, + {"alpha", alpha}, + {"beta1", beta1}, + {"beta2", beta2}, + {"weight_decay", weight_decay}, + {"epsilon", epsilon}}; } -template -std::unique_ptr deserialize_from_json_file(fs::path const &filepath) { - std::ifstream file(filepath); - json j; - file >> j; - return std::make_unique(j.get()); +LoraAdamOptimizerConfig * + LoraAdamOptimizerConfig::fromJson(nlohmann::json const &j) { + LoraAdamOptimizerConfig *adam = new LoraAdamOptimizerConfig(); + adam->alpha = j["alpha"]; + adam->beta1 = j["beta1"]; + adam->beta2 = j["beta2"]; + adam->weight_decay = j["weight_decay"]; + adam->epsilon = j["epsilon"]; + return adam; } -template void - serialize_to_json_file(LoraLinearConfig const &obj, - fs::path const &filepath); -template void serialize_to_json_file( - LoraSGDOptimizerConfig const &obj, fs::path const &filepath); -template void serialize_to_json_file( - LoraAdamOptimizerConfig const &obj, fs::path const &filepath); -template std::unique_ptr - deserialize_from_json_file(fs::path const &filepath); -template std::unique_ptr - deserialize_from_json_file( - fs::path const &filepath); -template std::unique_ptr - deserialize_from_json_file( - fs::path const &filepath); - // ------------------ LoRA configs ------------------- // --------------------------------------------------- const LoraLinearConfig LoraLinearConfig::EmptyConfig = LoraLinearConfig("", ""); @@ -218,4 +235,76 @@ std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc) { return os; } -}; // namespace FlexFlow +double ToThreeDecimalPlaces(float f) { + double d = static_cast(f); + int i; + if (d >= 0) { + i = static_cast(d * 1000 + 0.5); + } else { + i = static_cast(d * 1000 - 0.5); + } + return (i / 1000.0); +} + +std::string LoraLinearConfig::serialize_to_json_string(int indent) const { + nlohmann::json j = {{"cache_folder", cache_folder}, + {"peft_model_id", peft_model_id}, + {"rank", rank}, + {"lora_alpha", ToThreeDecimalPlaces(lora_alpha)}, + {"lora_dropout", ToThreeDecimalPlaces(lora_dropout)}, + {"target_modules", target_modules}, + {"trainable", trainable}, + {"init_lora_weights", init_lora_weights}, + {"base_model_name_or_path", base_model_name_or_path}, + {"precision", precision}, + {"optimizer_config", + optimizer_config + ? 
nlohmann::json(optimizer_config->toJson()) + : nlohmann::json()}}; + + return j.dump(indent); // No indentation +} + +void LoraLinearConfig::serialize_to_json_file( + std::string const &filename) const { + std::string j = serialize_to_json_string(4); + std::ofstream file(filename); + file << j; +} + +// Deserialization method +LoraLinearConfig LoraLinearConfig::deserialize_from_json_string( + std::string const &json_string) { + // std::cout << "Attempting to deserialize from JSON string: " << json_string + // << std::endl; + nlohmann::json j = nlohmann::json::parse(json_string); + LoraOptimizerConfig *optimizer_config_ = nullptr; + if (!j["optimizer_config"].is_null()) { + optimizer_config_ = LoraOptimizerConfig::fromJson(j["optimizer_config"]); + } + LoraLinearConfig config = LoraLinearConfig::EmptyConfig; + config.cache_folder = j["cache_folder"].get(); + config.peft_model_id = j["peft_model_id"].get(); + config.rank = j["rank"].get(); + config.lora_alpha = j["lora_alpha"].get(); + config.lora_dropout = j["lora_dropout"].get(); + config.target_modules = j["target_modules"].get>(); + config.trainable = j["trainable"].get(); + config.init_lora_weights = j["init_lora_weights"].get(); + config.base_model_name_or_path = + j["base_model_name_or_path"].get(); + config.precision = j["precision"].get(); + config.optimizer_config = optimizer_config_; + return config; +} + +// Deserialization method +LoraLinearConfig + LoraLinearConfig::deserialize_from_json_file(std::string const &filename) { + std::ifstream file(filename); + std::string j; + file >> j; + return deserialize_from_json_string(j); +} + +}; // namespace FlexFlow \ No newline at end of file diff --git a/src/runtime/fftype.cc b/src/runtime/fftype.cc index 8213726e8a..31937cef66 100644 --- a/src/runtime/fftype.cc +++ b/src/runtime/fftype.cc @@ -46,6 +46,10 @@ bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs) { return lhs.id == rhs.id; } +bool operator!=(PEFTModelID const &lhs, PEFTModelID const &rhs) { + return !(lhs == rhs); +} + std::ostream &operator<<(std::ostream &os, PEFTModelID const &peft_model_id) { if (peft_model_id == PEFTModelID::NO_ID) { os << "NO_ID"; diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index e73893475c..3ebe6cf095 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -16,6 +16,7 @@ #include "flexflow/utils/file_loader.h" #include "flexflow/ffconst_utils.h" #include "flexflow/inference.h" +#include "flexflow/model.h" #include using namespace std; @@ -851,35 +852,70 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, delete data; } -void FileDataLoader::load_weights(FFModel *ff) { +void FileDataLoader::load_weight_task( + Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime) { + WeightLoadTaskArgs const *args = (WeightLoadTaskArgs const *)task->args; + + switch (args->data_type) { + case DT_HALF: { + args->loader->load_single_weight_tensor( + args->ff, args->layer, args->weight_idx); + break; + } + case DT_FLOAT: { + args->loader->load_single_weight_tensor( + args->ff, args->layer, args->weight_idx); + break; + } + case DT_INT4: + case DT_INT8: { + args->loader->load_quantization_weight( + args->ff, args->layer, args->weight_idx); + break; + } + default: + assert(false && "Unsupported data type"); + } +} + +void FileDataLoader::load_weights_parallel(FFModel *ff, + Context ctx, + Runtime *runtime) { + std::vector futures; + for (Layer *l : ff->layers) { if (l->numWeights < 1 || l->name == NULL || 
strlen(l->name) < 1) { continue; } + for (int i = 0; i < l->numWeights; i++) { Tensor weight = l->weights[i]; if (weight == NULL) { continue; } - // TODO: currently skip Lora layers + if (l->op_type == OP_LORA) { continue; } - switch (weight->data_type) { - case DT_HALF: - load_single_weight_tensor(ff, l, i); - break; - case DT_FLOAT: - load_single_weight_tensor(ff, l, i); - break; - case DT_INT4: - case DT_INT8: - // load weights in quantization - load_quantization_weight(ff, l, i); - break; - default: - assert(false && "Unsupported data type"); + + if (weight->data_type != DT_FLOAT && weight->data_type != DT_HALF && + weight->data_type != DT_INT4 && weight->data_type != DT_INT8) { + assert(false && "Unsupported data type"); } + + // Create task arguments + WeightLoadTaskArgs args(ff, this, l, i, weight->data_type); + TaskLauncher launcher(LOAD_WEIGHT_TASK_ID, + TaskArgument(&args, sizeof(WeightLoadTaskArgs))); + futures.push_back(runtime->execute_task(ctx, launcher)); } } + + // Wait for all tasks to complete + for (Future &f : futures) { + f.get_void_result(); + } } diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index f39ea91f28..45b6ba0db8 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -273,7 +273,9 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { } reset_inputs.insert(op->inputs[i]->region); } else { - reset_inputs.insert(op->inputs[i]->region); + if (op->op_type != OP_LORA) { + reset_inputs.insert(op->inputs[i]->region); + } } } } diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 417cd2c056..2a95caf6cb 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1550,8 +1550,6 @@ FFRuntime::FFRuntime(FFConfig &config) { config.cpu_offload ? config.offload_reserve_space_size : 0; info.peft_activation_reserve_space_size = config.enable_peft ? config.peft_activation_reserve_space_size : 0; - info.peft_weight_reserve_space_size = - config.enable_peft ? 
config.peft_weight_reserve_space_size : 0; info.quantization_type = config.quantization_type; info.allowTensorOpMathConversion = config.allow_tensor_op_math_conversion; argmap.set_point(*it, TaskArgument(&info, sizeof(FFInitInfo))); @@ -3423,62 +3421,29 @@ bool FFModel::need_to_add_combine(int layer_idx) const { bool FFModel::need_to_add_allreduce(int layer_idx) const { auto const &l = layers[layer_idx]; if (config.computationMode == COMP_MODE_INFERENCE && - config.tensor_parallelism_degree > 1 && - ( - // l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || - // l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || - (std::string(l->name).find("attn.o_proj") != std::string::npos) || - // mlp layer - is_mlp_block(layer_idx) || - // llama mlp layer - (l->op_type == OP_LINEAR && layer_idx >= 2 && - layers[layer_idx - 1]->op_type == OP_GELU && - layers[layer_idx - 2]->op_type == OP_LINEAR) || - // LLAMA without element-wise operator fusion - (l->op_type == OP_LINEAR && layer_idx >= 5 && - layers[layer_idx - 1]->op_type == OP_EW_MUL && - layers[layer_idx - 2]->op_type == OP_EW_MUL && - layers[layer_idx - 3]->op_type == OP_SIGMOID && - layers[layer_idx - 4]->op_type == OP_LINEAR && - layers[layer_idx - 5]->op_type == OP_LINEAR) || - // LLAMA with element-wise operator fusion - (l->op_type == OP_LINEAR && layer_idx >= 3 && - layers[layer_idx - 1]->op_type == OP_SIGMOID_SILU_MULTI && - layers[layer_idx - 2]->op_type == OP_LINEAR && - layers[layer_idx - 3]->op_type == OP_LINEAR))) { + config.tensor_parallelism_degree > 1 && l->op_type == OP_LINEAR && + (/*llama/mpt attention*/ + (std::string(l->name).find("attn.o_proj") != std::string::npos) || + /*opt/starcoder attention*/ + (std::string(l->name).find("self_attn.o_proj") != std::string::npos) || + /*falcon attention*/ + (std::string(l->name).find("self_attention.o_proj") != + std::string::npos) || + /*llama mlp*/ + (std::string(l->name).find("mlp.down_proj") != std::string::npos) || + /*opt mlp*/ + (std::string(l->name).find("fc2") != std::string::npos) || + /*falcon mlp*/ + (std::string(l->name).find("mlp.dense_4h_to_h") != std::string::npos) || + /*mpt mlp*/ + (std::string(l->name).find("ffn.down_proj") != std::string::npos) || + /*starcoder mlp*/ + (std::string(l->name).find("mlp.c_proj") != std::string::npos))) { return true; } return false; } -#ifdef DEADCODE -bool FFModel::need_to_add_parallel_identity(int layer_idx) const { - auto const &l = layers[layer_idx]; - // add parallel identity (allreduce in the backward pass) before the lm head - // we find the lm head by looking for the linear layer right after a residual - // rms norm / layer norm, and before a softmax, followed by - // argmax/argtopk/sampling - if (config.computationMode == COMP_MODE_INFERENCE && - config.tensor_parallelism_degree > 1 && - ((l->op_type == OP_RESIDUAL_RMS_NORM || - l->op_type == OP_RESIDUAL_LAYERNORM) && - // there are at least 2 layers before the norm, and at least 3 following - // the norm - layer_idx >= 2 && layer_idx < layers.size() - 3 && - // norm is followed by linear layer (lm head) - layers[layer_idx + 1]->op_type == OP_LINEAR && - // lm head is followed by softmax - layers[layer_idx + 2]->op_type == OP_SOFTMAX && - // softmax is followed by argmax/argtopk/sampling - (layers[layer_idx + 3]->op_type == OP_ARG_TOPK || - layers[layer_idx + 3]->op_type == OP_SAMPLING || - layers[layer_idx + 3]->op_type == OP_ARGMAX || - layers[layer_idx + 3]->op_type == OP_SCALAR_TRUE_DIV))) { - return true; - } - return false; -} -#endif bool 
FFModel::need_to_add_parallel_identity(int layer_idx) const { auto const &l = layers[layer_idx]; // add parallel identity (allreduce in the backward pass) before the lm head @@ -4400,7 +4365,6 @@ FFConfig::FFConfig() { enable_peft = DefaultConfig::enablePeft; peft_activation_reserve_space_size = DefaultConfig::peftActivationReserveSpaceSize; - peft_weight_reserve_space_size = DefaultConfig::peftWeightReserveSpaceSize; quantization_type = DT_NONE; only_data_parallel = DefaultConfig::onlyDataParallel; data_parallelism_degree = 1; @@ -4535,10 +4499,6 @@ void FFConfig::parse_args(char **argv, int argc) { peft_activation_reserve_space_size = atoll(argv[++i]) * 1024 * 1024; continue; } - if (!strcmp(argv[i], "-peft-weight-reserve-space-size")) { - peft_weight_reserve_space_size = atoll(argv[++i]) * 1024 * 1024; - continue; - } if ((!strcmp(argv[i], "--only-data-parallel"))) { only_data_parallel = true; continue; @@ -4852,6 +4812,20 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(LOAD_WEIGHT_TASK_ID, "load_weight_task"); + registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "load_weight_task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } #endif // ElementUnary task { diff --git a/src/runtime/model.cu b/src/runtime/model.cu index 5dab73e1a4..3a250539c7 100644 --- a/src/runtime/model.cu +++ b/src/runtime/model.cu @@ -168,7 +168,7 @@ FFHandler } else { handle.batch_config_metadata = nullptr; } - + // #ifdef DEADCODE if (info->peft_activation_reserve_space_size > 0) { // allocate memory for peft activation reserve space Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) @@ -182,33 +182,8 @@ FFHandler } else { handle.peft_activation_allocator = nullptr; } - - if (info->peft_weight_reserve_space_size > 0) { - // allocate memory for peft weight reserve space - Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::GPU_FB_MEM) - .best_affinity_to(task->target_proc) - .first(); - Realm::Rect<1, coord_t> bounds( - Realm::Point<1, coord_t>(0), - Realm::Point<1, coord_t>(info->peft_weight_reserve_space_size - 1)); - std::vector field_sizes; - field_sizes.push_back(sizeof(char)); - Realm::RegionInstance workspaceInst; - Realm::RegionInstance::create_instance(workspaceInst, - gpu_mem, - bounds, - field_sizes, - 0, - Realm::ProfilingRequestSet()) - .wait(); - void *ptr = workspaceInst.pointer_untyped(0, sizeof(char)); - handle.peft_weight_allocator = - new PEFTWeightAllocator(ptr, info->peft_weight_reserve_space_size); - } else { - handle.peft_weight_allocator = nullptr; - } - // checkCUDA(cudaMalloc(&handle.workSpace, handle.workSpaceSize)); +// #endif +// checkCUDA(cudaMalloc(&handle.workSpace, handle.workSpaceSize)); #ifdef FF_USE_NCCL handle.ncclComm = NULL; #endif diff --git a/src/runtime/peft_weight_allocator.cc b/src/runtime/peft_weight_allocator.cc new file mode 100644 index 0000000000..1fcef3678e --- /dev/null +++ b/src/runtime/peft_weight_allocator.cc @@ -0,0 +1,319 @@ +#include "flexflow/utils/peft_weight_allocator.h" + +namespace FlexFlow { +// declare legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::InlineLauncher; +using Legion::Machine; +using Legion::Memory; +using 
Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; + +void PEFTMemoryManager::allocate_inference_memory() { + // allocate chunk of memory for all the PEFT adapters + Realm::Rect<1, coord_t> bounds( + Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(max_lora_size * max_concurrent_adapters - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance::create_instance(peftLegionInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + base_ptr = peftLegionInst.pointer_untyped(0, sizeof(char)); +} + +void PEFTMemoryManager::allocate_finetuning_memory() { + size_t ft_size = max_lora_size * 3; // weights, gradients, momentum values + ft_size += max_peft_tokens * (in_dim + max_rank) * + data_type_size(dt); // input, low-rank activations + // allocate chunk of memory for PEFT adapter + Realm::Rect<1, coord_t> bounds(Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(ft_size - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance::create_instance(peftLegionInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + finetuning_ptr = peftLegionInst.pointer_untyped(0, sizeof(char)); +} + +void PEFTMemoryManager::get_finetuning_slot(PEFTModelID const &model_id, + bool *cache_miss) { + if (finetuning_ptr == nullptr) { + allocate_finetuning_memory(); + } + assert(finetuning_ptr != nullptr && + "PEFT Memory Manager finetuning_ptr is null"); + *cache_miss = (model_id.id != finetuning_model_id.id); + finetuning_model_id = model_id; +} + +int PEFTMemoryManager::get_inference_peft_slot(PEFTModelID const &model_id, + bool *cache_miss) { + assert(base_ptr != nullptr && "PEFT Memory Manager not initialized"); + assert(lru_hashtable.size() == lru_list.size() && + lru_list.size() == peft2mem_slot.size() && + "PEFT Memory Manager LRU hashtable/list and/or peft2mem_slot are out " + "of sync"); + // check for cache hit + if (lru_hashtable.find(model_id) != lru_hashtable.end()) { + int lru_list_index = lru_hashtable[model_id]; + assert(lru_list[lru_list_index] == model_id && + "PEFT Memory Manager LRU hashtable/list are out of sync"); + // move the model to the end of the LRU list + lru_list.erase(lru_list.begin() + lru_list_index); + lru_list.push_back(model_id); + // update the LRU hashtable + lru_hashtable[model_id] = lru_list.size() - 1; + // get memory slot + assert(peft2mem_slot.find(model_id) != peft2mem_slot.end() && + "PEFT Memory Manager peft2mem_slot is out of sync"); + *cache_miss = false; + } else { + // cache miss + // check if you need to evict + bool need_to_evict = lru_list.size() == max_concurrent_adapters; + int mem_slot = -1; + if (need_to_evict) { + // evict the least recently used model + PEFTModelID lru_model_id = lru_list[0]; + lru_list.erase(lru_list.begin()); + lru_hashtable.erase(lru_model_id); + mem_slot = peft2mem_slot[lru_model_id]; + peft2mem_slot.erase(lru_model_id); + } else { + mem_slot = lru_list.size(); + } + // update the LRU list and hashtable + lru_list.push_back(model_id); + lru_hashtable[model_id] = lru_list.size() - 1; + // update the memory slot + peft2mem_slot[model_id] = mem_slot; + *cache_miss = true; + } + assert(peft2mem_slot.find(model_id) != peft2mem_slot.end() && + "PEFT Memory Manager peft2mem_slot is out of sync"); + int slot = peft2mem_slot[model_id]; + 
assert(slot >= 0 && slot < max_concurrent_adapters &&
+         "PEFT Memory Manager peft2mem_slot is out of bounds");
+  return slot;
+}
+
+template <typename DT>
+void load_peft_from_file(DT *ptr,
+                         size_t num_rows,
+                         size_t num_columns,
+                         int num_shards,
+                         int shard_id,
+                         std::string filepath) {
+  std::ifstream in(filepath, std::ios::in | std::ios::binary);
+  if (!in.good()) {
+    printf("Could not open file: %s\n", filepath.c_str());
+  }
+  assert(in.good() && "incorrect weight file path");
+
+  // HuggingFace dims (serialized in row-major order)
+  // lora_A: [rank, intermediate_dim]
+  // lora_B: [hidden_dim, rank]
+  // FlexFlow dims (serialized in column-major order)
+  // lora_A: [intermediate_dim, rank]
+  // lora_B: [rank, out_dim]
+  // Tensor parallelism: shard lora_A along intermediate_dim, replicate lora_B
+  assert(num_rows % num_shards == 0);
+  size_t chunk_size = num_rows / num_shards;
+  size_t offset = (num_shards > 1) ? shard_id * chunk_size : 0;
+
+  // Allocate memory for the weight shard
+  std::vector<DT>
host_array(chunk_size * num_columns); + // Read the chunk + size_t total_size_read = 0; + for (int i = 0; i < num_columns; ++i) { + in.seekg((i * num_rows + offset) * sizeof(DT)); + in.read(reinterpret_cast(host_array.data() + i * chunk_size), + chunk_size * sizeof(DT)); + total_size_read += in.gcount(); + } + // Check weight shard size + size_t expected_data_size = chunk_size * num_columns * sizeof(DT); + if (total_size_read != expected_data_size) { + printf("load weight data error: expected %lu bytes, got: %lu bytes, data " + "size: %lu\n", + expected_data_size, + total_size_read, + sizeof(DT)); + assert(false); + } + assert(host_array.size() == chunk_size * num_columns); + // Copy weight to device memory + copy_tensor_host_to_dev(ptr, host_array.data(), chunk_size * num_columns); + in.close(); +} + +void PEFTMemoryManager::load_peft_model(LoraLinearWeight &weight, + LoraLinearConfig const &lora_config) { + // Load weights + assert(weight.w0_ptr != nullptr && weight.w1_ptr != nullptr && + "PEFT Memory Manager weight ptr null"); + int w0_num_elements = lora_config.rank * in_dim; + int w1_num_elements = lora_config.rank * out_dim; + // values below represent total weight sizes before sharding. Lora B is not + // sharded. + int lora_A_num_rows = in_dim * num_shards; + int lora_A_num_cols = lora_config.rank; + int lora_B_num_rows = lora_config.rank; + int lora_B_num_cols = out_dim; + int lora_A_num_shards = num_shards; + int lora_B_num_shards = 1; + if (lora_config.init_lora_weights) { + // initialize weights randomly + int seed = 0; + init_peft_weight_wrapper( + weight, in_dim, out_dim, lora_config.rank, dt, seed); + } else { + // load weights from file + std::string weights_folder_filepath = join_path({ + lora_config.cache_folder, + "weights", + lora_config.peft_model_id, + dt == DT_FLOAT ? 
"full-precision" : "half-precision", + }); + std::string w0_filepath = join_path( + {weights_folder_filepath, lora_layername_substr + "_A.weight"}); + std::string w1_filepath = join_path( + {weights_folder_filepath, lora_layername_substr + "_B.weight"}); + if (dt == DT_FLOAT) { + std::cout << "Loading LORA weight " << lora_layername_substr + "_A.weight" + << ", num_rows: " << lora_A_num_rows + << ", num_cols: " << lora_A_num_cols + << ", num_shards: " << lora_A_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((float *)weight.w0_ptr, + lora_A_num_rows, + lora_A_num_cols, + lora_A_num_shards, + shard_id, + w0_filepath); + std::cout << "Loading LORA weight " << lora_layername_substr + "_B.weight" + << ", num_rows: " << lora_B_num_rows + << ", num_cols: " << lora_B_num_cols + << ", num_shards: " << lora_B_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((float *)weight.w1_ptr, + lora_B_num_rows, + lora_B_num_cols, + lora_B_num_shards, + shard_id, + w1_filepath); + } else if (dt == DT_HALF) { + std::cout << "Loading LORA weight " << lora_layername_substr + "_A.weight" + << ", num_rows: " << lora_A_num_rows + << ", num_cols: " << lora_A_num_cols + << ", num_shards: " << lora_A_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((half *)weight.w0_ptr, + lora_A_num_rows, + lora_A_num_cols, + lora_A_num_shards, + shard_id, + w0_filepath); + std::cout << "Loading LORA weight " << lora_layername_substr + "_B.weight" + << ", num_rows: " << lora_B_num_rows + << ", num_cols: " << lora_B_num_cols + << ", num_shards: " << lora_B_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((half *)weight.w1_ptr, + lora_B_num_rows, + lora_B_num_cols, + lora_B_num_shards, + shard_id, + w1_filepath); + } else { + assert(false && "Data type not supported"); + } + } +} + +LoraLinearWeight + PEFTMemoryManager::get_inference_peft(PEFTModelID const &model_id, + LoraLinearConfig const &lora_config) { + assert(model_id != PEFTModelID::NO_ID && "PEFT Model ID is not set"); + bool cache_miss; + int mem_slot = get_inference_peft_slot(model_id, &cache_miss); + int w0_num_elements = lora_config.rank * in_dim; + int data_size = data_type_size(dt); + LoraLinearWeight result; + result.w0_ptr = static_cast(base_ptr) + mem_slot * max_lora_size; + result.w1_ptr = + static_cast(result.w0_ptr) + w0_num_elements * data_size; + if (cache_miss) { + load_peft_model(result, lora_config); + } + return result; +} + +LoraLinearWeight PEFTMemoryManager::get_finetuning_peft( + PEFTModelID const &model_id, LoraLinearConfig const &lora_config) { + assert(model_id != PEFTModelID::NO_ID && "PEFT Model ID is not set"); + bool cache_miss; + get_finetuning_slot(model_id, &cache_miss); + int w0_num_elements = lora_config.rank * in_dim; + int w1_num_elements = lora_config.rank * out_dim; + int data_size = data_type_size(dt); + LoraLinearWeight result; + result.w0_ptr = finetuning_ptr; + result.w1_ptr = + static_cast(result.w0_ptr) + w0_num_elements * data_size; + result.w0_grad_ptr = + static_cast(result.w1_ptr) + w1_num_elements * data_size; + result.w1_grad_ptr = + static_cast(result.w0_grad_ptr) + w0_num_elements * data_size; + result.w0_v_values_ptr = + static_cast(result.w1_grad_ptr) + w1_num_elements * data_size; + result.w1_v_values_ptr = + static_cast(result.w0_v_values_ptr) + w0_num_elements * data_size; + result.input_activation = + static_cast(result.w1_v_values_ptr) + + w1_num_elements * data_size; // max_peft_tokens*in_dim + 
result.low_rank_activation = + static_cast(result.input_activation) + + max_peft_tokens * in_dim * data_size; // max_peft_tokens*rank + if (cache_miss) { + load_peft_model(result, lora_config); + } + return result; +} + +LoraLinearWeight + PEFTMemoryManager::get_peft(PEFTModelID const &model_id, + LoraLinearConfig const &lora_config) { + if (lora_config.trainable) { + return get_finetuning_peft(model_id, lora_config); + } else { + return get_inference_peft(model_id, lora_config); + } +} + +void PEFTMemoryManager::check_ft_model_id(PEFTModelID const &model_id) { + assert(finetuning_model_id == model_id && "PEFT bwd model is not in memory!"); +} + +}; // namespace FlexFlow \ No newline at end of file diff --git a/src/runtime/peft_weight_allocator.cu b/src/runtime/peft_weight_allocator.cu new file mode 100644 index 0000000000..3c4ea91db3 --- /dev/null +++ b/src/runtime/peft_weight_allocator.cu @@ -0,0 +1,80 @@ + + +#include "flexflow/ops/kernels/decompress_kernels.h" +#include "flexflow/utils/cuda_helper.h" +#include "flexflow/utils/peft_weight_allocator.h" +#include +#include +namespace FlexFlow { + +template +void lora_init_kernel(LoraLinearWeight const &weight, + int in_dim, + int out_dim, + int rank, + int seed, + cudaStream_t stream) { + // Initialize generator + std::mt19937 gen(seed); + + // Get handle to weights by iterating over m->model_state to get each + // LoraLinearWeight object + int w0_num_elements = rank * in_dim; + int w1_num_elements = rank * out_dim; + + // LoRA_A weight: [in_dim, rank] + float stdv_lora_a = 1.0f / sqrt(in_dim); + std::uniform_real_distribution dis_lora_a(-stdv_lora_a, stdv_lora_a); + std::vector
lora_a_random_init(w0_num_elements);
+  for (auto &num : lora_a_random_init) {
+    float num_float = dis_lora_a(gen);
+    if (std::is_same<DT, half>::value) {
+      num = __float2half(num_float);
+    } else {
+      num = num_float;
+    }
+  }
+  checkCUDA(cudaMemcpyAsync(static_cast<DT *>
(weight.w0_ptr),
+                            lora_a_random_init.data(),
+                            w0_num_elements * sizeof(DT),
+                            cudaMemcpyHostToDevice,
+                            stream));
+
+  // LoRA_B weight: [rank, out_dim]
+  float stdv_lora_b = 1.0f / sqrt(rank);
+  std::uniform_real_distribution<float> dis_lora_b(-stdv_lora_b, stdv_lora_b);
+  std::vector<DT> lora_b_random_init(w1_num_elements);
+  for (auto &num : lora_b_random_init) {
+    float num_float = dis_lora_b(gen);
+    if (std::is_same<DT, half>::value) {
+      num = __float2half(num_float);
+    } else {
+      num = num_float;
+    }
+  }
+  checkCUDA(cudaMemcpyAsync(static_cast<DT *>
(weight.w1_ptr), + lora_b_random_init.data(), + w1_num_elements * sizeof(DT), + cudaMemcpyHostToDevice, + stream)); +} + +void init_peft_weight_wrapper(LoraLinearWeight const &weight, + int in_dim, + int out_dim, + int rank, + DataType dt, + int seed) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + if (dt == DT_FLOAT) { + lora_init_kernel(weight, in_dim, out_dim, rank, seed, stream); + } else if (dt == DT_HALF) { + lora_init_kernel(weight, in_dim, out_dim, rank, seed, stream); + } else { + assert(false && "Unsupported data type"); + } +} + +} // namespace FlexFlow \ No newline at end of file diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 193abbb455..fddaae09ce 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -263,6 +263,73 @@ size_t RequestManager::get_num_ssms() { return ssm_models.size(); } +void RequestManager::set_peft_config(PEFTModelID const &peft_model_id, + LoraLinearConfig const &peft_config) { + // check that peft_model_id is not already in use + assert(peft_configs.find(peft_model_id) == peft_configs.end() && + "PEFT model ID already in use"); + // LoraLinearConfig new_config = + // LoraLinearConfig::deserialize_from_json_string( + // peft_config.serialize_to_json_string()); + peft_configs[peft_model_id] = peft_config; +} + +LoraLinearConfig const & + RequestManager::get_peft_config(PEFTModelID const &peft_model_id) { + assert(peft_configs.find(peft_model_id) != peft_configs.end() && + "PEFT model ID not found"); + return peft_configs[peft_model_id]; +} + +void RequestManager::set_max_lora_rank(int max_lora_rank_) { + max_lora_rank = max_lora_rank_; +} + +void RequestManager::set_max_concurrent_adapters(int max_concurrent_adapters_) { + max_concurrent_adapters = max_concurrent_adapters_; +} + +int RequestManager::get_max_lora_rank() { + return max_lora_rank; +} + +int RequestManager::get_max_concurrent_adapters() { + return max_concurrent_adapters; +} + +PEFTModelID * + FFModel::register_peft_adapter(LoraLinearConfig const &peft_config) { + assert(config.enable_peft && + "Cannot add a LoRA layer if PEFT mode is not enabled"); + if (peft_config.target_modules.size() == 0) { + printf("PEFT config does not contain any target module\n"); + std::cout << peft_config << std::endl; + assert(false); + } + std::cout << "Registering PEFT adapter" + << peft_config.serialize_to_json_string() << std::endl; + // go over base_layer_to_peft_layer and check that you can find at least one + // match + for (int i = 0; i < peft_config.target_modules.size(); i++) { + bool found = false; + for (auto const &pair : base_layer_to_peft_layer) { + Layer *base_layer = pair.first; + if (base_layer->name != nullptr && strlen(base_layer->name) > 0 && + std::string(base_layer->name).find(peft_config.target_modules[0]) != + std::string::npos) { + found = true; + break; + } + } + assert(found && "Attempting to add LoRA to a LLM target module that does " + "not exist or does not support LoRA"); + } + PEFTModelID *peft_model_id = new PEFTModelID(peft_model_global_guid++); + RequestManager *rm = RequestManager::get_request_manager(); + rm->set_peft_config(*peft_model_id, peft_config); + return peft_model_id; +} + RequestManager::RequestGuid RequestManager::register_new_request(Request const &request_) { const std::lock_guard lock(request_queue_mutex); @@ -628,6 +695,18 @@ void RequestManager::check_batch(BatchConfig const &old_bc, } } +void RequestManager::add_peft_config_to_request_info( + BatchConfig &bc, int req_idx, 
LoraLinearConfig const &peft_config) { + std::memset(bc.requestsInfo[req_idx].peft_model_config_str, + 0, + BatchConfig::MAX_PEFT_CONFIG_SIZE); + std::string peft_config_str = peft_config.serialize_to_json_string(); + std::strcpy(bc.requestsInfo[req_idx].peft_model_config_str, + peft_config_str.c_str()); + // std::cout << "Added PEFT config to request info: " + // << bc.requestsInfo[req_idx].peft_model_config_str << std::endl; +} + BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, InferenceResult const &result) { const std::lock_guard lock(request_queue_mutex); @@ -666,6 +745,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, int inference_batch_size = BatchConfig::max_requests_per_batch() - (int)enable_peft_finetuning; + int num_concurrent_adapters = 0; + // Step 2: prepare the next batch for existing inference requests BatchConfig new_bc; for (int i = 0; i < inference_batch_size; i++) { @@ -684,6 +765,10 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, assert(processed_tokens < request.tokens.size()); bool request_completed = check_inf_req_completion(old_bc, i); if (request_completed) { + if (is_eos_token(request.tokens.back())) { + // remove the EOS token + request.tokens.pop_back(); + } std::string output = this->tokenizer_->Decode(request.tokens); // Unlike Huggingface, the sentencepiece C++ library automatically // removes the BOS token @@ -760,6 +845,11 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, old_bc.requestsInfo[i].request_guid; new_bc.requestsInfo[i].peft_model_id = old_bc.requestsInfo[i].peft_model_id; + std::strcpy(new_bc.requestsInfo[i].peft_model_config_str, + old_bc.requestsInfo[i].peft_model_config_str); + if (old_bc.requestsInfo[i].peft_model_id != PEFTModelID::NO_ID) { + num_concurrent_adapters += 1; + } new_bc.requestsInfo[i].peft_bwd = old_bc.requestsInfo[i].peft_bwd; new_bc.requestsInfo[i].max_length = old_bc.requestsInfo[i].max_length; num_active_req++; @@ -811,6 +901,9 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } new_bc.num_generation_tokens = num_generation_tokens; + assert(num_concurrent_adapters <= get_max_concurrent_adapters() && + "Number of concurrent adapters exceeded the limit"); + // Step 3: add new inference requests to the next batch if there is space for (int i = 0; i < inference_batch_size; i++) { if (new_bc.request_completed[i]) { @@ -818,6 +911,14 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.num_tokens < get_max_tokens_per_batch()) { Request new_request = pending_infr_request_queue.front(); assert(new_request.req_type == RequestType::REQ_INFERENCE); + + // if the request has peft adapters and we are at capacity, don't add it + // yet + if (new_request.peft_model_id != PEFTModelID::NO_ID && + num_concurrent_adapters == get_max_concurrent_adapters()) { + break; + } + pending_infr_request_queue.pop(); // all_requests[new_request.guid] = new_request; @@ -829,6 +930,10 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, (int)new_request.tokens.size()); new_bc.requestsInfo[i].max_length = new_request.max_length; new_bc.requestsInfo[i].peft_model_id = new_request.peft_model_id; + if (new_request.peft_model_id != PEFTModelID::NO_ID) { + add_peft_config_to_request_info( + new_bc, i, get_peft_config(new_request.peft_model_id)); + } new_bc.requestsInfo[i].peft_bwd = false; new_bc.request_completed[i] = false; new_bc.requestsInfo[i].prompt_phase = true; 
@@ -983,7 +1088,8 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, int num_peft_label_tokens = request.dataset[dataset_entry].second.size(); assert(num_peft_label_tokens == 0); - if (num_peft_tokens > 0) { + if (num_peft_tokens > 0 && + num_concurrent_adapters < get_max_concurrent_adapters()) { assert(new_bc.request_completed[inference_batch_size]); // request info new_bc.request_completed[inference_batch_size] = false; @@ -995,9 +1101,11 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, num_peft_tokens; new_bc.requestsInfo[inference_batch_size].max_length = request.max_length; new_bc.requestsInfo[inference_batch_size].request_guid = request.guid; + new_bc.requestsInfo[inference_batch_size].peft_bwd = true; new_bc.requestsInfo[inference_batch_size].peft_model_id = request.peft_model_id; - new_bc.requestsInfo[inference_batch_size].peft_bwd = true; + add_peft_config_to_request_info( + new_bc, inference_batch_size, get_peft_config(request.peft_model_id)); set_optimizer_tasks( new_bc.requestsInfo[inference_batch_size].optimizer_tasks, request.max_training_steps, @@ -1015,8 +1123,11 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.num_tokens++; new_bc.num_peft_tokens++; } + num_concurrent_adapters += 1; } } + assert(num_concurrent_adapters <= get_max_concurrent_adapters() && + "Number of concurrent adapters exceeded the limit"); return new_bc; } @@ -2914,7 +3025,7 @@ void RequestManager::serve_incr_decoding(FFModel *llm) { assert(im->model_weights_loaders.find(llm) != im->model_weights_loaders.end()); // Load model weights - im->model_weights_loaders[llm]->load_weights(llm); + im->model_weights_loaders[llm]->load_weights_parallel(llm, ctx, runtime); // init operators im->init_operators_inference(llm); // Legion futures for inc_decoding and spec_infer @@ -2976,7 +3087,7 @@ void RequestManager::serve_spec_infer(FFModel *llm) { assert(im->model_weights_loaders.find(llm) != im->model_weights_loaders.end()); // Load model weights - im->model_weights_loaders[llm]->load_weights(llm); + im->model_weights_loaders[llm]->load_weights_parallel(llm, ctx, runtime); // init operators im->init_operators_inference(llm); } @@ -2987,7 +3098,7 @@ void RequestManager::serve_spec_infer(FFModel *llm) { assert(im->model_weights_loaders.find(llm) != im->model_weights_loaders.end()); // Load model weights - im->model_weights_loaders[ssm]->load_weights(ssm); + im->model_weights_loaders[ssm]->load_weights_parallel(ssm, ctx, runtime); // init operators im->init_operators_inference(ssm); } diff --git a/tests/inference/huggingface_inference_simple.py b/tests/inference/huggingface_inference_simple.py new file mode 100644 index 0000000000..f1cf8450b7 --- /dev/null +++ b/tests/inference/huggingface_inference_simple.py @@ -0,0 +1,51 @@ +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + AutoConfig, + GenerationConfig, +) + +model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct" +do_sample = False +max_length = 128 +model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="auto",) +hf_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) +tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) +generation_config = GenerationConfig.from_pretrained(model_name) +print(generation_config.do_sample) +generation_config.do_sample = do_sample +generation_config.num_beams=1 +generation_config.temperature = None +generation_config.top_p = None + + +def 
run_text_completion(): + prompt = "Help me plan a 1-week trip to Dubai" + batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True) + + generated = model.generate( + batch["input_ids"], + max_new_tokens=max_length, + generation_config=generation_config, + ) + out = tokenizer.decode(generated[0]) + print(out) + +def run_chat_completion(): + messages=[ + {"role": "system", "content": "You are a helpful an honest programming assistant."}, + {"role": "user", "content": "Is Rust better than Python?"}, + ] + tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + batch = tokenizer(tokenized_chat, return_tensors="pt") + + generated = model.generate( + batch["input_ids"], + max_new_tokens=max_length, + generation_config=generation_config, + ) + out = tokenizer.decode(generated[0], skip_special_tokens=True, clean_up_tokenization_spaces=True) + prompt_length = len(tokenizer.decode(batch["input_ids"][0], skip_special_tokens=True, clean_up_tokenization_spaces=True)) + all_text = out[prompt_length:] + print(all_text) +run_chat_completion() \ No newline at end of file diff --git a/tests/inference/huggingface_pipeline.py b/tests/inference/huggingface_pipeline.py new file mode 100644 index 0000000000..95388e0a4b --- /dev/null +++ b/tests/inference/huggingface_pipeline.py @@ -0,0 +1,33 @@ +import transformers +from transformers import GenerationConfig + +model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct" +do_sample = False + +generation_config = GenerationConfig.from_pretrained(model_id) +generation_config.do_sample = do_sample +generation_config.num_beams=1 +# generation_config.max_length = 128 +generation_config.temperature = None +generation_config.top_p = None +print(generation_config) + +pipeline = transformers.pipeline( + "text-generation", + model=model_id, + # model_kwargs={"torch_dtype": torch.bfloat16}, + device_map="auto", +) + +messages=[ + {"role": "system", "content": "You are a helpful an honest programming assistant."}, + {"role": "user", "content": "Is Rust better than Python?"}, + ] + +# messages="Help me plan a 1-week trip to Dubai" +outputs = pipeline( + messages, + max_new_tokens=128, + generation_config=generation_config, +) +print(outputs[0]["generated_text"][-1]['content']) \ No newline at end of file diff --git a/tests/inference/inference_alignment_test.py b/tests/inference/inference_alignment_test.py index 8dab7ff43b..1fe2bfbaae 100644 --- a/tests/inference/inference_alignment_test.py +++ b/tests/inference/inference_alignment_test.py @@ -361,7 +361,7 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE)[:,:,-1].squeeze() hf_tensor = hf_tensor.squeeze() - print(hf_tensor.shape, ff_tensor.shape) + # print(hf_tensor.shape, ff_tensor.shape) compare(hf_tensor, ff_tensor, label="LM head input") output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) diff --git a/tests/inference/python_test_configs/generate_configs.py b/tests/inference/python_test_configs/generate_configs.py index 2720304d4f..afb7ffb9a7 100644 --- a/tests/inference/python_test_configs/generate_configs.py +++ b/tests/inference/python_test_configs/generate_configs.py @@ -8,8 +8,8 @@ "memory_per_gpu": 14000, "zero_copy_memory_per_node": 40000, # 
optional parameters - "num_cpus": 4, - "legion_utility_processors": 4, + "num_cpus": 8, + "legion_utility_processors": 8, "data_parallelism_degree": 1, "tensor_parallelism_degree": 1, "pipeline_parallelism_degree": 4, @@ -19,7 +19,6 @@ "use_8bit_quantization": False, "enable_peft": False, "peft_activation_reserve_space_size": 1024, # 1GB - "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "benchmarking": False, "inference_debugging": False, @@ -63,15 +62,14 @@ # starcoder_models = ["bigcode/starcoderbase-7b",] parallelism_settings = [(1, 4), (2, 2), (4, 1)] -# The paths below should be with respect to the folder from which the tests are launched (FF_HOME/tests/inference) -prompt_file = "../../inference/prompt/test.json" -output_folder = "../../inference/output" - # Change working dir to folder storing this script abspath = os.path.abspath(__file__) dname = os.path.dirname(abspath) os.chdir(dname) +prompt_file = os.path.abspath("../../../inference/prompt/test.json") +output_folder = os.path.abspath("../../../inference/output") + # Generate incremental decoding configs all_models = llama_models + opt_models + falcon_models + mpt_models diff --git a/tests/peft/alignment/align_test_utils.py b/tests/peft/alignment/align_test_utils.py index f5ed8ae65b..a8a9be2f3b 100644 --- a/tests/peft/alignment/align_test_utils.py +++ b/tests/peft/alignment/align_test_utils.py @@ -430,7 +430,7 @@ def compare_loaded_tensors(hf_tensor, ff_tensor, tolerance=1e-2): print(f"HF: {hf_tensor}\nFF:{ff_tensor}") print(np.isclose(hf_tensor, ff_tensor, atol=tolerance)) mismatches = np.where(~np.isclose(hf_tensor, ff_tensor, atol=tolerance))[0] - print(mismatches) + # print(mismatches) len_hf_tensor = hf_tensor.flatten().shape[0] assert len(mismatches) <= 0.05 * len_hf_tensor print("Ok!") diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py index a2fc5548ab..8a53ef8c9c 100644 --- a/tests/peft/hf_finetune.py +++ b/tests/peft/hf_finetune.py @@ -77,7 +77,7 @@ def main(): if args.save_peft_tensors: make_debug_dirs() register_peft_hooks(model) - save_model_weights(model, target_modules=["lora", "lm_head", "down_proj"]) + save_model_weights(model, target_modules=["lora", "lm_head", "down_proj", "up_proj"]) # Load fine-tuning dataset data = load_dataset("Abirate/english_quotes") diff --git a/tests/peft/peft_alignment_test.py b/tests/peft/peft_alignment_test.py index cc677cd51a..c4db87c099 100644 --- a/tests/peft/peft_alignment_test.py +++ b/tests/peft/peft_alignment_test.py @@ -17,7 +17,7 @@ def check_bwd_pass(self): def check_step(self, step_idx, learning_rate=0.001): raise NotImplementedError() -class LllamaAlignmentTest(AlignmentTest): +class LlamaAlignmentTest(AlignmentTest): def __init__(self, model_name, tp_degree=1): self.model_name = model_name self.peft_config = PeftConfig.from_pretrained(model_name) @@ -485,12 +485,16 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) compare(hf_tensor, ff_tensor, label=f"W2 {i} gradient output") + down_proj_grad_output_pre = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE, pre=True) + down_proj_grad_output = ff_tensor.clone() + compare_loaded_tensors(down_proj_grad_output, down_proj_grad_output_pre) # LoRA_B hf_tensor_name = f"layers.{i}.mlp.down_proj.lora_B.default" ff_tensor_name = 
convert_hf_filename_to_ff(hf_tensor_name) output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + lora_grad_output = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) * self.lora_scaling_factor compare(hf_tensor, ff_tensor, label=f"LoRA_B {i} gradient output") @@ -501,6 +505,7 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) compare(hf_tensor, ff_tensor, label=f"LoRA_A {i} gradient input") + lora_a_grad_input = ff_tensor.clone() # W2 (down_proj) input hf_tensor_name = f"layers.{i}.mlp.down_proj" @@ -508,7 +513,15 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + down_proj_grad_input_pre = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.PARTITION, pre=True) compare(hf_tensor, ff_tensor, label=f"W2 {i} gradient input") + + # down proj output (before/after kernel) should match output of lora_b + compare_loaded_tensors(down_proj_grad_output, lora_grad_output) + # down proj input (before kernel) should match input of lora_a + compare_loaded_tensors(down_proj_grad_input_pre, lora_a_grad_input) + # compare_loaded_tensors(down_proj_grad_input_pre.squeeze(), ff_tensor.squeeze()) + # W2 input (HF) and SigmoidSiluMulti output (FF) hf_w2_input = hf_tensor.clone() @@ -538,11 +551,47 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + # print(f"w3 {i} grad output") + # print("flexflow tensor shape:", ff_tensor.squeeze().shape) + # print(ff_tensor.squeeze()) + # print("huggingface tensor shape:", hf_tensor.squeeze().T.shape) + # print(hf_tensor.squeeze().T) compare(hf_tensor, ff_tensor, label=f"W3 {i} gradient output") + # print(f"W3 {i} output matches!") + # print(f"FF shape: {ff_tensor.shape}") + # print(f"HF shape: {hf_tensor.shape}") + + # hf_w3_output = hf_tensor.clone() + + # W3 (up_proj) input input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + + # w3_input_torch = torch.matmul(hf_tensor, torch.transpose(ff_tensor, 0, 1)) + # ff_up_proj_weight_path="/usr/.cache/flexflow/debug/flexflow/weights/step_0/shard_0/layers.11.layers.11.mlp.up_proj.weight_0" + # hf_up_proj_weight_path="/usr/.cache/flexflow/debug/huggingface/weights/step_0/layers.11.mlp.up_proj.weight" + # 
hf_up_proj_weight = torch.load(hf_up_proj_weight_path, map_location='cpu') + # print(hf_up_proj_weight.shape) + # ff_up_proj_weight = load_ff_tensor(ff_up_proj_weight_path, hf_up_proj_weight.shape[::-1]) + # print(ff_up_proj_weight.shape) + # ff_up_proj_weight = torch.from_numpy(ff_up_proj_weight).to(hf_up_proj_weight.dtype) + # assert torch.allclose(hf_up_proj_weight.T, ff_up_proj_weight, atol=1e-5) + + # print("HF W3 output shape:", hf_w3_output.shape) + # print("HF W3 weight shape:", hf_up_proj_weight.shape) + # print("HF W3 input shape:", hf_tensor.shape) + + # simulated_w3_input = torch.matmul(hf_w3_output.squeeze(), hf_up_proj_weight) + # print("simulated W3 input shape:", simulated_w3_input.T.shape) + # print(simulated_w3_input.T) + # print(f"w3 {i} grad input") + # print("flexflow tensor shape:", ff_tensor.squeeze().shape) + # print(ff_tensor.squeeze()) + # print("huggingface tensor shape:", hf_tensor.squeeze().T.shape) + # print(hf_tensor.squeeze().T) + compare(hf_tensor, ff_tensor, label=f"W3 {i} gradient input") # Attn O-proj @@ -606,7 +655,8 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance ff_tensor_name = f"layers.{i}.layers.{i}.input_layernorm" _output_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=1) input_layernorm_out1 = get_ff_tensor(ff_tensor_name, _output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) - torch.testing.assert_close(attn_input, input_layernorm_out1, rtol=1.3e-6, atol=1e-5) + compare_loaded_tensors(attn_input, input_layernorm_out1, tolerance=1e-5) + # torch.testing.assert_close(attn_input, input_layernorm_out1, rtol=1.3e-6, atol=1e-5) # Input layernorm @@ -695,7 +745,24 @@ def compare(hf_tensor, ff_tensor, label="", tolerance=1e-4): torch.testing.assert_close(hf_gradient, (hf_original_weight-hf_finetuned_weight)/learning_rate, rtol=1.3e-6, atol=1e-5) ff_gradient_name = convert_hf_filename_to_ff(hf_gradient_name) ff_gradient = get_ff_tensor(ff_gradient_name, hf_gradient.shape, tp_type=TPType.REPLICATE) + + lora_low_rank_activation_fwd_path = f"/usr/.cache/flexflow/debug/flexflow/fwd/step_{step_idx}/shard_0/layers.{i}.layers.{i}.mlp.down_proj.lora.low_rank_activation" + lora_low_rank_activation_bwd_path = f"/usr/.cache/flexflow/debug/flexflow/bwd/step_{step_idx}/shard_0/layers.{i}.layers.{i}.mlp.down_proj.lora.low_rank_activation" + lora_low_rank_activation_fwd = load_ff_tensor(lora_low_rank_activation_fwd_path, [16, 128])[:,:self.num_tokens] + lora_low_rank_activation_fwd = torch.from_numpy(lora_low_rank_activation_fwd) + lora_low_rank_activation_bwd = load_ff_tensor(lora_low_rank_activation_bwd_path, [16, 24]) + lora_low_rank_activation_bwd = torch.from_numpy(lora_low_rank_activation_bwd) + torch.testing.assert_close(lora_low_rank_activation_fwd, lora_low_rank_activation_bwd, rtol=1.3e-6, atol=1e-5) + + # print(f"LoRA_B {i} gradient") + # print("FlexFlow shape: ", ff_gradient.shape) + # print(ff_gradient) + # print("HuggingFace shape: ", hf_gradient.shape) + # print(hf_gradient.squeeze().T) compare(hf_gradient, ff_gradient, label=f"LoRA_B {i} gradient") + + + # ff_out_gradient_name = f"layers.{i}.layers.{i}.mlp.down_proj.lora.output_gradient_0" # ff_fwd_folder = os.path.join(ff_path, "fwd", f"step_{step_idx}", "shard_0") # ff_bwd_folder = os.path.join(ff_path, "bwd", f"step_{step_idx}", "shard_0") @@ -737,7 +804,7 @@ def compare(hf_tensor, ff_tensor, label="", tolerance=1e-4): args = parser.parse_args() if __name__ == "__main__": - 
llama_alignment = LllamaAlignmentTest(args.model_name, tp_degree=args.tensor_parallelism_degree) + llama_alignment = LlamaAlignmentTest(args.model_name, tp_degree=args.tensor_parallelism_degree) # llama_alignment.check_weights_alignment() for i in range(args.num_steps): llama_alignment.check_fwd_pass(i) diff --git a/tests/peft_test.sh b/tests/peft_test.sh index 5600d57edf..e497d4224e 100755 --- a/tests/peft_test.sh +++ b/tests/peft_test.sh @@ -31,22 +31,22 @@ mkdir -p ./inference/output export LEGION_BACKTRACE=1 # Download test model -python ./inference/utils/download_peft_model.py goliaro/llama-160m-lora --base_model_name JackFram/llama-160m +python ./inference/utils/download_peft_model.py goliaro/llama-160m-lora # Run PEFT in Huggingface to get ground truth tensors -python ./tests/peft/hf_finetune.py --peft-model-id goliaro/llama-160m-lora --save-peft-tensors --use-full-precision +python ./tests/peft/hf_finetune.py --peft-model-id goliaro/llama-160m-lora --save-peft-tensors --use-full-precision -lr 0.001 # Python test echo "Python test" python ./inference/python/ff_peft.py # Check alignment -python ./tests/peft/peft_alignment_test.py -tp 2 +python ./tests/peft/peft_alignment_test.py -tp 4 -lr 0.001 # C++ test echo "C++ test" ./build/inference/peft/peft \ - -ll:gpu 2 -ll:cpu 4 -ll:util 4 \ - -tensor-parallelism-degree 2 \ + -ll:gpu 4 -ll:cpu 4 -ll:util 4 \ + -tensor-parallelism-degree 4 \ -ll:fsize 8192 -ll:zsize 12000 \ -llm-model JackFram/llama-160m \ -finetuning-dataset ./inference/prompt/peft_dataset.json \ @@ -55,7 +55,7 @@ echo "C++ test" --use-full-precision \ --inference-debugging # Check alignment -python ./tests/peft/peft_alignment_test.py -tp 2 +python ./tests/peft/peft_alignment_test.py -tp 4 -lr 0.001 # Print succeess message echo "" From 78307b0e8beb5d41ee003be8b5db168c2b3ef4e2 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 26 Nov 2024 19:13:07 +0000 Subject: [PATCH 43/44] update --- docker/run.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docker/run.sh b/docker/run.sh index 759da521aa..62d7468a00 100755 --- a/docker/run.sh +++ b/docker/run.sh @@ -127,8 +127,7 @@ fi ssh_key_volume="" ssh_key_path="$HOME/.ssh/id_rsa" -if [ -f "$ssh_key_path" ]; then - # If the token exists, add the volume mount to the Docker command - ssh_key_volume+="-v $ssh_key_path:/root/.ssh/id_rsa" +if [ -f "$ssh_key_path" ] && [ -f "$ssh_key_path.pub" ]; then + ssh_key_volume="-v $ssh_key_path:/root/.ssh/id_rsa -v $ssh_key_path.pub:/root/.ssh/id_rsa.pub" fi eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "--cap-add=SYS_PTRACE" "${ssh_key_volume}" "${hf_token_volume}" "${port_forward_arg}" "${image}-${FF_GPU_BACKEND}${gpu_backend_version}:latest" From 518543808b6cd0564e0537601f9d326023d4fe5c Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 28 Nov 2024 21:41:35 +0000 Subject: [PATCH 44/44] fix file loader --- include/flexflow/utils/file_loader.h | 31 +++++-- src/runtime/file_loader.cc | 129 +++++++++++++++++++-------- 2 files changed, 118 insertions(+), 42 deletions(-) diff --git a/include/flexflow/utils/file_loader.h b/include/flexflow/utils/file_loader.h index 8735f23571..8ad0f1d14e 100644 --- a/include/flexflow/utils/file_loader.h +++ b/include/flexflow/utils/file_loader.h @@ -21,6 +21,7 @@ using namespace std; using namespace FlexFlow; +using namespace Legion; class FileDataLoader { public: @@ -36,16 +37,31 @@ class FileDataLoader { BatchConfig::TokenId *generate_requests(int num, int length); template - void load_single_weight_tensor(FFModel 
*ff, Layer *l, int weight_idx); + void load_single_weight_tensor(FFModel *ff, + Layer *l, + int weight_idx, + size_t volume, + size_t num_replicas, + DT *weight, + Domain weight_domain); - void load_quantization_weight(FFModel *ff, Layer *l, int weight_idx); + void load_quantization_weight(FFModel *ff, + Layer *l, + int weight_idx, + size_t volume, + size_t num_replicas, + char *weight, + DataType data_type, + Domain weight_domain); static void load_weight_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); - void load_weights_parallel(FFModel *ff, Context ctx, Runtime *runtime); + void load_weights_parallel(FFModel *ff, + Legion::Context ctx, + Legion::Runtime *runtime); void load_positions(FFModel *ff, Tensor pt, @@ -66,12 +82,15 @@ struct WeightLoadTaskArgs { FileDataLoader *loader; Layer *layer; int weight_idx; + size_t volume, num_replicas; DataType data_type; WeightLoadTaskArgs(FFModel *_ff, FileDataLoader *_loader, Layer *_l, int _idx, + size_t _volume, + size_t _num_replicas, DataType _data_type) - : ff(_ff), loader(_loader), layer(_l), weight_idx(_idx), - data_type(_data_type) {} -}; + : ff(_ff), loader(_loader), layer(_l), weight_idx(_idx), volume(_volume), + num_replicas(_num_replicas), data_type(_data_type) {} +}; \ No newline at end of file diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index 3ebe6cf095..6ffa9370f0 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -670,14 +670,20 @@ void load_from_quantized_file(char *ptr, void FileDataLoader::load_quantization_weight(FFModel *ff, Layer *l, - int weight_idx) { - Tensor weight = l->weights[weight_idx]; - size_t volume = 1; + int weight_idx, + size_t volume, + size_t num_replicas, + char *weight, + DataType data_type, + Domain weight_domain) { + size_t volume_ = 1; std::vector dims_vec; - for (int i = 0; i < weight->num_dims; i++) { - dims_vec.push_back(weight->dims[i]); - volume *= weight->dims[i]; + for (int i = 0; i < weight_domain.get_dim(); i++) { + int dim_i = weight_domain.hi()[i] - weight_domain.lo()[i] + 1; + dims_vec.push_back(dim_i); + volume_ *= dim_i; } + assert(volume_ == volume * num_replicas); char *data = (char *)malloc(sizeof(char) * volume); std::string weight_filename = removeGuidOperatorName(std::string(l->name)); @@ -692,7 +698,7 @@ void FileDataLoader::load_quantization_weight(FFModel *ff, qkv_inner_dim, weight_filename, weights_folder, - weight->data_type, + data_type, use_full_precision); } // else { @@ -714,31 +720,38 @@ void FileDataLoader::load_quantization_weight(FFModel *ff, load_from_quantized_file(data, volume, join_path({weights_folder, weight_filename}), - weight->data_type, + data_type, use_full_precision); } - ParallelTensor weight_pt; - ff->get_parallel_tensor_from_tensor(weight, weight_pt); - weight_pt->set_tensor(ff, dims_vec, data); + char *ptr = weight; + for (size_t i = 0; i < num_replicas; i++) { + memcpy(ptr, data, volume * sizeof(char)); + ptr += volume; + } - delete data; + free(data); } template void FileDataLoader::load_single_weight_tensor(FFModel *ff, Layer *l, - int weight_idx) { - Tensor weight = l->weights[weight_idx]; + int weight_idx, + size_t volume, + size_t num_replicas, + DT *weight, + Domain weight_domain) { // Create a buffer to store weight data from the file - size_t volume = 1; + size_t volume_ = 1; std::vector dims_vec; - for (int i = 0; i < weight->num_dims; i++) { - dims_vec.push_back(weight->dims[i]); - volume *= weight->dims[i]; + for (int i = 0; i < 
weight_domain.get_dim(); i++) { + int dim_i = weight_domain.hi()[i] - weight_domain.lo()[i] + 1; + dims_vec.push_back(dim_i); + volume_ *= dim_i; } - assert(data_type_size(weight->data_type) == sizeof(DT)); + assert(volume_ == volume * num_replicas); + // assert(data_type_size(weight->data_type) == sizeof(DT)); DT *data = (DT *)malloc(sizeof(DT) * volume); std::string weight_filename = removeGuidOperatorName(std::string(l->name)); @@ -843,13 +856,15 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, } } - // Copy the weight data from the buffer to the weight's ParallelTensor - ParallelTensor weight_pt; - ff->get_parallel_tensor_from_tensor(weight, weight_pt); - weight_pt->set_tensor
(ff, dims_vec, data); + // Copy the weight data from the buffer to the weight + DT *ptr = weight; + for (size_t i = 0; i < num_replicas; i++) { + memcpy(ptr, data, volume * sizeof(DT)); + ptr += volume; + } // Free buffer memory - delete data; + free(data); } void FileDataLoader::load_weight_task( @@ -859,21 +874,44 @@ void FileDataLoader::load_weight_task( Legion::Runtime *runtime) { WeightLoadTaskArgs const *args = (WeightLoadTaskArgs const *)task->args; + assert(task->regions.size() == regions.size()); + assert(regions.size() == 1); // one weight only + GenericTensorAccessorW weight = helperGetGenericTensorAccessorWO( + args->data_type, regions[0], task->regions[0], FID_DATA, ctx, runtime); + Domain weight_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + switch (args->data_type) { case DT_HALF: { - args->loader->load_single_weight_tensor( - args->ff, args->layer, args->weight_idx); + args->loader->load_single_weight_tensor(args->ff, + args->layer, + args->weight_idx, + args->volume, + args->num_replicas, + weight.get_half_ptr(), + weight_domain); break; } case DT_FLOAT: { - args->loader->load_single_weight_tensor( - args->ff, args->layer, args->weight_idx); + args->loader->load_single_weight_tensor(args->ff, + args->layer, + args->weight_idx, + args->volume, + args->num_replicas, + weight.get_float_ptr(), + weight_domain); break; } case DT_INT4: case DT_INT8: { - args->loader->load_quantization_weight( - args->ff, args->layer, args->weight_idx); + args->loader->load_quantization_weight(args->ff, + args->layer, + args->weight_idx, + args->volume, + args->num_replicas, + weight.get_byte_ptr(), + args->data_type, + weight_domain); break; } default: @@ -897,19 +935,38 @@ void FileDataLoader::load_weights_parallel(FFModel *ff, continue; } - if (l->op_type == OP_LORA) { - continue; - } - if (weight->data_type != DT_FLOAT && weight->data_type != DT_HALF && weight->data_type != DT_INT4 && weight->data_type != DT_INT8) { assert(false && "Unsupported data type"); } + ParallelTensor weight_pt; + ff->get_parallel_tensor_from_tensor(weight, weight_pt); + // Create task arguments - WeightLoadTaskArgs args(ff, this, l, i, weight->data_type); + size_t volume = 1, num_replicas = 1; + if (weight_pt->sync_type == ParameterSyncType::NCCL) { + for (int i = 0; i < weight_pt->num_dims; i++) { + if (weight_pt->dims[i].is_replica_dim) { + num_replicas *= weight_pt->dims[i].size; + } + } + } else if (weight_pt->sync_type == ParameterSyncType::PS) { + num_replicas = 1; + } else { + num_replicas = 1; + } + for (int i = 0; i < weight->num_dims; i++) { + volume *= weight->dims[i]; + } + WeightLoadTaskArgs args( + ff, this, l, i, volume, num_replicas, weight->data_type); + // launch task asynchronously TaskLauncher launcher(LOAD_WEIGHT_TASK_ID, TaskArgument(&args, sizeof(WeightLoadTaskArgs))); + launcher.add_region_requirement(RegionRequirement( + weight_pt->region, WRITE_ONLY, EXCLUSIVE, weight_pt->region)); + launcher.add_field(0, FID_DATA); futures.push_back(runtime->execute_task(ctx, launcher)); } } @@ -918,4 +975,4 @@ void FileDataLoader::load_weights_parallel(FFModel *ff, for (Future &f : futures) { f.get_void_result(); } -} +} \ No newline at end of file
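The file-loader patch above has each load_weight_task read one host-side copy of a tensor from disk and then write it once per replica into the Legion region mapped for that weight (volume elements per replica, replicas laid out back to back, so the single region requirement added by load_weights_parallel covers all of them). A minimal standalone sketch of that replication step follows; the helper name and raw-pointer interface are illustrative, not the actual FileDataLoader members.

// Sketch only: the replica copy pattern used by load_single_weight_tensor and
// load_quantization_weight in src/runtime/file_loader.cc.
#include <cstring>

template <typename DT>
void replicate_weight(DT *region_ptr,       // start of the mapped weight region
                      DT const *host_copy,  // one copy of the weight read from disk
                      size_t volume,        // number of elements in one replica
                      size_t num_replicas) {
  DT *ptr = region_ptr;
  for (size_t r = 0; r < num_replicas; r++) {
    std::memcpy(ptr, host_copy, volume * sizeof(DT));
    ptr += volume; // the next replica starts immediately after the previous one
  }
}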