From 093b29d3ed0e159bf1702de024370ee24b92a755 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Tue, 2 Jan 2024 18:50:49 -0500 Subject: [PATCH 01/12] bug fixes and update Legion version --- deps/legion | 2 +- src/ops/linear.cc | 8 ++------ src/runtime/model.cu | 1 - src/runtime/request_manager.cc | 7 ++----- 4 files changed, 5 insertions(+), 13 deletions(-) diff --git a/deps/legion b/deps/legion index 626b55689c..d065278678 160000 --- a/deps/legion +++ b/deps/legion @@ -1 +1 @@ -Subproject commit 626b55689c77848b246e1da19678c7ad58899f0c +Subproject commit d0652786784249e933dd62f675591da99a5e960d diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 63b26bfe7d..93e93a5953 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -467,12 +467,8 @@ OpMeta *Linear::init_task_with_dim(Task const *task, ctx, runtime, false /*readOutput*/); - TensorAccessorW acc_kernel(regions[2], - task->regions[2], - FID_DATA, - ctx, - runtime, - false /*readOutput*/); + TensorAccessorR acc_kernel( + regions[2], task->regions[2], FID_DATA, ctx, runtime); // TensorAccessorR acc_bias( // regions[3], task->regions[3], FID_DATA, ctx, runtime); diff --git a/src/runtime/model.cu b/src/runtime/model.cu index c885b29db2..23b7f0efbe 100644 --- a/src/runtime/model.cu +++ b/src/runtime/model.cu @@ -175,7 +175,6 @@ FFHandler } else { handle.batch_config_metadata = nullptr; } - // checkCUDA(cudaMalloc(&handle.workSpace, handle.workSpaceSize)); #ifdef FF_USE_NCCL diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 88754f5a82..a285932b7f 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -1188,10 +1188,7 @@ BeamSearchBatchConfig int ssm_decoding_steps = profiling_requests[request.guid].ssm_decoding_steps; - new_bc.beamRequestsInfo[i].beam_size = - spec_infer_tree_width.size() > ssm_decoding_steps - ? spec_infer_tree_width[ssm_decoding_steps] - : 1; + new_bc.beamRequestsInfo[i].beam_size = 1; // printf("beam size: %d, %d\n", // new_bc.beamRequestsInfo[i].beam_size, // ssm_decoding_steps); @@ -1820,7 +1817,7 @@ void RequestManager::updateBitMask(BatchConfig::BitMask &bitmask, void RequestManager::appendPendingRequest(BatchConfig::BitMask &bitmask, int initLength) { assert(initLength > 0); - std::cout << "append pending bit mask: " << initLength << "\n"; + // std::cout << "append pending bit mask: " << initLength << "\n"; // eg. 4 tokens: t1: 0000000..1111, t2: 0000000..1110, t3: 0000000..1100, t4: // 0000000..1000 bitmask.non_tree_cache_size = 0; From c8d2cd19110cb1e6eba8d5554918e7f9762aba50 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 9 Jan 2024 21:35:00 +0000 Subject: [PATCH 02/12] fix --- CMakeLists.txt | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 90cab126e6..3fbd06c74e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -413,6 +413,7 @@ if(NOT BUILD_LEGION_ONLY) # python related if (FF_USE_PYTHON) + find_package(Python COMPONENTS Interpreter Development) # create flexflow_cffi_header.py add_custom_command(TARGET flexflow PRE_BUILD @@ -424,13 +425,13 @@ if(NOT BUILD_LEGION_ONLY) # generate the Legion Python bindings library. When building from pip, we need to do this post-install to prevent Legion from overwriting the path to the Legion shared library add_custom_command(TARGET flexflow POST_BUILD - COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${Legion_BINARY_DIR} --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS} + COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${Legion_BINARY_DIR} --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python ) # create flexflow_python interpreter. When building from pip, we install the FF_HOME/python/flexflow_python script instead. add_custom_command(TARGET flexflow PRE_BUILD - COMMAND ${PYTHON_EXECUTABLE} ${FLEXFLOW_ROOT}/python/flexflow_python_build.py --build-dir ${CMAKE_BINARY_DIR} + COMMAND ${Python_EXECUTABLE} ${FLEXFLOW_ROOT}/python/flexflow_python_build.py --build-dir ${CMAKE_BINARY_DIR} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMENT "Creating flexflow_python interpreter..." ) @@ -567,7 +568,8 @@ if(NOT BUILD_LEGION_ONLY) install(TARGETS flexflow DESTINATION ${LIB_DEST}) # install python if (FF_USE_PYTHON) - execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) + find_package(Python COMPONENTS Interpreter Development) + execute_process(COMMAND ${Python_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) if (NOT FF_BUILD_FROM_PYPI) install( DIRECTORY ${FLEXFLOW_ROOT}/python/flexflow/ From 522f1c1ec422d6f433bfbafcdd14b3b39625ac02 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Tue, 9 Jan 2024 22:19:18 +0000 Subject: [PATCH 03/12] bug fix --- cmake/pip_install/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/pip_install/CMakeLists.txt b/cmake/pip_install/CMakeLists.txt index 7ce38c4abc..105133a310 100644 --- a/cmake/pip_install/CMakeLists.txt +++ b/cmake/pip_install/CMakeLists.txt @@ -1,10 +1,10 @@ # Use setup.py script to re-install the Python bindings library with the right library paths if (FF_USE_PYTHON) - execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process(COMMAND ${Python_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE) if(FF_BUILD_FROM_PYPI) install(CODE "execute_process(COMMAND ${CMAKE_COMMAND} -E echo \"Editing path to Legion library using path: ${PY_DEST}/flexflow/lib \")") # CMAKE_CURRENT_SOURCE_DIR=/usr/FlexFlow/cmake/pip_install # Legion_BINARY_DIR=/usr/FlexFlow/build//deps/legion - install(CODE "execute_process(COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python/setup.py install --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${PY_DEST}/flexflow ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python)") + install(CODE "execute_process(COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python/setup.py install --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${PY_DEST}/flexflow ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python)") endif() endif() From 0630f6df502e87151340acc7f6a25a1d3a5c1ad0 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Fri, 19 Jan 2024 23:47:46 +0000 Subject: [PATCH 04/12] update legion --- deps/legion | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/legion b/deps/legion index 626b55689c..d065278678 160000 --- a/deps/legion +++ b/deps/legion @@ -1 +1 @@ -Subproject commit 626b55689c77848b246e1da19678c7ad58899f0c +Subproject commit d0652786784249e933dd62f675591da99a5e960d From 45a1b783b360bc7cc35cfdf2868e33fe5a9ad031 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 20 Jan 2024 01:04:44 +0000 Subject: [PATCH 05/12] fix arithmetic error due to num_devices uninitialized --- include/flexflow/request_manager.h | 1 - src/runtime/inference_manager.cc | 30 +----------------------------- 2 files changed, 1 insertion(+), 30 deletions(-) diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 50a51705cd..4763eb1ef3 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -55,7 +55,6 @@ class InferenceManager { public: std::unordered_map> tensor_buffer; std::unordered_map model_weights_loaders; - int num_devices; }; struct Request { diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 6588cbceeb..2a94df8b4d 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -28,33 +28,7 @@ using namespace Legion; LegionRuntime::Logger::Category log_inf_mgr("InferenceManager"); LegionRuntime::Logger::Category log_offload("Offloading"); -InferenceManager::InferenceManager() { -#ifdef DEADCODE - num_devices = ff_config.workersPerNode * ff_config.numNodes; - // Check parallelization degrees - assert(ff_config.data_parallelism_degree <= num_devices && - "Data parallelism degree exceeds number of available devices"); - assert(num_devices % ff_config.data_parallelism_degree == 0 && - "Number of available devices is not divisible by data parallelism " - "degree"); - assert(ff_config.tensor_parallelism_degree <= num_devices && - "Tensor parallelism degree exceeds number of available devices"); - assert(num_devices % ff_config.tensor_parallelism_degree == 0 && - "Number of available devices is not divisible by tensor parallelism " - "degree"); - assert(ff_config.pipeline_parallelism_degree <= num_devices && - "Pipeline parallelism degree exceeds number of available devices"); - assert(num_devices % ff_config.pipeline_parallelism_degree == 0 && - "Number of available devices is not divisible by pipeline parallelism " - "degree"); - assert(ff_config.data_parallelism_degree * - ff_config.tensor_parallelism_degree * - ff_config.pipeline_parallelism_degree == - num_devices && - "Product of data, tensor, and pipeline parallelism degrees does not " - "match the number of available devices"); -#endif -} +InferenceManager::InferenceManager() {} InferenceManager *inference_manager_singleton = nullptr; @@ -296,8 +270,6 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { void InferenceManager::init_operators_inference(FFModel *model) { for (int batch_index = 0; batch_index < model->config.data_parallelism_degree; batch_index++) { - int expert_device_index = 0; - int device_index = batch_index % num_devices; for (size_t o = 0; o < model->operators.size(); o++) { Op *op = model->operators[o]; if (op->op_type == OP_WEIGHT) { From 1646b4388efc264ba7843d8f9b2e9f354f0ba462 Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Sat, 20 Jan 2024 11:08:30 -0500 Subject: [PATCH 06/12] update legion version --- deps/legion | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/legion b/deps/legion index d065278678..ef6c499753 160000 --- a/deps/legion +++ b/deps/legion @@ -1 +1 @@ -Subproject commit d0652786784249e933dd62f675591da99a5e960d +Subproject commit ef6c499753cebb7bb44ac52c109144a9e6a1c577 From 8e7a7d01f4db35ef6e73b3cf76417fedce6f05ef Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 20 Jan 2024 18:23:47 -0500 Subject: [PATCH 07/12] update ci --- .github/workflows/gpu-ci.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 3901d6b5f7..9cd6b82087 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -243,7 +243,7 @@ jobs: - name: Build and Install FlexFlow run: | - export PATH=/opt/conda/bin:$PATH + export PATH=$CONDA_PREFIX/bin:$PATH export FF_HOME=$(pwd) export FF_BUILD_ALL_EXAMPLES=ON export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON @@ -252,18 +252,18 @@ jobs: - name: Check FlexFlow Python interface (pip) run: | - export PATH=/opt/conda/bin:$PATH + export PATH=$CONDA_PREFIX/bin:$PATH export FF_HOME=$(pwd) - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib ./tests/python_interface_test.sh after-installation - name: Run multi-gpu tests run: | - export PATH=/opt/conda/bin:$PATH + export PATH=$CONDA_PREFIX/bin:$PATH export CUDNN_DIR=/usr/local/cuda export CUDA_DIR=/usr/local/cuda export FF_HOME=$(pwd) - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib # C++ tests ./tests/cpp_gpu_tests.sh 4 # Python tests From f9932aeb6f191c29a2d4ba7dbb056eb059d43adc Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 20 Jan 2024 18:41:35 -0500 Subject: [PATCH 08/12] fix --- .github/workflows/gpu-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 9cd6b82087..48dcda157e 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -222,7 +222,7 @@ jobs: CONDA: "3" needs: inference-tests container: - image: ghcr.io/flexflow/flexflow-environment-cuda:latest + image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest options: --gpus all --shm-size=8192m steps: - name: Install updated git version From 0b3148ef6adfcb64935e6b1e83a88494910a7b22 Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 20 Jan 2024 18:42:08 -0500 Subject: [PATCH 09/12] debugging ci --- .github/workflows/gpu-ci.yml | 264 +++++++++++++++++------------------ 1 file changed, 132 insertions(+), 132 deletions(-) diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 48dcda157e..eb0046617b 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -60,155 +60,155 @@ jobs: pip3 install pygithub python3 .github/workflows/helpers/gpu_ci_helper.py - python-interface-check: - name: Check Python Interface - runs-on: [self-hosted, gpu] - defaults: - run: - shell: bash -l {0} # required to use an activated conda environment - env: - CONDA: "3" - needs: gpu-ci-concierge - container: - image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest - options: --gpus all --shm-size=8192m - steps: - - name: Install updated git version - run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git + # python-interface-check: + # name: Check Python Interface + # runs-on: [self-hosted, gpu] + # defaults: + # run: + # shell: bash -l {0} # required to use an activated conda environment + # env: + # CONDA: "3" + # needs: gpu-ci-concierge + # container: + # image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest + # options: --gpus all --shm-size=8192m + # steps: + # - name: Install updated git version + # run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git - - name: Checkout Git Repository - uses: actions/checkout@v3 - with: - submodules: recursive + # - name: Checkout Git Repository + # uses: actions/checkout@v3 + # with: + # submodules: recursive - - name: Install conda and FlexFlow dependencies - uses: conda-incubator/setup-miniconda@v2 - with: - miniconda-version: "latest" - activate-environment: flexflow - environment-file: conda/flexflow.yml - auto-activate-base: false - auto-update-conda: false + # - name: Install conda and FlexFlow dependencies + # uses: conda-incubator/setup-miniconda@v2 + # with: + # miniconda-version: "latest" + # activate-environment: flexflow + # environment-file: conda/flexflow.yml + # auto-activate-base: false + # auto-update-conda: false - - name: Install conda and Pytorch dependencies for pytorch alignment test - run: | - conda env create -f conda/pytorch-gpu.yml + # - name: Install conda and Pytorch dependencies for pytorch alignment test + # run: | + # conda env create -f conda/pytorch-gpu.yml - - name: Build FlexFlow - run: | - export PATH=$CONDA_PREFIX/bin:$PATH - export FF_HOME=$(pwd) - export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion - mkdir build - cd build - ../config/config.linux - make -j + # - name: Build FlexFlow + # run: | + # export PATH=$CONDA_PREFIX/bin:$PATH + # export FF_HOME=$(pwd) + # export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion + # mkdir build + # cd build + # ../config/config.linux + # make -j - - name: Check FlexFlow Python interface (before installation) - run: | - export PATH=$CONDA_PREFIX/bin:$PATH - export FF_HOME=$(pwd) - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib - ./tests/python_interface_test.sh before-installation + # - name: Check FlexFlow Python interface (before installation) + # run: | + # export PATH=$CONDA_PREFIX/bin:$PATH + # export FF_HOME=$(pwd) + # export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib + # ./tests/python_interface_test.sh before-installation - - name: Install FlexFlow - run: | - export PATH=$CONDA_PREFIX/bin:$PATH - export FF_HOME=$(pwd) - export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion - cd build - ../config/config.linux - make install - ldconfig + # - name: Install FlexFlow + # run: | + # export PATH=$CONDA_PREFIX/bin:$PATH + # export FF_HOME=$(pwd) + # export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion + # cd build + # ../config/config.linux + # make install + # ldconfig - - name: Check FlexFlow Python interface (after installation) - run: | - export PATH=$CONDA_PREFIX/bin:$PATH - export FF_HOME=$(pwd) - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib - ./tests/python_interface_test.sh after-installation + # - name: Check FlexFlow Python interface (after installation) + # run: | + # export PATH=$CONDA_PREFIX/bin:$PATH + # export FF_HOME=$(pwd) + # export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib + # ./tests/python_interface_test.sh after-installation - - name: Run flexflow alignment with pytorch - run: | - # run alingment tests - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib - ./tests/align/test_all_operators.sh + # - name: Run flexflow alignment with pytorch + # run: | + # # run alingment tests + # export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib + # ./tests/align/test_all_operators.sh - inference-tests: - name: Inference Tests - runs-on: [self-hosted, gpu] - defaults: - run: - shell: bash -l {0} # required to use an activated conda environment - env: - CONDA: "3" - HUGGINGFACE_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN }} - needs: gpu-ci-concierge - container: - image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest - options: --gpus all --shm-size=8192m - steps: - - name: Install updated git version - run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git + # inference-tests: + # name: Inference Tests + # runs-on: [self-hosted, gpu] + # defaults: + # run: + # shell: bash -l {0} # required to use an activated conda environment + # env: + # CONDA: "3" + # HUGGINGFACE_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN }} + # needs: gpu-ci-concierge + # container: + # image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest + # options: --gpus all --shm-size=8192m + # steps: + # - name: Install updated git version + # run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git - - name: Checkout Git Repository - uses: actions/checkout@v3 - with: - submodules: recursive + # - name: Checkout Git Repository + # uses: actions/checkout@v3 + # with: + # submodules: recursive - - name: Install conda and FlexFlow dependencies - uses: conda-incubator/setup-miniconda@v2 - with: - miniconda-version: "latest" - activate-environment: flexflow - environment-file: conda/flexflow.yml - auto-activate-base: false + # - name: Install conda and FlexFlow dependencies + # uses: conda-incubator/setup-miniconda@v2 + # with: + # miniconda-version: "latest" + # activate-environment: flexflow + # environment-file: conda/flexflow.yml + # auto-activate-base: false - - name: Build FlexFlow - run: | - export PATH=$CONDA_PREFIX/bin:$PATH - export FF_HOME=$(pwd) - export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion - export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON - mkdir build - cd build - ../config/config.linux - make -j + # - name: Build FlexFlow + # run: | + # export PATH=$CONDA_PREFIX/bin:$PATH + # export FF_HOME=$(pwd) + # export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion + # export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON + # mkdir build + # cd build + # ../config/config.linux + # make -j - - name: Run inference tests - env: - CPP_INFERENCE_TESTS: ${{ vars.CPP_INFERENCE_TESTS }} - run: | - export PATH=$CONDA_PREFIX/bin:$PATH - export FF_HOME=$(pwd) - export CUDNN_DIR=/usr/local/cuda - export CUDA_DIR=/usr/local/cuda - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib + # - name: Run inference tests + # env: + # CPP_INFERENCE_TESTS: ${{ vars.CPP_INFERENCE_TESTS }} + # run: | + # export PATH=$CONDA_PREFIX/bin:$PATH + # export FF_HOME=$(pwd) + # export CUDNN_DIR=/usr/local/cuda + # export CUDA_DIR=/usr/local/cuda + # export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib - # GPT tokenizer test - # ./tests/gpt_tokenizer_test.sh + # # GPT tokenizer test + # # ./tests/gpt_tokenizer_test.sh - # Inference tests - source ./build/set_python_envs.sh - ./tests/inference_tests.sh + # # Inference tests + # source ./build/set_python_envs.sh + # ./tests/inference_tests.sh - - name: Save inference output as an artifact - if: always() - run: | - cd inference - tar -zcvf output.tar.gz ./output + # - name: Save inference output as an artifact + # if: always() + # run: | + # cd inference + # tar -zcvf output.tar.gz ./output - - name: Upload artifact - uses: actions/upload-artifact@v3 - if: always() - with: - name: output - path: inference/output.tar.gz + # - name: Upload artifact + # uses: actions/upload-artifact@v3 + # if: always() + # with: + # name: output + # path: inference/output.tar.gz - # Github persists the .cache folder across different runs/containers - - name: Clear cache - if: always() - run: sudo rm -rf ~/.cache + # # Github persists the .cache folder across different runs/containers + # - name: Clear cache + # if: always() + # run: sudo rm -rf ~/.cache training-tests: name: Training Tests @@ -220,7 +220,7 @@ jobs: shell: bash -l {0} # required to use an activated conda environment env: CONDA: "3" - needs: inference-tests + # needs: inference-tests container: image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest options: --gpus all --shm-size=8192m From d1c541f24a3466afdf6510e12e8399343cf47baa Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Sat, 20 Jan 2024 18:53:35 -0500 Subject: [PATCH 10/12] Revert "debugging ci" This reverts commit 0b3148ef6adfcb64935e6b1e83a88494910a7b22. --- .github/workflows/gpu-ci.yml | 264 +++++++++++++++++------------------ 1 file changed, 132 insertions(+), 132 deletions(-) diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index eb0046617b..48dcda157e 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -60,155 +60,155 @@ jobs: pip3 install pygithub python3 .github/workflows/helpers/gpu_ci_helper.py - # python-interface-check: - # name: Check Python Interface - # runs-on: [self-hosted, gpu] - # defaults: - # run: - # shell: bash -l {0} # required to use an activated conda environment - # env: - # CONDA: "3" - # needs: gpu-ci-concierge - # container: - # image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest - # options: --gpus all --shm-size=8192m - # steps: - # - name: Install updated git version - # run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git + python-interface-check: + name: Check Python Interface + runs-on: [self-hosted, gpu] + defaults: + run: + shell: bash -l {0} # required to use an activated conda environment + env: + CONDA: "3" + needs: gpu-ci-concierge + container: + image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest + options: --gpus all --shm-size=8192m + steps: + - name: Install updated git version + run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git - # - name: Checkout Git Repository - # uses: actions/checkout@v3 - # with: - # submodules: recursive + - name: Checkout Git Repository + uses: actions/checkout@v3 + with: + submodules: recursive - # - name: Install conda and FlexFlow dependencies - # uses: conda-incubator/setup-miniconda@v2 - # with: - # miniconda-version: "latest" - # activate-environment: flexflow - # environment-file: conda/flexflow.yml - # auto-activate-base: false - # auto-update-conda: false + - name: Install conda and FlexFlow dependencies + uses: conda-incubator/setup-miniconda@v2 + with: + miniconda-version: "latest" + activate-environment: flexflow + environment-file: conda/flexflow.yml + auto-activate-base: false + auto-update-conda: false - # - name: Install conda and Pytorch dependencies for pytorch alignment test - # run: | - # conda env create -f conda/pytorch-gpu.yml + - name: Install conda and Pytorch dependencies for pytorch alignment test + run: | + conda env create -f conda/pytorch-gpu.yml - # - name: Build FlexFlow - # run: | - # export PATH=$CONDA_PREFIX/bin:$PATH - # export FF_HOME=$(pwd) - # export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion - # mkdir build - # cd build - # ../config/config.linux - # make -j + - name: Build FlexFlow + run: | + export PATH=$CONDA_PREFIX/bin:$PATH + export FF_HOME=$(pwd) + export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion + mkdir build + cd build + ../config/config.linux + make -j - # - name: Check FlexFlow Python interface (before installation) - # run: | - # export PATH=$CONDA_PREFIX/bin:$PATH - # export FF_HOME=$(pwd) - # export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib - # ./tests/python_interface_test.sh before-installation + - name: Check FlexFlow Python interface (before installation) + run: | + export PATH=$CONDA_PREFIX/bin:$PATH + export FF_HOME=$(pwd) + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib + ./tests/python_interface_test.sh before-installation - # - name: Install FlexFlow - # run: | - # export PATH=$CONDA_PREFIX/bin:$PATH - # export FF_HOME=$(pwd) - # export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion - # cd build - # ../config/config.linux - # make install - # ldconfig + - name: Install FlexFlow + run: | + export PATH=$CONDA_PREFIX/bin:$PATH + export FF_HOME=$(pwd) + export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion + cd build + ../config/config.linux + make install + ldconfig - # - name: Check FlexFlow Python interface (after installation) - # run: | - # export PATH=$CONDA_PREFIX/bin:$PATH - # export FF_HOME=$(pwd) - # export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib - # ./tests/python_interface_test.sh after-installation + - name: Check FlexFlow Python interface (after installation) + run: | + export PATH=$CONDA_PREFIX/bin:$PATH + export FF_HOME=$(pwd) + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib + ./tests/python_interface_test.sh after-installation - # - name: Run flexflow alignment with pytorch - # run: | - # # run alingment tests - # export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib - # ./tests/align/test_all_operators.sh + - name: Run flexflow alignment with pytorch + run: | + # run alingment tests + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib + ./tests/align/test_all_operators.sh - # inference-tests: - # name: Inference Tests - # runs-on: [self-hosted, gpu] - # defaults: - # run: - # shell: bash -l {0} # required to use an activated conda environment - # env: - # CONDA: "3" - # HUGGINGFACE_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN }} - # needs: gpu-ci-concierge - # container: - # image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest - # options: --gpus all --shm-size=8192m - # steps: - # - name: Install updated git version - # run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git + inference-tests: + name: Inference Tests + runs-on: [self-hosted, gpu] + defaults: + run: + shell: bash -l {0} # required to use an activated conda environment + env: + CONDA: "3" + HUGGINGFACE_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN }} + needs: gpu-ci-concierge + container: + image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest + options: --gpus all --shm-size=8192m + steps: + - name: Install updated git version + run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git - # - name: Checkout Git Repository - # uses: actions/checkout@v3 - # with: - # submodules: recursive + - name: Checkout Git Repository + uses: actions/checkout@v3 + with: + submodules: recursive - # - name: Install conda and FlexFlow dependencies - # uses: conda-incubator/setup-miniconda@v2 - # with: - # miniconda-version: "latest" - # activate-environment: flexflow - # environment-file: conda/flexflow.yml - # auto-activate-base: false + - name: Install conda and FlexFlow dependencies + uses: conda-incubator/setup-miniconda@v2 + with: + miniconda-version: "latest" + activate-environment: flexflow + environment-file: conda/flexflow.yml + auto-activate-base: false - # - name: Build FlexFlow - # run: | - # export PATH=$CONDA_PREFIX/bin:$PATH - # export FF_HOME=$(pwd) - # export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion - # export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON - # mkdir build - # cd build - # ../config/config.linux - # make -j + - name: Build FlexFlow + run: | + export PATH=$CONDA_PREFIX/bin:$PATH + export FF_HOME=$(pwd) + export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion + export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON + mkdir build + cd build + ../config/config.linux + make -j - # - name: Run inference tests - # env: - # CPP_INFERENCE_TESTS: ${{ vars.CPP_INFERENCE_TESTS }} - # run: | - # export PATH=$CONDA_PREFIX/bin:$PATH - # export FF_HOME=$(pwd) - # export CUDNN_DIR=/usr/local/cuda - # export CUDA_DIR=/usr/local/cuda - # export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib + - name: Run inference tests + env: + CPP_INFERENCE_TESTS: ${{ vars.CPP_INFERENCE_TESTS }} + run: | + export PATH=$CONDA_PREFIX/bin:$PATH + export FF_HOME=$(pwd) + export CUDNN_DIR=/usr/local/cuda + export CUDA_DIR=/usr/local/cuda + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib - # # GPT tokenizer test - # # ./tests/gpt_tokenizer_test.sh + # GPT tokenizer test + # ./tests/gpt_tokenizer_test.sh - # # Inference tests - # source ./build/set_python_envs.sh - # ./tests/inference_tests.sh + # Inference tests + source ./build/set_python_envs.sh + ./tests/inference_tests.sh - # - name: Save inference output as an artifact - # if: always() - # run: | - # cd inference - # tar -zcvf output.tar.gz ./output + - name: Save inference output as an artifact + if: always() + run: | + cd inference + tar -zcvf output.tar.gz ./output - # - name: Upload artifact - # uses: actions/upload-artifact@v3 - # if: always() - # with: - # name: output - # path: inference/output.tar.gz + - name: Upload artifact + uses: actions/upload-artifact@v3 + if: always() + with: + name: output + path: inference/output.tar.gz - # # Github persists the .cache folder across different runs/containers - # - name: Clear cache - # if: always() - # run: sudo rm -rf ~/.cache + # Github persists the .cache folder across different runs/containers + - name: Clear cache + if: always() + run: sudo rm -rf ~/.cache training-tests: name: Training Tests @@ -220,7 +220,7 @@ jobs: shell: bash -l {0} # required to use an activated conda environment env: CONDA: "3" - # needs: inference-tests + needs: inference-tests container: image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest options: --gpus all --shm-size=8192m From 96bc34bc1f4e42839e0ba7afd6eebc31ffbc169a Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Thu, 25 Jan 2024 18:21:42 -0500 Subject: [PATCH 11/12] update mapper interface --- deps/legion | 2 +- include/flexflow/mapper.h | 9 ++++---- src/mapper/mapper.cc | 47 +++++++++++++++++---------------------- 3 files changed, 25 insertions(+), 33 deletions(-) diff --git a/deps/legion b/deps/legion index ef6c499753..24e8c45234 160000 --- a/deps/legion +++ b/deps/legion @@ -1 +1 @@ -Subproject commit ef6c499753cebb7bb44ac52c109144a9e6a1c577 +Subproject commit 24e8c452341dea41427e0ce61e154d61715e6835 diff --git a/include/flexflow/mapper.h b/include/flexflow/mapper.h index 71be1892aa..e8337818ec 100644 --- a/include/flexflow/mapper.h +++ b/include/flexflow/mapper.h @@ -83,11 +83,10 @@ class FFMapper : public NullMapper { Task const &task, MapTaskInput const &input, MapTaskOutput &output); - virtual void map_replicate_task(const MapperContext ctx, - Task const &task, - MapTaskInput const &input, - MapTaskOutput const &default_output, - MapReplicateTaskOutput &output); + virtual void replicate_task(const MapperContext ctx, + Task const &task, + ReplicateTaskInput const &input, + ReplicateTaskOutput &output); virtual void select_task_variant(const MapperContext ctx, Task const &task, SelectVariantInput const &input, diff --git a/src/mapper/mapper.cc b/src/mapper/mapper.cc index bc26a79d3e..d46bfc2877 100644 --- a/src/mapper/mapper.cc +++ b/src/mapper/mapper.cc @@ -661,44 +661,37 @@ void FFMapper::map_task(const MapperContext ctx, } // for idx } -void FFMapper::map_replicate_task(const MapperContext ctx, - Task const &task, - MapTaskInput const &input, - MapTaskOutput const &default_output, - MapReplicateTaskOutput &output) { +void FFMapper::replicate_task(const MapperContext ctx, + Task const &task, + ReplicateTaskInput const &input, + ReplicateTaskOutput &output) { // Should only be replicated for the top-level task assert((task.get_depth() == 0) && (task.regions.size() == 0)); const Processor::Kind target_kind = task.target_proc.kind(); - VariantID chosen_variant; + VariantID vid; { std::vector variant_ids; - runtime->find_valid_variants( - ctx, task.task_id, variant_ids, task.target_proc.kind()); + runtime->find_valid_variants(ctx, task.task_id, variant_ids, target_kind); // Currently assume there is exactly one variant assert(variant_ids.size() == 1); - chosen_variant = variant_ids[0]; + output.chosen_variant = variant_ids[0]; } - std::vector const &all_procs = all_procs_by_kind(target_kind); - // Place on replicate on each node by default - output.task_mappings.resize(total_nodes, default_output); - // Assume default_output does not include any target_procs - assert(default_output.target_procs.size() == 0); - for (std::vector::const_iterator it = all_procs.begin(); - it != all_procs.end(); + output.target_processors.resize(total_nodes); + std::vector handled(total_nodes, false); + size_t count = 0; + Machine::ProcessorQuery procs(machine); + procs.only_kind(target_kind); + for (Machine::ProcessorQuery::iterator it = procs.begin(); it != procs.end(); it++) { - AddressSpace space = it->address_space(); - assert(space < output.task_mappings.size()); - // Add *it as a target_proc if we haven't found one - if (output.task_mappings[space].target_procs.size() == 0) { - output.task_mappings[space].target_procs.push_back(*it); + const AddressSpace space = it->address_space(); + if (handled[space]) { + continue; } + output.target_processors[space] = *it; + handled[space] = true; + count++; } - output.control_replication_map.resize(total_nodes); - for (int idx = 0; idx < total_nodes; idx++) { - output.task_mappings[idx].chosen_variant = chosen_variant; - output.control_replication_map[idx] = - output.task_mappings[idx].target_procs[0]; - } + assert(count == total_nodes); } void FFMapper::select_task_variant(const MapperContext ctx, From 573380d1ef502afdd53a01bffe38b21c2d39cd6f Mon Sep 17 00:00:00 2001 From: Zhihao Jia Date: Thu, 25 Jan 2024 22:51:55 -0500 Subject: [PATCH 12/12] add ncclFinalize --- include/flexflow/model.h | 2 ++ include/flexflow/operator.h | 5 +++ src/runtime/model.cc | 61 +++++++++++++++++++++++++++++++++++++ 3 files changed, 68 insertions(+) diff --git a/include/flexflow/model.h b/include/flexflow/model.h index dd6dc76b4d..95be9ab581 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -202,6 +202,7 @@ enum TaskIDs { // NCCL tasks NCCL_GETUNIQUEID_TASK_ID, NCCL_INIT_COMMS_TASK_ID, + NCCL_FINISH_COMMS_TASK_ID, // Search STRATEGY_SEARCH_TASK_ID, // Graph @@ -397,6 +398,7 @@ std::vector class FFModel { public: FFModel(FFConfig &config, bool cpu_offload = false); + ~FFModel(); static constexpr float PROPAGATION_CHANCE = 0.25; static constexpr float CONTINUE_PROPAGATION_CHANCE = 0.75; diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 73c2c3e092..1b19bdb82f 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -406,6 +406,11 @@ class Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void + finish_nccl_comms_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); #endif protected: void set_argumentmap_for_init(FFModel const &ff, Legion::ArgumentMap &argmap); diff --git a/src/runtime/model.cc b/src/runtime/model.cc index c07c33efca..f9763627c8 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -606,6 +606,15 @@ ncclComm_t Op::init_nccl_comms_task(Task const *task, // ncclComm, allRanks, myRank, ncclId); return ncclComm; } + +void Op::finish_nccl_comms_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + ncclComm_t comm = *((ncclComm_t *)task->local_args); + checkNCCL(ncclCommFinalize(comm)); + checkNCCL(ncclCommDestroy(comm)); +} #endif /** @@ -1578,6 +1587,43 @@ FFModel::FFModel(FFConfig &_config, bool cpu_offload) model_id = model_counter++; } +FFModel::~FFModel() { + // Destroy nccl communication groups +#ifdef FF_USE_NCCL + Context ctx = config.lg_ctx; + Runtime *runtime = config.lg_hlr; + for (auto const &comm : view_hash_to_nccl_comms) { + // Find the machine view that has the hash + MachineView view; + for (size_t l = 0; l < operators.size(); l++) { + view = operators[l]->outputs[0]->machine_view; + if (view.hash() == comm.first) { + break; + } + } + assert(view.hash() == comm.first && "Cannot find the machine view"); + IndexSpace task_is = get_or_create_task_is(view); + Domain domain = runtime->get_index_space_domain(ctx, task_is); + ArgumentMap argmap; + int idx = 0; + for (Domain::DomainPointIterator it(domain); it; it++, idx++) { + argmap.set_point(*it, + TaskArgument(&comm.second[idx], sizeof(ncclComm_t))); + } + IndexLauncher index_launcher(NCCL_FINISH_COMMS_TASK_ID, + task_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + comm.first); + FutureMap fm = runtime->execute_index_space(ctx, index_launcher); + fm.wait_all_results(); + } +#endif +} + void FFModel::clear_graph_search_cache() { this->graph_search->clear_cache(); this->search->clear_cache(); @@ -6853,6 +6899,21 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(NCCL_FINISH_COMMS_TASK_ID, + "NCCL Finish Communicators"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "NCCL Finish Communicators Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } #endif // Search {