# gpu-ci #3229 - workflow file for this run.
# Note: the page this file was copied from flagged it as containing
# bidirectional Unicode text that may be interpreted or compiled differently
# than it appears. To review, open the file in an editor that reveals hidden
# Unicode characters.
# Workflow header: name, triggers and concurrency policy for the GPU CI run.
name: "gpu-ci"

on:
  # Periodic trigger; the only job gated on `schedule` is the runner keep-alive.
  schedule:
    - cron: "0 0 1,14,28 * *"  # At 00:00 on day-of-month 1, 14, and 28.
  # Pushes to the `inference` branch, restricted to paths that affect the
  # build, the test scripts, or this workflow itself.
  push:
    branches:
      - "inference"
    paths:
      - "cmake/**"
      - "config/**"
      - "deps/**"
      - "python/**"
      - "setup.py"
      - "include/**"
      - "inference/**"
      - "src/**"
      - "tests/inference/**"
      - "conda/flexflow.yml"
      - ".github/workflows/gpu-ci.yml"
      - "tests/cpp_gpu_tests.sh"
      - "tests/inference_tests.sh"
      - "tests/training_tests.sh"
      - "tests/python_interface_test.sh"
  # Manual trigger from the Actions UI / API.
  workflow_dispatch:

# NOTE(review): none of the triggers above is `pull_request`, so
# `github.head_ref` is always empty here and the group falls back to the
# per-run `github.run_id`. Every run therefore gets its own group and
# `cancel-in-progress` never cancels anything - confirm this is intended.
concurrency:
  group: gpu-ci-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true
jobs:
  # Gatekeeper job on a GitHub-hosted runner. It runs the repository's
  # gpu_ci_helper.py script; every other job `needs` this one, directly or
  # (for training-tests) through inference-tests.
  # NOTE(review): presumably the helper blocks until the self-hosted GPU
  # runner is available - confirm against the script.
  gpu-ci-concierge:
    name: GPU CI Concierge
    # The ubuntu-20.04 hosted image has been retired by GitHub; 22.04 is the
    # closest image that is still supported.
    runs-on: ubuntu-22.04
    env:
      FLEXFLOW_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    steps:
      - name: Checkout Git Repository
        uses: actions/checkout@v3

      - name: Wait for daemon to be done
        run: |
          pip3 install pip --upgrade
          pip3 install pyopenssl --upgrade
          pip3 install urllib3 --upgrade
          pip3 install pygithub
          python3 .github/workflows/helpers/gpu_ci_helper.py

  # Scheduled runs only: occupies the self-hosted GPU runner for ten minutes
  # so that it stays registered with GitHub.
  keep-runner-registered:
    name: Keep runner alive
    if: ${{ github.event_name == 'schedule' }}
    runs-on: [self-hosted, gpu]
    defaults:
      run:
        shell: bash -l {0}  # required to use an activated conda environment
    env:
      CONDA: "3"
    needs: gpu-ci-concierge
    container:
      image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
      options: --gpus all --shm-size=8192m
    steps:
      - name: Keep alive
        run: |
          echo "Keep self-hosted runner registered with Github"
          sleep 10m

  # Non-scheduled runs: builds FlexFlow with CMake/make, runs the Python
  # interface test script before and after `make install`, then runs the
  # PyTorch alignment test script.
  python-interface-check:
    name: Check Python Interface
    if: ${{ github.event_name != 'schedule' }}
    runs-on: [self-hosted, gpu]
    defaults:
      run:
        shell: bash -l {0}  # required to use an activated conda environment
    env:
      CONDA: "3"
    needs: gpu-ci-concierge
    container:
      image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
      options: --gpus all --shm-size=8192m
    steps:
      - name: Install updated git version
        run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git

      - name: Checkout Git Repository
        uses: actions/checkout@v3
        with:
          submodules: recursive

      - name: Install conda and FlexFlow dependencies
        uses: conda-incubator/setup-miniconda@v2
        with:
          miniconda-version: "latest"
          activate-environment: flexflow
          environment-file: conda/flexflow.yml
          auto-activate-base: false
          auto-update-conda: false

      - name: Install conda and Pytorch dependencies for pytorch alignment test
        run: |
          conda env create -f conda/pytorch-gpu.yml

      - name: Build FlexFlow
        run: |
          export PATH=$CONDA_PREFIX/bin:$PATH
          export FF_HOME=$(pwd)
          export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion
          mkdir build
          cd build
          ../config/config.linux
          make -j

      - name: Check FlexFlow Python interface (before installation)
        run: |
          export PATH=$CONDA_PREFIX/bin:$PATH
          export FF_HOME=$(pwd)
          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
          ./tests/python_interface_test.sh before-installation

      # Re-runs the configure script in the existing build directory, then
      # installs and refreshes the dynamic linker cache.
      - name: Install FlexFlow
        run: |
          export PATH=$CONDA_PREFIX/bin:$PATH
          export FF_HOME=$(pwd)
          export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion
          cd build
          ../config/config.linux
          make install
          ldconfig

      - name: Check FlexFlow Python interface (after installation)
        run: |
          export PATH=$CONDA_PREFIX/bin:$PATH
          export FF_HOME=$(pwd)
          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
          ./tests/python_interface_test.sh after-installation

      - name: Run flexflow alignment with pytorch
        run: |
          # run alingment tests
          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
          ./tests/align/test_all_operators.sh

  # Non-scheduled runs: builds FlexFlow with FF_BUILD_INFERENCE=ON, runs the
  # PEFT and inference test scripts, and always archives and uploads
  # inference/output as an artifact.
  inference-tests:
    name: Inference Tests
    if: ${{ github.event_name != 'schedule' }}
    runs-on: [self-hosted, gpu]
    defaults:
      run:
        shell: bash -l {0}  # required to use an activated conda environment
    env:
      CONDA: "3"
      HUGGINGFACE_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN }}
    needs: gpu-ci-concierge
    container:
      image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
      options: --gpus all --shm-size=8192m
    steps:
      - name: Install updated git version
        run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git

      - name: Checkout Git Repository
        uses: actions/checkout@v3
        with:
          submodules: recursive

      - name: Install conda and FlexFlow dependencies
        uses: conda-incubator/setup-miniconda@v2
        with:
          miniconda-version: "latest"
          activate-environment: flexflow
          environment-file: conda/flexflow.yml
          auto-activate-base: false

      - name: Build FlexFlow
        run: |
          export PATH=$CONDA_PREFIX/bin:$PATH
          export FF_HOME=$(pwd)
          export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion
          export FF_BUILD_INFERENCE=ON
          mkdir build
          cd build
          ../config/config.linux
          make -j

      - name: Run PEFT tests
        run: |
          export PATH=$CONDA_PREFIX/bin:$PATH
          export CUDNN_DIR=/usr/local/cuda
          export CUDA_DIR=/usr/local/cuda
          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
          source ./build/set_python_envs.sh
          ./tests/peft_test.sh

      - name: Run inference tests
        env:
          CPP_INFERENCE_TESTS: ${{ vars.CPP_INFERENCE_TESTS }}
        run: |
          export PATH=$CONDA_PREFIX/bin:$PATH
          export FF_HOME=$(pwd)
          export CUDNN_DIR=/usr/local/cuda
          export CUDA_DIR=/usr/local/cuda
          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
          # GPT tokenizer test
          # ./tests/gpt_tokenizer_test.sh
          # Inference tests
          source ./build/set_python_envs.sh
          ./tests/inference_tests.sh

      # `if: always()` so the output is captured even when a test step failed.
      - name: Save inference output as an artifact
        if: always()
        run: |
          cd inference
          tar -zcvf output.tar.gz ./output

      # upload-artifact v3 is deprecated and no longer accepted on github.com,
      # so v4 is used. This workflow uploads a single artifact named `output`,
      # which satisfies v4's requirement that artifact names be unique per run.
      - name: Upload artifact
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: output
          path: inference/output.tar.gz

      # Github persists the .cache folder across different runs/containers
      - name: Clear cache
        if: always()
        run: sudo rm -rf ~/.cache

  # Non-scheduled runs: pip-installs FlexFlow with training examples and
  # inference enabled, re-checks the Python interface, then runs the C++ and
  # Python multi-GPU test scripts. Waits for inference-tests to finish first.
  training-tests:
    name: Training Tests
    if: ${{ github.event_name != 'schedule' }}
    runs-on: [self-hosted, gpu]
    # skip this time-consuming test for PRs to the inference branch
    # if: ${{ github.event_name != 'pull_request' || github.base_ref != 'inference' }}
    defaults:
      run:
        shell: bash -l {0}  # required to use an activated conda environment
    env:
      CONDA: "3"
    needs: inference-tests
    container:
      image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
      options: --gpus all --shm-size=8192m
    steps:
      - name: Install updated git version
        run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git

      - name: Checkout Git Repository
        uses: actions/checkout@v3
        with:
          submodules: recursive

      - name: Install conda and FlexFlow dependencies
        uses: conda-incubator/setup-miniconda@v2
        with:
          miniconda-version: "latest"
          activate-environment: flexflow
          environment-file: conda/flexflow.yml
          auto-activate-base: false

      - name: Build and Install FlexFlow
        run: |
          export PATH=$CONDA_PREFIX/bin:$PATH
          export FF_HOME=$(pwd)
          export FF_BUILD_TRAINING_EXAMPLES=ON
          export FF_BUILD_INFERENCE=ON
          export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion
          pip install . --verbose

      - name: Check FlexFlow Python interface (pip)
        run: |
          export PATH=$CONDA_PREFIX/bin:$PATH
          export FF_HOME=$(pwd)
          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
          ./tests/python_interface_test.sh after-installation

      # Both test scripts are passed the argument 4.
      # NOTE(review): presumably the GPU count - confirm against the scripts.
      - name: Run multi-gpu tests
        run: |
          export PATH=$CONDA_PREFIX/bin:$PATH
          export CUDNN_DIR=/usr/local/cuda
          export CUDA_DIR=/usr/local/cuda
          export FF_HOME=$(pwd)
          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
          # C++ tests
          ./tests/cpp_gpu_tests.sh 4
          # Python tests
          ./tests/training_tests.sh 4