Merge branch 'inference' into mixtral
goliaro committed Nov 28, 2024
2 parents 3d355c7 + 5185438 commit 59a47a4
Showing 409 changed files with 43,717 additions and 11,324 deletions.
8 changes: 1 addition & 7 deletions .github/README.md
@@ -4,12 +4,6 @@

---

## News🔥:

* [09/02/2023] Adding AMD GPU support, released Docker images for ROCM 5.3->5.6
* [08/16/2023] Adding Starcoder model support
* [08/14/2023] Released Docker images for different CUDA versions

## What is FlexFlow Serve

The high computational and memory requirements of generative large language
@@ -178,7 +172,7 @@ A C++ example is available at [this folder](../inference/spec_infer/). After bui
For example, you can use the following command line to serve a LLaMA-7B or LLaMA-13B model on 4 GPUs and use two collectively boost-tuned LLaMA-68M models for speculative inference.

```bash
./inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -tensor-parallelism-degree 4 --fusion
./inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -tensor-parallelism-degree 4 --fusion
```
</details>
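
The only change in this hunk is the new `-ll:cpu 4` flag, which reserves four CPU processors for the Legion runtime alongside the four GPUs. A minimal end-to-end sketch of the updated invocation is below; the prompt-file layout (a JSON array of prompt strings) and the file paths are assumptions, and `-ll:fsize`/`-ll:zsize` should be sized to the available GPU and zero-copy memory.

```bash
# Sketch only: the prompt-file layout (a JSON array of prompt strings) is an
# assumption, and the /tmp path is a placeholder.
cat > /tmp/prompt.json <<'EOF'
[
  "Give three tips for staying healthy.",
  "What are the names of some famous actors that started their careers on Broadway?"
]
EOF

# Serve LLaMA-2-7B on 4 GPUs with a LLaMA-68M draft model for speculative inference,
# mirroring the command added in this diff (-ll:cpu 4 reserves 4 CPU processors for Legion).
./inference/spec_infer/spec_infer \
  -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 \
  -llm-model meta-llama/Llama-2-7b-hf \
  -ssm-model JackFram/llama-68m \
  -prompt /tmp/prompt.json \
  -tensor-parallelism-degree 4 --fusion
```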

24 changes: 13 additions & 11 deletions .github/workflows/build.yml
@@ -52,13 +52,14 @@ jobs:
run: .github/workflows/helpers/free_space_on_runner.sh

- name: Install CUDA
uses: Jimver/cuda-toolkit@v0.2.11
uses: Jimver/cuda-toolkit@v0.2.16
if: ${{ matrix.gpu_backend == 'cuda' }}
id: cuda-toolkit
with:
cuda: "11.8.0"
cuda: "12.1.1"
# Disable caching of the CUDA binaries, since it does not give us any significant performance improvement
use-github-cache: "false"
log-file-suffix: 'cmake_${{matrix.gpu_backend}}.txt'

- name: Install system dependencies
run: .github/workflows/helpers/install_dependencies.sh
@@ -78,13 +79,13 @@ jobs:
export FF_CUDA_ARCH=70
export FF_HIP_ARCH=gfx1100,gfx1036
export hip_version=5.6
export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON
export FF_BUILD_INFERENCE=ON
if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then
export FF_BUILD_ALL_EXAMPLES=ON
export FF_BUILD_TRAINING_EXAMPLES=ON
export FF_BUILD_UNIT_TESTS=ON
else
export FF_BUILD_ALL_EXAMPLES=OFF
export FF_BUILD_TRAINING_EXAMPLES=OFF
export FF_BUILD_UNIT_TESTS=OFF
fi
@@ -105,13 +106,13 @@ jobs:
export FF_CUDA_ARCH=70
export FF_HIP_ARCH=gfx1100,gfx1036
export hip_version=5.6
export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON
export FF_BUILD_INFERENCE=ON
if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then
export FF_BUILD_ALL_EXAMPLES=ON
export FF_BUILD_TRAINING_EXAMPLES=ON
export FF_BUILD_UNIT_TESTS=ON
else
export FF_BUILD_ALL_EXAMPLES=OFF
export FF_BUILD_TRAINING_EXAMPLES=OFF
export FF_BUILD_UNIT_TESTS=OFF
fi
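
Both build jobs apply the same rename: `FF_BUILD_ALL_INFERENCE_EXAMPLES` becomes `FF_BUILD_INFERENCE`, and `FF_BUILD_ALL_EXAMPLES` becomes `FF_BUILD_TRAINING_EXAMPLES`, so training examples and unit tests are only built for the CUDA backend. A standalone sketch of the resulting flag logic (the `FF_GPU_BACKEND` default here is only a placeholder for whatever the job matrix supplies):

```bash
# Sketch of the flag selection shared by both hunks above; FF_GPU_BACKEND
# normally comes from the workflow matrix, the default is for local testing.
FF_GPU_BACKEND="${FF_GPU_BACKEND:-cuda}"

export FF_BUILD_INFERENCE=ON             # inference targets are always built
if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then
  export FF_BUILD_TRAINING_EXAMPLES=ON   # training examples and unit tests only on CUDA
  export FF_BUILD_UNIT_TESTS=ON
else
  export FF_BUILD_TRAINING_EXAMPLES=OFF
  export FF_BUILD_UNIT_TESTS=OFF
fi
```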
@@ -156,11 +157,12 @@ jobs:
run: .github/workflows/helpers/free_space_on_runner.sh

- name: Install CUDA
uses: Jimver/cuda-toolkit@v0.2.11
uses: Jimver/cuda-toolkit@v0.2.16
id: cuda-toolkit
with:
cuda: "11.8.0"
cuda: "12.1.1"
use-github-cache: "false"
log-file-suffix: 'makefile_${{matrix.gpu_backend}}.txt'

- name: Install system dependencies
run: .github/workflows/helpers/install_dependencies.sh
@@ -169,7 +171,7 @@
uses: conda-incubator/setup-miniconda@v2
with:
activate-environment: flexflow
environment-file: conda/environment.yml
environment-file: conda/flexflow.yml
auto-activate-base: false
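
The Makefile job now builds its environment from `conda/flexflow.yml` instead of `conda/environment.yml`. To reproduce that environment outside CI, a sketch along these lines should work (assumes conda is installed; the environment name `flexflow` matches the `activate-environment` value above):

```bash
# Sketch: recreate the conda environment the workflow now uses.
conda env create -f conda/flexflow.yml
conda activate flexflow
```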

- name: Build FlexFlow
99 changes: 53 additions & 46 deletions .github/workflows/docker-build.yml
@@ -9,9 +9,9 @@ on:
branches:
- "inference"
- "master"
# schedule:
# # Run every week on Sunday at midnight PT (3am ET / 8am UTC) to keep the docker images updated
# - cron: "0 8 * * 0"
schedule:
# At 00:00 on day-of-month 1, 14, and 28.
- cron: "0 0 1,14,28 * *"
workflow_dispatch:

# Cancel outdated workflows if they are still running
@@ -20,26 +20,22 @@ concurrency:
cancel-in-progress: true

jobs:
oracle-runner-start:
name: Start an Oracle instance to build the ROCM Docker images
rocm-builder-start:
name: Start an AWS instance to build the ROCM Docker images
runs-on: ubuntu-latest
if: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
env:
OCI_CLI_USER: ${{ secrets.OCI_CLI_USER }}
OCI_CLI_TENANCY: ${{ secrets.OCI_CLI_TENANCY }}
OCI_CLI_FINGERPRINT: ${{ secrets.OCI_CLI_FINGERPRINT }}
OCI_CLI_KEY_CONTENT: ${{ secrets.OCI_CLI_KEY_CONTENT }}
OCI_CLI_REGION: ${{ secrets.OCI_CLI_REGION }}
OCI_INSTANCE_ID: ${{ secrets.OCI_INSTANCE_ID }}
ROCM_BUILDER_INSTANCE_ID: ${{ secrets.ROCM_BUILDER_INSTANCE_ID }}
steps:
- name: Checkout Git Repository
uses: actions/checkout@v3
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: us-east-2

- name: Install Oracle Cloud Infrastructure library
run: pip install oci

- name: Start Oracle Machine
run: python3 .github/workflows/helpers/oracle_con.py --start --instance_id $OCI_INSTANCE_ID
- name: Start EC2 instance
run: aws ec2 start-instances --instance-ids $ROCM_BUILDER_INSTANCE_ID
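
`aws ec2 start-instances` only requests the state transition and returns immediately. If the downstream ROCm jobs need the builder to be fully booted before they are queued, a wait step could be appended; this is a sketch, not part of this commit, and it reuses the credentials configured in the previous step:

```bash
# Sketch only; not part of this commit. Blocks until the instance reports
# "running", then until both EC2 status checks pass.
aws ec2 wait instance-running --instance-ids "$ROCM_BUILDER_INSTANCE_ID"
aws ec2 wait instance-status-ok --instance-ids "$ROCM_BUILDER_INSTANCE_ID"
```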

docker-build-rocm:
name: Build and Install FlexFlow in a Docker Container (ROCm backend)
@@ -62,13 +58,28 @@ jobs:

- name: Check availability of flexflow modules in Python
run: docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${hip_version}:latest -c "python -c 'import flexflow.core; import flexflow.serve as ff; exit()'"


keep-runner-registered:
name: Keep runner alive
if: ${{ github.event_name == 'schedule' }}
runs-on: [self-hosted, rocm_builder]
defaults:
run:
shell: bash -l {0} # required to use an activated conda environment
env:
CONDA: "3"
needs: rocm-builder-start
steps:
- name: Keep alive
run: |
echo "Keep self-hosted runner registered with Github"
sleep 10m
docker-build-and-publish-rocm:
name: Build and Deploy FlexFlow Docker Containers (ROCm backend)
needs: oracle-runner-start
runs-on: [self-hosted, cpu_only]
if: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
needs: rocm-builder-start
runs-on: [self-hosted, rocm_builder]
if: ${{ ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
strategy:
matrix:
hip_version: ["5.3", "5.4", "5.5", "5.6"]
@@ -110,19 +121,19 @@ jobs:
cuda_version: ${{ matrix.cuda_version }}
steps:
- name: Checkout Git Repository
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
uses: actions/checkout@v3
with:
submodules: recursive

- name: Free additional space on runner
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
run: .github/workflows/helpers/free_space_on_runner.sh

- name: Build Docker container
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
env:
deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
build_needed: ${{ matrix.cuda_version == '12.0' }}
run: |
# On push to inference, build for all compatible architectures, so that we can publish
@@ -137,44 +148,40 @@
fi
- name: Check availability of flexflow modules in Python
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
run: docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${cuda_version}:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; import flexflow.serve as ff; exit()'"
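
The module check runs the import inside the freshly built container, pointing the dynamic loader at the CUDA stub driver, presumably because the runner has no GPU attached. The same one-liner split out for readability (identical commands, no new behavior):

```bash
# Same sequence as the one-line check above, run inside the container.
# The CUDA stub libcuda stands in for a real driver on a GPU-less runner.
export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH
sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
python -c 'import flexflow.core; import flexflow.serve as ff; exit()'
```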

- name: Publish Docker environment image (on push to inference)
if: ${{ github.repository_owner == 'flexflow' && ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
if: ${{ github.repository_owner == 'flexflow' && ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
env:
FLEXFLOW_CONTAINER_TOKEN: ${{ secrets.FLEXFLOW_CONTAINER_TOKEN }}
run: |
./docker/publish.sh flexflow-environment
./docker/publish.sh flexflow
oracle-runner-stop:
needs: docker-build-and-publish-rocm
rocm-builder-stop:
needs: [docker-build-and-publish-rocm, keep-runner-registered]
if: ${{ always() && ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
runs-on: ubuntu-latest
name: Stop the Oracle instance we used to build the ROCM Docker images
name: Stop the AWS instance we used to build the ROCM Docker images
env:
OCI_CLI_USER: ${{ secrets.OCI_CLI_USER }}
OCI_CLI_TENANCY: ${{ secrets.OCI_CLI_TENANCY }}
OCI_CLI_FINGERPRINT: ${{ secrets.OCI_CLI_FINGERPRINT }}
OCI_CLI_KEY_CONTENT: ${{ secrets.OCI_CLI_KEY_CONTENT }}
OCI_CLI_REGION: ${{ secrets.OCI_CLI_REGION }}
OCI_INSTANCE_ID: ${{ secrets.OCI_INSTANCE_ID }}
ROCM_BUILDER_INSTANCE_ID: ${{ secrets.ROCM_BUILDER_INSTANCE_ID }}
steps:
- name: Checkout Git Repository
uses: actions/checkout@v3

- name: Install Oracle Cloud Infrastructure library
run: pip install oci

- name: Stop Oracle Machine
run: python3 .github/workflows/helpers/oracle_con.py --stop --instance_id $OCI_INSTANCE_ID
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: us-east-2

- name: Stop EC2 instance
run: aws ec2 stop-instances --instance-ids $ROCM_BUILDER_INSTANCE_ID
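
As with startup, `stop-instances` returns before the instance is actually down. A follow-up check, sketched below and not part of this commit, would confirm the builder really reached the `stopped` state so it does not keep accruing cost:

```bash
# Sketch only; not part of this commit. Waits for the instance to stop and
# prints its final state for the workflow log.
aws ec2 wait instance-stopped --instance-ids "$ROCM_BUILDER_INSTANCE_ID"
aws ec2 describe-instances --instance-ids "$ROCM_BUILDER_INSTANCE_ID" \
  --query 'Reservations[0].Instances[0].State.Name' --output text
```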

notify-slack:
name: Notify Slack in case of failure
runs-on: ubuntu-20.04
needs: [docker-build-cuda, docker-build-and-publish-rocm]
if: ${{ failure() && github.event_name == 'schedule' && github.repository_owner == 'flexflow' }}
if: ${{ failure() && github.event_name == 'workflow_dispatch' && github.repository_owner == 'flexflow' }}
steps:
- name: Send Slack message
env: