diff --git a/.github/mergify.yml b/.github/mergify.yml index 7fb20bd..7e810ae 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -25,21 +25,21 @@ pull_request_rules: - -files~=^\.github/(actions|workflows)/.*\.ya?ml$ - -files~=^\.github/workflows/actionlint\. - # e2e workflow + # e2e medium workflow - or: - and: - # note this should match the triggering criteria in 'e2e-nvidia-t4-x1.yml' - - check-success=e2e-workflow-complete + # note this should match the triggering criteria in 'e2e-nvidia-a10g-x1.yml' + - check-success~=e2e-medium-workflow-complete - or: - files~=\.py$ - files=pyproject.toml - files~=^requirements.*\.txt$ - - files=.github/workflows/e2e-nvidia-t4-x1.yml + - files=.github/workflows/e2e-nvidia-a10g-x1.yml - and: - -files~=\.py$ - -files=pyproject.toml - -files~=^requirements.*\.txt$ - - -files=.github/workflows/e2e-nvidia-t4-x1.yml + - -files=.github/workflows/e2e-nvidia-a10g-x1.yml # lint must pass if files change that would trigger this job - or: diff --git a/.github/workflows/e2e-nvidia-t4-x1.yml b/.github/workflows/e2e-nvidia-a10g-x1.yml similarity index 75% rename from .github/workflows/e2e-nvidia-t4-x1.yml rename to .github/workflows/e2e-nvidia-a10g-x1.yml index bef5be6..9a0807c 100644 --- a/.github/workflows/e2e-nvidia-t4-x1.yml +++ b/.github/workflows/e2e-nvidia-a10g-x1.yml @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -name: E2E (NVIDIA Tesla T4 x1) +name: E2E (NVIDIA A10G x1) on: # run against every merge commit to 'main' and release branches @@ -10,10 +10,6 @@ on: - release-* # only run on PRs that touch certain regex paths pull_request_target: - types: - - opened - - synchronize - - reopened branches: - main - release-* @@ -22,15 +18,24 @@ on: - '**.py' - 'pyproject.toml' - 'requirements**.txt' - - '.github/workflows/e2e-nvidia-t4-x1.yml' # This workflow + - '.github/workflows/e2e-nvidia-a10g-x1.yml' # This workflow concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true +env: + LC_ALL: en_US.UTF-8 + +defaults: + run: + shell: bash + +permissions: + contents: read + jobs: - start-runner: - name: Start external EC2 runner + start-medium-ec2-runner: runs-on: ubuntu-latest outputs: label: ${{ steps.start-ec2-runner.outputs.label }} @@ -42,43 +47,41 @@ jobs: aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} aws-region: ${{ secrets.AWS_REGION }} + - name: Start EC2 runner id: start-ec2-runner uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6 with: mode: start github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} - ec2-image-id: ami-00c51d9c1374eda97 - ec2-instance-type: g4dn.2xlarge + ec2-image-id: ami-01a89eee1adde309c + ec2-instance-type: g5.4xlarge subnet-id: subnet-02d230cffd9385bd4 security-group-id: sg-06300447c4a5fbef3 iam-role-name: instructlab-ci-runner aws-resource-tags: > [ - {"Key": "Name", "Value": "instructlab-ci-github-small-runner"}, + {"Key": "Name", "Value": "instructlab-ci-github-medium-runner"}, {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}, {"Key": "GitHubRef", "Value": "${{ github.ref }}"}, {"Key": "GitHubPR", "Value": "${{ github.event.number }}"} ] - e2e: - name: E2E Test - needs: start-runner - runs-on: ${{ needs.start-runner.outputs.label }} + e2e-medium-test: + needs: + - start-medium-ec2-runner + runs-on: ${{ needs.start-medium-ec2-runner.outputs.label }} # It is important that this job has no write permissions and has # no access to any secrets. This part (e2e) is where we are running # untrusted code from PRs. permissions: {} - # No step-security/harden-runner since this is a self-hosted runner steps: - # for debugging - - name: Print environment state + - name: Install Packages run: | - echo "Current Working Directory: $PWD" - echo "Files in Local Directory:" - ls -l + cat /etc/os-release + sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel - name: Checkout instructlab/instructlab uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 @@ -95,13 +98,6 @@ jobs: # https://github.com/actions/checkout/issues/249 fetch-depth: 0 - # for debugging - - name: Print environment state - run: | - echo "Current Working Directory: $PWD" - echo "Files in Local Directory:" - ls -l - - name: Fetch and checkout PR id: fetch_pr if: github.event_name == 'pull_request_target' @@ -110,15 +106,12 @@ jobs: git fetch origin pull/${{ github.event.pull_request.number }}/head:pr-${{ github.event.pull_request.number }} git checkout pr-${{ github.event.pull_request.number }} - - name: Install system packages - run: | - cat /etc/os-release - sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel - - - name: Install instructlab + - name: Install ilab working-directory: ./instructlab run: | - export PATH="/home/ec2-user/.local/bin:/usr/local/cuda/bin:$PATH" + export CUDA_HOME="/usr/local/cuda" + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" + export PATH="$PATH:$CUDA_HOME/bin" python3.11 -m venv --upgrade-deps venv . venv/bin/activate nvidia-smi @@ -129,26 +122,26 @@ jobs: # https://github.com/instructlab/instructlab/issues/1821 # install with Torch and build dependencies installed python3.11 -m pip install packaging wheel setuptools-scm - python3.11 -m pip install .[cuda] + python3.11 -m pip install .[cuda] -r requirements-vllm-cuda.txt - name: Update instructlab-eval library working-directory: ./eval run: | . ../instructlab/venv/bin/activate pip install . - pip install .[cuda] - name: Run e2e test working-directory: ./instructlab + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} run: | . venv/bin/activate - ./scripts/e2e-custom.sh -msq + ./scripts/e2e-ci.sh -m - stop-runner: - name: Stop external EC2 runner + stop-medium-ec2-runner: needs: - - start-runner - - e2e + - start-medium-ec2-runner + - e2e-medium-test runs-on: ubuntu-latest if: ${{ always() }} steps: @@ -163,13 +156,13 @@ jobs: with: mode: stop github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} - label: ${{ needs.start-runner.outputs.label }} - ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }} + label: ${{ needs.start-medium-ec2-runner.outputs.label }} + ec2-instance-id: ${{ needs.start-medium-ec2-runner.outputs.ec2-instance-id }} - e2e-workflow-complete: + e2e-medium-workflow-complete: # we don't want to block PRs on failed EC2 cleanup # so not requiring "stop-runner" as well - needs: ["start-runner", "e2e"] + needs: ["start-medium-ec2-runner", "e2e-medium-test"] runs-on: ubuntu-latest steps: - name: E2E Workflow Complete diff --git a/README.md b/README.md index 55af0e2..049dd15 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ # eval ![Lint](https://github.com/instructlab/eval/actions/workflows/lint.yml/badge.svg?branch=main) +![`e2e-nvidia-a10g-x1.yaml` on `main`](https://github.com/instructlab/eval/actions/workflows/e2e-nvidia-a10g-x1.yml/badge.svg?branch=main) ![Build](https://github.com/instructlab/eval/actions/workflows/pypi.yaml/badge.svg?branch=main) ![Release](https://img.shields.io/github/v/release/instructlab/eval) ![License](https://img.shields.io/github/license/instructlab/eval)