Add a new py3-functional-gpu workflow and tox env

This runs our functional tests that require a GPU as a separate workflow item, since they are neither the instructlab e2e tests nor the CPU-only unit or functional tests that run in the GitHub runners.

Signed-off-by: Ben Browning <[email protected]>
instructlab · Jan 7, 2025 · 6d399f2 · 6d399f2
1 parent 8b86c75
commit 6d399f2
Show file tree

Hide file tree

Showing 6 changed files with 175 additions and 5 deletions.
diff --git a/.github/workflows/functional-gpu-nvidia-t4-x1.yml b/.github/workflows/functional-gpu-nvidia-t4-x1.yml
@@ -0,0 +1,157 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# Runs the GPU-marked functional tests on a dedicated EC2 runner.
+name: Functional GPU (NVIDIA Tesla T4 x1)
+
+on:
+  # Temporary: lets the PR that edits this workflow file exercise it.
+  pull_request:
+    paths: ['.github/workflows/functional-gpu-nvidia-t4-x1.yml']
+  # Every commit that lands on 'main' or a release branch.
+  push:
+    branches: [main, 'release-*']
+  # PRs targeting 'main' / release branches, limited to the path globs below.
+  pull_request_target:
+    branches: [main, 'release-*']
+    paths:
+      # Keep this list in sync with the merging criteria in 'mergify.yml'.
+      - '**.py'
+      - 'pyproject.toml'
+      - 'requirements**.txt'
+      - '.github/workflows/functional-gpu-nvidia-t4-x1.yml'  # this workflow
+
+# One live run per PR (or per ref when there is no PR); a newer run cancels
+# the one still in progress.
+concurrency:
+  cancel-in-progress: true
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+
+permissions:
+  contents: read
+
+defaults:
+  run:
+    shell: bash
+
+env:
+  LC_ALL: en_US.UTF-8
+
+jobs:
+  # Starts the EC2 instance used as the runner for the test job and publishes
+  # its runner label and instance id as job outputs.
+  start-small-ec2-runner:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
+        with:
+          aws-region: ${{ vars.AWS_REGION }}
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+
+      - id: start-ec2-runner
+        name: Start EC2 runner
+        uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7
+        with:
+          mode: start
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          iam-role-name: instructlab-ci-runner
+          ec2-instance-type: g4dn.2xlarge
+          ec2-image-id: ${{ vars.AWS_EC2_AMI }}
+          subnet-id: subnet-02d230cffd9385bd4
+          security-group-id: sg-06300447c4a5fbef3
+          # Tags record which repository, ref and PR the instance belongs to.
+          aws-resource-tags: >
+            [
+              {"Key": "Name", "Value": "instructlab-ci-github-small-runner"},
+              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
+              {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
+              {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
+            ]
+    # Consumed by the test job (runs-on) and by the stop job (label + id).
+    outputs:
+      label: ${{ steps.start-ec2-runner.outputs.label }}
+      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
+
+  # Runs the GPU functional tests on the EC2 runner that
+  # 'start-small-ec2-runner' brought up.
+  functional-gpu-small-test:
+    needs:
+      - start-small-ec2-runner
+    runs-on: ${{ needs.start-small-ec2-runner.outputs.label }}
+
+    # It is important that this job has no write permissions and has
+    # no access to any secrets. This part is where we are running
+    # untrusted code from PRs.
+    permissions: {}
+
+    steps:
+      - name: Install Packages
+        run: |
+          cat /etc/os-release
+          sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel
+
+      - name: Checkout instructlab/sdg
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          repository: "instructlab/sdg"
+          path: "sdg"
+          # https://github.com/actions/checkout/issues/249
+          fetch-depth: 0
+
+      # Only pull_request_target runs need this: replace the default checkout
+      # with the head of the PR under test.
+      - name: Fetch and checkout PR
+        if: github.event_name == 'pull_request_target'
+        working-directory: ./sdg
+        run: |
+          git fetch origin pull/${{ github.event.pull_request.number }}/head:pr-${{ github.event.pull_request.number }}
+          git checkout pr-${{ github.event.pull_request.number }}
+
+      # The repository is checked out into ./sdg (see 'path' above), so the
+      # virtualenv created here lives at ./sdg/venv.
+      # NOTE(review): CMAKE_ARGS only applies to this pip command; the tox env
+      # installs requirements-dev-gpu.txt on its own - confirm llama-cpp-python
+      # gets built with CUDA there.
+      - name: Install instructlab/sdg
+        working-directory: ./sdg
+        run: |
+          export PATH="/home/ec2-user/.local/bin:/usr/local/cuda/bin:$PATH"
+          python3.11 -m venv --upgrade-deps venv
+          . venv/bin/activate
+          nvidia-smi
+          python3.11 -m pip cache remove llama_cpp_python
+
+          CMAKE_ARGS="-DLLAMA_CUDA=on" python3.11 -m pip install -r requirements-dev.txt
+
+      - name: Check disk before tests
+        run: |
+          df -h
+
+      # Every step starts a fresh shell in the workspace root, so move back
+      # into the checkout (where tox.ini lives) and re-activate the venv that
+      # the install step populated before calling tox.
+      - name: Run functional gpu tests with tox
+        working-directory: ./sdg
+        run: |
+          . venv/bin/activate
+          tox -e py3-functional-gpu
+
+      - name: Check disk after tests
+        run: |
+          df -h
+
+  # Shuts the EC2 runner down again. 'always()' makes this run even when the
+  # start or test job did not succeed.
+  stop-small-ec2-runner:
+    if: ${{ always() }}
+    runs-on: ubuntu-latest
+    needs: [start-small-ec2-runner, functional-gpu-small-test]
+    steps:
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
+        with:
+          aws-region: ${{ vars.AWS_REGION }}
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+
+      - name: Stop EC2 runner
+        uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7
+        with:
+          mode: stop
+          # Label and instance id come from the outputs of the start job.
+          ec2-instance-id: ${{ needs.start-small-ec2-runner.outputs.ec2-instance-id }}
+          label: ${{ needs.start-small-ec2-runner.outputs.label }}
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+
+  # Final marker job. It deliberately leaves "stop-small-ec2-runner" out of
+  # 'needs' so that a failed EC2 cleanup does not block PRs.
+  functional-gpu-small-workflow-complete:
+    runs-on: ubuntu-latest
+    needs:
+      - start-small-ec2-runner
+      - functional-gpu-small-test
+    steps:
+      - name: Functional GPU Workflow Complete
+        run: echo "Functional GPU Workflow Complete"
diff --git a/pyproject.toml b/pyproject.toml
@@ -105,5 +105,5 @@ follow_imports = "silent"
 
 [tool.pytest.ini_options]
 markers = [
-    "slow: marks tests that are slow (deselect with '-m \"not slow\"')",
+    "gpu: marks tests that should run with gpus (deselect with '-m \"not gpu\"')",
 ]
diff --git a/requirements-dev-gpu.txt b/requirements-dev-gpu.txt
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# Pull in everything from the regular development requirements.
+-r requirements-dev.txt
+
+# NOTE(review): presumably needed by the GPU-marked functional tests - confirm.
+llama-cpp-python[server]>=0.3.0,<1.0.0
diff --git a/tests/functional/test_full_pipeline.py b/tests/functional/test_full_pipeline.py
@@ -36,7 +36,7 @@ def pipelines_package():
     return FULL_PIPELINES_PACKAGE
 
 
-@pytest.mark.slow
+@pytest.mark.gpu
 class TestFullPipeline(unittest.TestCase):
     @pytest.fixture(autouse=True)
     def _setup_fixtures(self, knowledge_dataset, knowledge_pipeline):

diff --git a/tests/functional/test_simple_pipeline.py b/tests/functional/test_simple_pipeline.py
@@ -34,7 +34,7 @@ def pipelines_package():
     return SIMPLE_PIPELINES_PACKAGE
 
 
-@pytest.mark.slow
+@pytest.mark.gpu
 class TestSimplePipeline(unittest.TestCase):
     @pytest.fixture(autouse=True)
     def _setup_fixtures(self, knowledge_dataset, knowledge_pipeline):

diff --git a/tox.ini b/tox.ini
@@ -17,8 +17,8 @@ wheel_build_env = pkg
 deps = -r requirements-dev.txt
 commands =
     unit: {envpython} -m pytest {posargs:tests --ignore=tests/functional}
-    unitcov: {envpython} -W error::UserWarning -m pytest --cov=instructlab.sdg --cov-report term --cov-report=html:coverage-{env_name} --cov-report=xml:coverage-{env_name}.xml --html=durations/{env_name}.html {posargs:tests --ignore=tests/functional -m "not (examples or slow)"}
-    functional: {envpython} -m pytest {posargs:tests/functional}
+    unitcov: {envpython} -W error::UserWarning -m pytest --cov=instructlab.sdg --cov-report term --cov-report=html:coverage-{env_name} --cov-report=xml:coverage-{env_name}.xml --html=durations/{env_name}.html {posargs:tests --ignore=tests/functional -m "not (examples or gpu)"}
+    functional: {envpython} -m pytest {posargs:tests/functional -m "not gpu"}
 
 # format, check, and linting targets don't build and install the project to
 # speed up testing.
@@ -84,6 +84,14 @@ deps = -r requirements-dev.txt
 commands =
     {envpython} ./scripts/validate_pipelines.py
 
+[testenv:py3-functional-gpu]
+# By default only tests under tests/functional that carry the "gpu" pytest
+# marker are selected; positional arguments passed to tox replace that default.
+description = run functional tests that require a GPU
+deps =
+    -r requirements-dev-gpu.txt
+package = wheel
+wheel_build_env = pkg
+commands =
+    {envpython} -m pytest {posargs:tests/functional -m "gpu"}
+
 [gh]
 python =
     3.11 = py311-{unitcov, functional}