chore(ci): fallback on permanent h100 instance on shortage

When n3-H100x1 instances are in short supply on Hyperstack, we fall back to the permanent instance registered on GitHub. A job is routed to it by using 'h100x1' as its runner label.
zama-ai · Feb 28, 2025 · 48fee2a · 48fee2a
1 parent a508f4c
commit 48fee2a
Show file tree

Hide file tree

Showing 7 changed files with 92 additions and 21 deletions.
diff --git a/.github/workflows/benchmark_gpu_core_crypto.yml b/.github/workflows/benchmark_gpu_core_crypto.yml
@@ -23,10 +23,12 @@ jobs:
     if: github.event_name != 'schedule' ||
       (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
     outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
+      runner-name: ${{ steps.start-remote-instance.outputs.label  || steps.use-permanent-instance.outputs.runner_group }}
+      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
     steps:
-      - name: Start instance
-        id: start-instance
+      - name: Start remote instance
+        id: start-remote-instance
+        continue-on-error: true
         uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
         with:
           mode: start
@@ -36,6 +38,13 @@ jobs:
           backend: hyperstack
           profile: single-h100
 
+      # Fallback: when the on-demand Hyperstack instance could not be started
+      # (e.g. H100 shortage), expose the permanent runner label 'h100x1' so that
+      # the runner-name job output resolves to it.
+      # The start step sets continue-on-error, so the job status never becomes
+      # "failure" and failure() would always be false here; check the step's
+      # raw outcome instead.
+      # NOTE(review): confirm SECRETS_AVAILABLE is defined in this workflow's env;
+      # if it is not, this condition is always false.
+      - name: Use permanent remote instance
+        id: use-permanent-instance
+        if: ${{ env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure' }}
+        run: |
+          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
+
   cuda-core-crypto-benchmarks:
     name: Execute GPU core crypto benchmarks
     needs: setup-instance
@@ -57,6 +66,7 @@ jobs:
           token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
 
       - name: Setup Hyperstack dependencies
+        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
         uses: ./.github/actions/gpu_setup
         with:
           cuda-version: ${{ matrix.cuda }}
@@ -128,7 +138,7 @@ jobs:
 
   teardown-instance:
     name: Teardown instance (cuda-integer-full-benchmarks)
-    if: ${{ always() && needs.setup-instance.result == 'success' }}
+    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
     needs: [ setup-instance, cuda-core-crypto-benchmarks, slack-notify ]
     runs-on: ubuntu-latest
     steps:

diff --git a/.github/workflows/benchmark_gpu_erc20_common.yml b/.github/workflows/benchmark_gpu_erc20_common.yml
@@ -50,10 +50,12 @@ jobs:
     if:  github.event_name == 'workflow_dispatch' ||
       (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
     outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
+      runner-name: ${{ steps.start-remote-instance.outputs.label  || steps.use-permanent-instance.outputs.runner_group }}
+      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
     steps:
-      - name: Start instance
-        id: start-instance
+      - name: Start remote instance
+        id: start-remote-instance
+        continue-on-error: true
         uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
         with:
           mode: start
@@ -63,6 +65,13 @@ jobs:
           backend: ${{ inputs.backend }}
           profile: ${{ inputs.profile }}
 
+      # Fallback: when the on-demand instance could not be started (e.g. H100
+      # shortage), expose the permanent runner label 'h100x1' so that the
+      # runner-name job output resolves to it. Only applies to the single-h100
+      # profile, since the permanent runner is an H100x1.
+      # The start step sets continue-on-error, so the job status never becomes
+      # "failure" and failure() would always be false here; check the step's
+      # raw outcome instead.
+      # NOTE(review): confirm SECRETS_AVAILABLE is defined in this workflow's env;
+      # if it is not, this condition is always false.
+      - name: Use permanent remote instance
+        id: use-permanent-instance
+        if: ${{ env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure' && inputs.profile == 'single-h100' }}
+        run: |
+          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
+
   cuda-erc20-benchmarks:
     name: Cuda ERC20 benchmarks (${{ inputs.profile }})
     needs: setup-instance
@@ -84,6 +93,7 @@ jobs:
           token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
 
       - name: Setup Hyperstack dependencies
+        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
         uses: ./.github/actions/gpu_setup
         with:
           cuda-version: ${{ matrix.cuda }}
@@ -154,7 +164,7 @@ jobs:
 
   teardown-instance:
     name: Teardown instance (cuda-erc20-${{ inputs.profile }}-benchmarks)
-    if: ${{ always() && needs.setup-instance.result == 'success' }}
+    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
     needs: [ setup-instance, cuda-erc20-benchmarks, slack-notify ]
     runs-on: ubuntu-latest
     steps:

diff --git a/.github/workflows/benchmark_gpu_integer_common.yml b/.github/workflows/benchmark_gpu_integer_common.yml
@@ -114,10 +114,12 @@ jobs:
     needs: prepare-matrix
     runs-on: ubuntu-latest
     outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
+      runner-name: ${{ steps.start-remote-instance.outputs.label  || steps.use-permanent-instance.outputs.runner_group }}
+      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
     steps:
-      - name: Start instance
-        id: start-instance
+      - name: Start remote instance
+        id: start-remote-instance
+        continue-on-error: true
         uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
         with:
           mode: start
@@ -127,6 +129,13 @@ jobs:
           backend: ${{ inputs.backend }}
           profile: ${{ inputs.profile }}
 
+      # Fallback: when the on-demand instance could not be started (e.g. H100
+      # shortage), expose the permanent runner label 'h100x1' so that the
+      # runner-name job output resolves to it. Only applies to the single-h100
+      # profile, since the permanent runner is an H100x1.
+      # The start step sets continue-on-error, so the job status never becomes
+      # "failure" and failure() would always be false here; check the step's
+      # raw outcome instead.
+      # NOTE(review): confirm SECRETS_AVAILABLE is defined in this workflow's env;
+      # if it is not, this condition is always false.
+      - name: Use permanent remote instance
+        id: use-permanent-instance
+        if: ${{ env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure' && inputs.profile == 'single-h100' }}
+        run: |
+          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
+
   cuda-benchmarks:
     name: Cuda benchmarks (${{ inputs.profile }})
     needs: [ prepare-matrix, setup-instance ]
@@ -154,6 +163,7 @@ jobs:
           token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
 
       - name: Setup Hyperstack dependencies
+        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
         uses: ./.github/actions/gpu_setup
         with:
           cuda-version: ${{ matrix.cuda }}
@@ -230,7 +240,7 @@ jobs:
 
   teardown-instance:
     name: Teardown instance (cuda-${{ inputs.profile }}-benchmarks)
-    if: ${{ always() && needs.setup-instance.result == 'success' }}
+    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
     needs: [ setup-instance, cuda-benchmarks, slack-notify ]
     runs-on: ubuntu-latest
     steps:

diff --git a/.github/workflows/gpu_fast_h100_tests.yml b/.github/workflows/gpu_fast_h100_tests.yml
@@ -68,11 +68,13 @@ jobs:
       (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
     runs-on: ubuntu-latest
     outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.use-permanent-instance.outputs.runner_group || steps.start-github-instance.outputs.runner_group }}
+      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
     steps:
       - name: Start remote instance
         id: start-remote-instance
         if: env.SECRETS_AVAILABLE == 'true'
+        continue-on-error: true
         uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
         with:
           mode: start
@@ -82,6 +84,13 @@ jobs:
           backend: hyperstack
           profile: single-h100
 
+      # Fallback: if starting the on-demand Hyperstack instance failed, expose
+      # the permanent runner label 'h100x1' through this step's runner_group
+      # output. The start step uses continue-on-error, so its raw outcome is
+      # inspected rather than the job status.
+      - name: Use permanent remote instance
+        id: use-permanent-instance
+        if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
+        run: |
+          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
+
       # This instance will be spawned especially for pull-request from forked repository
       - name: Start GitHub instance
         id: start-github-instance
@@ -114,6 +123,7 @@ jobs:
           token: ${{ env.CHECKOUT_TOKEN }}
 
       - name: Setup Hyperstack dependencies
+        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
         uses: ./.github/actions/gpu_setup
         with:
           cuda-version: ${{ matrix.cuda }}
@@ -159,7 +169,7 @@ jobs:
 
   teardown-instance:
     name: Teardown instance (cuda-h100-tests)
-    if: ${{ always() && needs.setup-instance.result == 'success' }}
+    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
     needs: [ setup-instance, cuda-tests-linux ]
     runs-on: ubuntu-latest
     steps:

diff --git a/.github/workflows/gpu_full_h100_tests.yml b/.github/workflows/gpu_full_h100_tests.yml
@@ -20,10 +20,12 @@ jobs:
     name: Setup instance (cuda-h100-tests)
     runs-on: ubuntu-latest
     outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
+      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.use-permanent-instance.outputs.runner_group }}
+      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
     steps:
-      - name: Start instance
-        id: start-instance
+      - name: Start remote instance
+        id: start-remote-instance
+        continue-on-error: true
         uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
         with:
           mode: start
@@ -33,6 +35,13 @@ jobs:
           backend: hyperstack
           profile: single-h100
 
+      # Fallback: when the on-demand Hyperstack instance could not be started
+      # (e.g. H100 shortage), expose the permanent runner label 'h100x1' so that
+      # the runner-name job output resolves to it.
+      # The start step sets continue-on-error, so the job status never becomes
+      # "failure" and failure() would always be false here; check the step's
+      # raw outcome instead.
+      # NOTE(review): confirm SECRETS_AVAILABLE is defined in this workflow's env;
+      # if it is not, this condition is always false.
+      - name: Use permanent remote instance
+        id: use-permanent-instance
+        if: ${{ env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure' }}
+        run: |
+          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
+
   cuda-tests-linux:
     name: CUDA H100 tests
     needs: [ setup-instance ]
@@ -68,6 +77,7 @@ jobs:
           token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
 
       - name: Setup Hyperstack dependencies
+        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
         uses: ./.github/actions/gpu_setup
         with:
           cuda-version: ${{ matrix.cuda }}
@@ -109,6 +119,7 @@ jobs:
 
   teardown-instance:
     name: Teardown instance (cuda-h100-tests)
+    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
     needs: [ setup-instance, cuda-tests-linux ]
     runs-on: ubuntu-latest
     steps:

diff --git a/.github/workflows/gpu_signed_integer_h100_tests.yml b/.github/workflows/gpu_signed_integer_h100_tests.yml
@@ -69,11 +69,13 @@ jobs:
       (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
     runs-on: ubuntu-latest
     outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.use-permanent-instance.outputs.runner_group || steps.start-github-instance.outputs.runner_group }}
+      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
     steps:
       - name: Start remote instance
         id: start-remote-instance
         if: env.SECRETS_AVAILABLE == 'true'
+        continue-on-error: true
         uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
         with:
           mode: start
@@ -83,6 +85,13 @@ jobs:
           backend: hyperstack
           profile: single-h100
 
+      # Fallback: when the on-demand Hyperstack instance could not be started
+      # (e.g. H100 shortage), expose the permanent runner label 'h100x1' so that
+      # the runner-name job output resolves to it.
+      # The start step sets continue-on-error, so the job status never becomes
+      # "failure" and failure() would always be false here; check the step's
+      # raw outcome instead. When the start step is skipped (secrets
+      # unavailable) its outcome is 'skipped', so this step stays skipped too.
+      - name: Use permanent remote instance
+        id: use-permanent-instance
+        if: ${{ env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure' }}
+        run: |
+          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
+
       # This instance will be spawned especially for pull-request from forked repository
       - name: Start GitHub instance
         id: start-github-instance
@@ -115,6 +124,7 @@ jobs:
           token: ${{ env.CHECKOUT_TOKEN }}
 
       - name: Setup Hyperstack dependencies
+        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
         uses: ./.github/actions/gpu_setup
         with:
           cuda-version: ${{ matrix.cuda }}
@@ -146,7 +156,7 @@ jobs:
 
   teardown-instance:
     name: Teardown instance (cuda-h100-tests)
-    if: ${{ always() && needs.setup-instance.result == 'success' }}
+    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
     needs: [ setup-instance, cuda-tests-linux ]
     runs-on: ubuntu-latest
     steps:

diff --git a/.github/workflows/gpu_unsigned_integer_h100_tests.yml b/.github/workflows/gpu_unsigned_integer_h100_tests.yml
@@ -68,11 +68,13 @@ jobs:
       (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
     runs-on: ubuntu-latest
     outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.use-permanent-instance.outputs.runner_group || steps.start-github-instance.outputs.runner_group }}
+      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
     steps:
       - name: Start remote instance
         id: start-remote-instance
         if: env.SECRETS_AVAILABLE == 'true'
+        continue-on-error: true
         uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
         with:
           mode: start
@@ -82,6 +84,13 @@ jobs:
           backend: hyperstack
           profile: single-h100
 
+      # Fallback: when the on-demand Hyperstack instance could not be started
+      # (e.g. H100 shortage), expose the permanent runner label 'h100x1' so that
+      # the runner-name job output resolves to it.
+      # The start step sets continue-on-error, so the job status never becomes
+      # "failure" and failure() would always be false here; check the step's
+      # raw outcome instead. When the start step is skipped (secrets
+      # unavailable) its outcome is 'skipped', so this step stays skipped too.
+      - name: Use permanent remote instance
+        id: use-permanent-instance
+        if: ${{ env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure' }}
+        run: |
+          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
+
       # This instance will be spawned especially for pull-request from forked repository
       - name: Start GitHub instance
         id: start-github-instance
@@ -114,6 +123,7 @@ jobs:
           token: ${{ env.CHECKOUT_TOKEN }}
 
       - name: Setup Hyperstack dependencies
+        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
         uses: ./.github/actions/gpu_setup
         with:
           cuda-version: ${{ matrix.cuda }}
@@ -145,7 +155,7 @@ jobs:
 
   teardown-instance:
     name: Teardown instance (cuda-h100-tests)
-    if: ${{ always() && needs.setup-instance.result == 'success' }}
+    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
     needs: [ setup-instance, cuda-tests-linux ]
     runs-on: ubuntu-latest
     steps: