Skip to content

Commit

Permalink
chore(ci): fallback on permanent h100 instance on shortage
Browse files Browse the repository at this point in the history
When a shortage occurs on n3-H100x1 instances on Hyperstack, we'll
fall back on the permanent one registered on GitHub.
This can be done by using 'h100x1' as runner label to run a job on
it.
  • Loading branch information
soonum committed Feb 28, 2025
1 parent a508f4c commit 48fee2a
Show file tree
Hide file tree
Showing 7 changed files with 92 additions and 21 deletions.
18 changes: 14 additions & 4 deletions .github/workflows/benchmark_gpu_core_crypto.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,12 @@ jobs:
if: github.event_name != 'schedule' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.use-permanent-instance.outputs.runner_group }}
remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
steps:
- name: Start instance
id: start-instance
- name: Start remote instance
id: start-remote-instance
continue-on-error: true
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
with:
mode: start
Expand All @@ -36,6 +38,13 @@ jobs:
backend: hyperstack
profile: single-h100

# This will allow to fallback on permanent instances running on Hyperstack.
- name: Use permanent remote instance
id: use-permanent-instance
if: ${{ env.SECRETS_AVAILABLE == 'true' && failure() }}
run: |
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
cuda-core-crypto-benchmarks:
name: Execute GPU core crypto benchmarks
needs: setup-instance
Expand All @@ -57,6 +66,7 @@ jobs:
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

- name: Setup Hyperstack dependencies
if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
uses: ./.github/actions/gpu_setup
with:
cuda-version: ${{ matrix.cuda }}
Expand Down Expand Up @@ -128,7 +138,7 @@ jobs:

teardown-instance:
name: Teardown instance (cuda-integer-full-benchmarks)
if: ${{ always() && needs.setup-instance.result == 'success' }}
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
needs: [ setup-instance, cuda-core-crypto-benchmarks, slack-notify ]
runs-on: ubuntu-latest
steps:
Expand Down
18 changes: 14 additions & 4 deletions .github/workflows/benchmark_gpu_erc20_common.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,12 @@ jobs:
if: github.event_name == 'workflow_dispatch' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.use-permanent-instance.outputs.runner_group }}
remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
steps:
- name: Start instance
id: start-instance
- name: Start remote instance
id: start-remote-instance
continue-on-error: true
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
with:
mode: start
Expand All @@ -63,6 +65,13 @@ jobs:
backend: ${{ inputs.backend }}
profile: ${{ inputs.profile }}

# This will allow to fallback on permanent instances running on Hyperstack.
- name: Use permanent remote instance
id: use-permanent-instance
if: ${{ env.SECRETS_AVAILABLE == 'true' && failure() && inputs.profile == 'single-h100' }}
run: |
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
cuda-erc20-benchmarks:
name: Cuda ERC20 benchmarks (${{ inputs.profile }})
needs: setup-instance
Expand All @@ -84,6 +93,7 @@ jobs:
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

- name: Setup Hyperstack dependencies
if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
uses: ./.github/actions/gpu_setup
with:
cuda-version: ${{ matrix.cuda }}
Expand Down Expand Up @@ -154,7 +164,7 @@ jobs:

teardown-instance:
name: Teardown instance (cuda-erc20-${{ inputs.profile }}-benchmarks)
if: ${{ always() && needs.setup-instance.result == 'success' }}
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
needs: [ setup-instance, cuda-erc20-benchmarks, slack-notify ]
runs-on: ubuntu-latest
steps:
Expand Down
18 changes: 14 additions & 4 deletions .github/workflows/benchmark_gpu_integer_common.yml
Original file line number Diff line number Diff line change
Expand Up @@ -114,10 +114,12 @@ jobs:
needs: prepare-matrix
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.use-permanent-instance.outputs.runner_group }}
remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
steps:
- name: Start instance
id: start-instance
- name: Start remote instance
id: start-remote-instance
continue-on-error: true
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
with:
mode: start
Expand All @@ -127,6 +129,13 @@ jobs:
backend: ${{ inputs.backend }}
profile: ${{ inputs.profile }}

# This will allow to fallback on permanent instances running on Hyperstack.
- name: Use permanent remote instance
id: use-permanent-instance
if: ${{ env.SECRETS_AVAILABLE == 'true' && failure() && inputs.profile == 'single-h100' }}
run: |
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
cuda-benchmarks:
name: Cuda benchmarks (${{ inputs.profile }})
needs: [ prepare-matrix, setup-instance ]
Expand Down Expand Up @@ -154,6 +163,7 @@ jobs:
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

- name: Setup Hyperstack dependencies
if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
uses: ./.github/actions/gpu_setup
with:
cuda-version: ${{ matrix.cuda }}
Expand Down Expand Up @@ -230,7 +240,7 @@ jobs:

teardown-instance:
name: Teardown instance (cuda-${{ inputs.profile }}-benchmarks)
if: ${{ always() && needs.setup-instance.result == 'success' }}
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
needs: [ setup-instance, cuda-benchmarks, slack-notify ]
runs-on: ubuntu-latest
steps:
Expand Down
14 changes: 12 additions & 2 deletions .github/workflows/gpu_fast_h100_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,11 +68,13 @@ jobs:
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.use-permanent-instance.outputs.runner_group || steps.start-github-instance.outputs.runner_group }}
remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
steps:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
continue-on-error: true
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
with:
mode: start
Expand All @@ -82,6 +84,13 @@ jobs:
backend: hyperstack
profile: single-h100

# This will allow to fallback on permanent instances running on Hyperstack.
- name: Use permanent remote instance
id: use-permanent-instance
if: ${{ env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure' }}
run: |
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
# This instance will be spawned especially for pull-request from forked repository
- name: Start GitHub instance
id: start-github-instance
Expand Down Expand Up @@ -114,6 +123,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}

- name: Setup Hyperstack dependencies
if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
uses: ./.github/actions/gpu_setup
with:
cuda-version: ${{ matrix.cuda }}
Expand Down Expand Up @@ -159,7 +169,7 @@ jobs:

teardown-instance:
name: Teardown instance (cuda-h100-tests)
if: ${{ always() && needs.setup-instance.result == 'success' }}
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
steps:
Expand Down
17 changes: 14 additions & 3 deletions .github/workflows/gpu_full_h100_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,12 @@ jobs:
name: Setup instance (cuda-h100-tests)
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.use-permanent-instance.outputs.runner_group }}
remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
steps:
- name: Start instance
id: start-instance
- name: Start remote instance
id: start-remote-instance
continue-on-error: true
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
with:
mode: start
Expand All @@ -33,6 +35,13 @@ jobs:
backend: hyperstack
profile: single-h100

# This will allow to fallback on permanent instances running on Hyperstack.
- name: Use permanent remote instance
id: use-permanent-instance
if: ${{ env.SECRETS_AVAILABLE == 'true' && failure() }}
run: |
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
cuda-tests-linux:
name: CUDA H100 tests
needs: [ setup-instance ]
Expand Down Expand Up @@ -68,6 +77,7 @@ jobs:
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

- name: Setup Hyperstack dependencies
if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
uses: ./.github/actions/gpu_setup
with:
cuda-version: ${{ matrix.cuda }}
Expand Down Expand Up @@ -109,6 +119,7 @@ jobs:

teardown-instance:
name: Teardown instance (cuda-h100-tests)
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
steps:
Expand Down
14 changes: 12 additions & 2 deletions .github/workflows/gpu_signed_integer_h100_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,13 @@ jobs:
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.use-permanent-instance.outputs.runner_group || steps.start-github-instance.outputs.runner_group }}
remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
steps:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
continue-on-error: true
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
with:
mode: start
Expand All @@ -83,6 +85,13 @@ jobs:
backend: hyperstack
profile: single-h100

# This will allow to fallback on permanent instances running on Hyperstack.
- name: Use permanent remote instance
id: use-permanent-instance
if: ${{ env.SECRETS_AVAILABLE == 'true' && failure() }}
run: |
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
# This instance will be spawned especially for pull-request from forked repository
- name: Start GitHub instance
id: start-github-instance
Expand Down Expand Up @@ -115,6 +124,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}

- name: Setup Hyperstack dependencies
if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
uses: ./.github/actions/gpu_setup
with:
cuda-version: ${{ matrix.cuda }}
Expand Down Expand Up @@ -146,7 +156,7 @@ jobs:

teardown-instance:
name: Teardown instance (cuda-h100-tests)
if: ${{ always() && needs.setup-instance.result == 'success' }}
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
steps:
Expand Down
14 changes: 12 additions & 2 deletions .github/workflows/gpu_unsigned_integer_h100_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,11 +68,13 @@ jobs:
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.use-permanent-instance.outputs.runner_group || steps.start-github-instance.outputs.runner_group }}
remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
steps:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
continue-on-error: true
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
with:
mode: start
Expand All @@ -82,6 +84,13 @@ jobs:
backend: hyperstack
profile: single-h100

# This will allow to fallback on permanent instances running on Hyperstack.
- name: Use permanent remote instance
id: use-permanent-instance
if: ${{ env.SECRETS_AVAILABLE == 'true' && failure() }}
run: |
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
# This instance will be spawned especially for pull-request from forked repository
- name: Start GitHub instance
id: start-github-instance
Expand Down Expand Up @@ -114,6 +123,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}

- name: Setup Hyperstack dependencies
if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
uses: ./.github/actions/gpu_setup
with:
cuda-version: ${{ matrix.cuda }}
Expand Down Expand Up @@ -145,7 +155,7 @@ jobs:

teardown-instance:
name: Teardown instance (cuda-h100-tests)
if: ${{ always() && needs.setup-instance.result == 'success' }}
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
steps:
Expand Down

0 comments on commit 48fee2a

Please sign in to comment.