Skip to content

Commit

Permalink
chore(ci): fallback on permanent h100 instance on shortage
Browse files Browse the repository at this point in the history
When a shortage occurs on n3-H100x1 instances on Hyperstack, we'll
fall back on the permanent one registered on GitHub.
This can be done by using 'h100x1' as the runner label of the job that
should run on it.
  • Loading branch information
soonum committed Feb 28, 2025
1 parent a508f4c commit 1bbfdb0
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 4 deletions.
9 changes: 8 additions & 1 deletion .github/workflows/gpu_fast_h100_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ jobs:
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.use-permanent-instance.outputs.runner_group || steps.start-github-instance.outputs.runner_group }}
steps:
- name: Start remote instance
id: start-remote-instance
Expand All @@ -82,6 +82,13 @@ jobs:
backend: hyperstack
profile: single-h100

# This allows falling back on permanent instances running on Hyperstack.
- name: Use permanent remote instance
id: use-permanent-instance
if: ${{ env.SECRETS_AVAILABLE == 'true' && failure() }}
run: |
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
# This instance is spawned specifically for pull requests from forked repositories.
- name: Start GitHub instance
id: start-github-instance
Expand Down
9 changes: 8 additions & 1 deletion .github/workflows/gpu_full_h100_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
name: Setup instance (cuda-h100-tests)
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
runner-name: ${{ steps.start-instance.outputs.label || steps.use-permanent-instance.outputs.runner_group }}
steps:
- name: Start instance
id: start-instance
Expand All @@ -33,6 +33,13 @@ jobs:
backend: hyperstack
profile: single-h100

# This allows falling back on permanent instances running on Hyperstack.
- name: Use permanent remote instance
id: use-permanent-instance
if: ${{ env.SECRETS_AVAILABLE == 'true' && failure() }}
run: |
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
cuda-tests-linux:
name: CUDA H100 tests
needs: [ setup-instance ]
Expand Down
9 changes: 8 additions & 1 deletion .github/workflows/gpu_signed_integer_h100_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ jobs:
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.use-permanent-instance.outputs.runner_group || steps.start-github-instance.outputs.runner_group }}
steps:
- name: Start remote instance
id: start-remote-instance
Expand All @@ -83,6 +83,13 @@ jobs:
backend: hyperstack
profile: single-h100

# This allows falling back on permanent instances running on Hyperstack.
- name: Use permanent remote instance
id: use-permanent-instance
if: ${{ env.SECRETS_AVAILABLE == 'true' && failure() }}
run: |
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
# This instance is spawned specifically for pull requests from forked repositories.
- name: Start GitHub instance
id: start-github-instance
Expand Down
9 changes: 8 additions & 1 deletion .github/workflows/gpu_unsigned_integer_h100_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ jobs:
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.use-permanent-instance.outputs.runner_group || steps.start-github-instance.outputs.runner_group }}
steps:
- name: Start remote instance
id: start-remote-instance
Expand All @@ -82,6 +82,13 @@ jobs:
backend: hyperstack
profile: single-h100

# This allows falling back on permanent instances running on Hyperstack.
- name: Use permanent remote instance
id: use-permanent-instance
if: ${{ env.SECRETS_AVAILABLE == 'true' && failure() }}
run: |
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
# This instance is spawned specifically for pull requests from forked repositories.
- name: Start GitHub instance
id: start-github-instance
Expand Down

0 comments on commit 1bbfdb0

Please sign in to comment.