name: Self-hosted runner (scheduled-amd)

# Note: For the AMD CI, we rely on a caller workflow and on the workflow_call event to trigger this
# CI, so that it can run on both MI210 and MI250 without using a matrix here, which would push us
# towards the limit of allowed jobs on GitHub Actions.
on:
  workflow_call:
    inputs:
      job:
        required: true
        type: string
      slack_report_channel:
        required: true
        type: string
      runner:
        required: true
        type: string
      docker:
        required: true
        type: string
      ci_event:
        required: true
        type: string
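
# For reference, a caller workflow would invoke this reusable workflow roughly
# as in the sketch below (path and input values are illustrative, not the real
# caller's configuration):
#
#   jobs:
#     model-ci-mi210:
#       uses: ./.github/workflows/self-scheduled-amd.yml  # illustrative path
#       with:
#         job: run_models_gpu
#         slack_report_channel: "#transformers-ci-daily-amd"
#         runner: mi210
#         docker: huggingface/transformers-pytorch-amd-gpu
#         ci_event: Scheduled CI (AMD) - mi210
#       secrets: inherit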

env:
  HF_HOME: /mnt/cache
  TRANSFORMERS_IS_CI: yes
  OMP_NUM_THREADS: 8
  MKL_NUM_THREADS: 8
  RUN_SLOW: yes
  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
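  # NUM_SLICES controls how many slices utils/split_model_tests.py splits the
  # model test folders into; each slice becomes one `run_models_gpu` matrix
  # entry per machine type.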
  NUM_SLICES: 2

# Important note: each test job (run_models_gpu, run_pipelines_torch_gpu, run_examples_gpu,
# run_torch_cuda_extensions_gpu) waits for the jobs it depends on before running.
# This is done to avoid parallelizing the scheduled tests, leaving runners
# available for the push CI that runs on the same machines.
jobs:
  check_runner_status:
    name: Check Runner Status
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout transformers
        uses: actions/checkout@v4
        with:
          fetch-depth: 2
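      # Fail early if any of the target self-hosted runners is offline; the
      # token is presumably used to query runner status via the GitHub API.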
      - name: Check Runner Status
        run: python utils/check_self_hosted_runner.py --target_runners hf-amd-mi210-ci-1gpu-1,hf-amd-mi250-ci-1gpu-1,hf-amd-mi300-ci-1gpu-1 --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}

  check_runners:
    name: Check Runners
    needs: check_runner_status
    strategy:
      matrix:
        machine_type: [single-gpu, multi-gpu]
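    # All of these labels must match for a runner to pick up the job: the
    # machine type (single-gpu or multi-gpu), the self-hosted and amd-gpu
    # pools, and the runner family passed in by the caller (e.g. mi210).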
    runs-on:
      [
        "${{ matrix.machine_type }}",
        self-hosted,
        amd-gpu,
        "${{ inputs.runner }}",
      ]
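    # /dev/kfd (the ROCm compute driver) and /dev/dri (the GPU render nodes)
    # must be passed through for the container to see the AMD GPUs; the volume
    # mount shares the host's Hugging Face cache (HF_HOME) with the container.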
    container:
      image: huggingface/transformers-pytorch-amd-gpu
      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: ROCM-SMI
        run: |
          rocm-smi
      - name: ROCM-INFO
        run: |
          rocminfo | grep "Agent" -A 14
      - name: Show ROCR environment
        run: |
          echo "ROCR: $ROCR_VISIBLE_DEVICES"

  setup:
    if: contains(fromJSON('["run_models_gpu"]'), inputs.job)
    name: Setup
    needs: check_runners
    strategy:
      matrix:
        machine_type: [single-gpu, multi-gpu]
    runs-on:
      [
        "${{ matrix.machine_type }}",
        self-hosted,
        amd-gpu,
        "${{ inputs.runner }}",
      ]
    container:
      image: huggingface/transformers-pytorch-amd-gpu
      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    outputs:
      folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
      slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
    steps:
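      # The Docker image ships with a clone of transformers at /transformers;
      # update it to the exact commit that triggered this workflow.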
      - name: Update clone
        working-directory: /transformers
        run: |
          git fetch && git checkout ${{ github.sha }}
      - name: Cleanup
        working-directory: /transformers
        run: |
          rm -rf tests/__pycache__
          rm -rf tests/models/__pycache__
          rm -rf reports
      - name: Show installed libraries and their versions
        working-directory: /transformers
        run: pip freeze
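      # `folder_slices` is the JSON list of model test folder slices produced
      # by utils/split_model_tests.py: with NUM_SLICES=2, something like
      # [["models/albert", ...], ["models/mbart", ...]] (folder names here are
      # illustrative). `slice_ids` is simply [0, 1].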
      - id: set-matrix
        name: Identify models to test
        working-directory: /transformers/tests
        run: |
          echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
          echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
      - name: ROCM-SMI
        run: |
          rocm-smi
      - name: ROCM-INFO
        run: |
          rocminfo | grep "Agent" -A 14
      - name: Show ROCR environment
        run: |
          echo "ROCR: $ROCR_VISIBLE_DEVICES"
      - name: Environment
        working-directory: /transformers
        run: |
          python3 utils/print_env.py

  run_models_gpu:
    if: ${{ inputs.job == 'run_models_gpu' }}
    name: Model tests
    needs: setup
    strategy:
      max-parallel: 1 # Do not parallelize for now; this can change later if it works well.
      fail-fast: false
      matrix:
        machine_type: [single-gpu, multi-gpu]
        slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
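    # Each (machine_type, slice_id) combination calls the reusable model-jobs
    # workflow below, which runs the model tests for the folders in its slice.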
    uses: ./.github/workflows/transformers_amd_model_jobs.yaml
    with:
      folder_slices: ${{ needs.setup.outputs.folder_slices }}
      machine_type: ${{ matrix.machine_type }}
      slice_id: ${{ matrix.slice_id }}
      runner: ${{ inputs.runner }}
      docker: ${{ inputs.docker }}
    secrets: inherit

  run_pipelines_torch_gpu:
    if: ${{ inputs.job == 'run_pipelines_torch_gpu' }}
    name: PyTorch pipelines
    needs: check_runners
    strategy:
      fail-fast: false
      matrix:
        machine_type: [single-gpu, multi-gpu]
    runs-on:
      [
        "${{ matrix.machine_type }}",
        self-hosted,
        amd-gpu,
        "${{ inputs.runner }}",
      ]
    container:
      image: ${{ inputs.docker }}
      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Update clone
        working-directory: /transformers
        run: git fetch && git checkout ${{ github.sha }}
      - name: Reinstall transformers in editable mode (remove the one installed during the docker image build)
        working-directory: /transformers
        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
      - name: ROCM-SMI
        run: |
          rocm-smi
      - name: ROCM-INFO
        run: |
          rocminfo | grep "Agent" -A 14
      - name: Show ROCR environment
        run: |
          echo "ROCR: $ROCR_VISIBLE_DEVICES"
      - name: Environment
        working-directory: /transformers
        run: |
          python3 utils/print_env.py
      - name: Show installed libraries and their versions
        working-directory: /transformers
        run: pip freeze
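      # -n 1 --dist=loadfile: a single pytest-xdist worker, grouping tests by
      # file. --make-reports is a custom pytest option (defined in transformers'
      # conftest.py) that writes the report files collected below.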
      - name: Run all pipeline tests on GPU
        working-directory: /transformers
        run: |
          python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines -m "not not_device_test"
      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: cat /transformers/reports/${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports/failures_short.txt
      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports"
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports
          path: /transformers/reports/${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports

  run_examples_gpu:
    if: ${{ inputs.job == 'run_examples_gpu' }}
    name: Examples directory
    needs: check_runners
    strategy:
      fail-fast: false
      matrix:
        machine_type: [single-gpu]
    runs-on:
      [
        "${{ matrix.machine_type }}",
        self-hosted,
        amd-gpu,
        "${{ inputs.runner }}",
      ]
    container:
      image: ${{ inputs.docker }}
      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Update clone
        working-directory: /transformers
        run: git fetch && git checkout ${{ github.sha }}
      - name: Reinstall transformers in editable mode (remove the one installed during the docker image build)
        working-directory: /transformers
        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
      - name: ROCM-SMI
        run: |
          rocm-smi
      - name: ROCM-INFO
        run: |
          rocminfo | grep "Agent" -A 14
      - name: Show ROCR environment
        run: |
          echo "ROCR: $ROCR_VISIBLE_DEVICES"
      - name: Environment
        working-directory: /transformers
        run: |
          python3 utils/print_env.py
      - name: Show installed libraries and their versions
        working-directory: /transformers
        run: pip freeze
      - name: Run examples tests on GPU
        working-directory: /transformers
        run: |
          pip install -r examples/pytorch/_tests_requirements.txt
          python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_examples_gpu_test_reports examples/pytorch -m "not not_device_test"
      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: cat /transformers/reports/${{ matrix.machine_type }}_run_examples_gpu_test_reports/failures_short.txt
      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_examples_gpu_test_reports"
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.machine_type }}_run_examples_gpu_test_reports
          path: /transformers/reports/${{ matrix.machine_type }}_run_examples_gpu_test_reports
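
  # Note: the job id below keeps `cuda` in its name (presumably for parity with
  # the NVIDIA CI and the Slack report), but on AMD it runs the DeepSpeed and
  # extended tests against ROCm.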
  run_torch_cuda_extensions_gpu:
    if: ${{ inputs.job == 'run_torch_cuda_extensions_gpu' }}
    name: Torch ROCm DeepSpeed tests
    needs: check_runners
    strategy:
      fail-fast: false
      matrix:
        machine_type: [single-gpu, multi-gpu]
    runs-on:
      [
        "${{ matrix.machine_type }}",
        self-hosted,
        amd-gpu,
        "${{ inputs.runner }}",
      ]
    container:
      image: ${{ inputs.docker }}
      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Update clone
        working-directory: /transformers
        run: git fetch && git checkout ${{ github.sha }}
      - name: Reinstall transformers in editable mode (remove the one installed during the docker image build)
        working-directory: /transformers
        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
      - name: ROCM-SMI
        run: |
          rocm-smi
      - name: ROCM-INFO
        run: |
          rocminfo | grep "Agent" -A 14
      - name: Show ROCR environment
        run: |
          echo "ROCR: $ROCR_VISIBLE_DEVICES"
      - name: Environment
        working-directory: /transformers
        run: |
          python3 utils/print_env.py
      - name: Show installed libraries and their versions
        working-directory: /transformers
        run: pip freeze
      - name: Run all tests on GPU
        working-directory: /transformers
        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended -m "not not_device_test"
      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: cat /transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt
      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports"
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
          path: /transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports

  send_results:
    name: Slack Report
    needs:
      [
        check_runner_status,
        check_runners,
        setup,
        run_models_gpu,
        run_pipelines_torch_gpu,
        run_examples_gpu,
        run_torch_cuda_extensions_gpu,
      ]
    if: ${{ always() }}
    # A reusable workflow referenced by full path must pin a ref (branch, tag, or SHA).
    uses: huggingface/transformers/.github/workflows/slack-report.yml@main
    with:
      job: ${{ inputs.job }}
      # This would be `skipped` if `setup` is skipped.
      setup_status: ${{ needs.setup.result }}
      slack_report_channel: ${{ inputs.slack_report_channel }}
      # This would be an empty string if `setup` is skipped.
      folder_slices: ${{ needs.setup.outputs.folder_slices }}
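      # `setup` does not define a `quantization_matrix` output in this
      # workflow, so this resolves to an empty string.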
      quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }}
      ci_event: ${{ inputs.ci_event }}
    secrets: inherit