name: Self-hosted runner (scheduled-amd)

# Note: For the AMD CI, we rely on a caller workflow and on the workflow_call event to trigger this
# CI, so that it can run on both MI210 and MI250 without using a matrix here, which would push us
# towards the limit of allowed jobs on GitHub Actions.
on:
  workflow_call:
    inputs:
      job:
        required: true
        type: string
      slack_report_channel:
        required: true
        type: string
      runner:
        required: true
        type: string
      docker:
        required: true
        type: string
      ci_event:
        required: true
        type: string
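
# For reference, a caller workflow would invoke this reusable workflow roughly
# as in the sketch below (path and input values are illustrative, not the real
# caller's configuration):
#
#   jobs:
#     model-ci-mi210:
#       uses: ./.github/workflows/self-scheduled-amd.yml  # illustrative path
#       with:
#         job: run_models_gpu
#         slack_report_channel: "#transformers-ci-daily-amd"
#         runner: mi210
#         docker: huggingface/transformers-pytorch-amd-gpu
#         ci_event: Scheduled CI (AMD) - mi210
#       secrets: inherit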

env:
  HF_HOME: /mnt/cache
  TRANSFORMERS_IS_CI: yes
  OMP_NUM_THREADS: 8
  MKL_NUM_THREADS: 8
  RUN_SLOW: yes
  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
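  # NUM_SLICES controls how many slices utils/split_model_tests.py splits the
  # model test folders into; each slice becomes one `run_models_gpu` matrix
  # entry per machine type.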
  NUM_SLICES: 2

# Important note: each test job (run_models_gpu, run_pipelines_torch_gpu, run_examples_gpu,
# run_torch_cuda_extensions_gpu) waits for the jobs it depends on before running.
# This is done to avoid parallelizing the scheduled tests, leaving runners
# available for the push CI that runs on the same machines.
jobs:
  check_runner_status:
    name: Check Runner Status
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout transformers
        uses: actions/checkout@v4
        with:
          fetch-depth: 2
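      # Fail early if any of the target self-hosted runners is offline; the
      # token is presumably used to query runner status via the GitHub API.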
      - name: Check Runner Status
        run: python utils/check_self_hosted_runner.py --target_runners hf-amd-mi210-ci-1gpu-1,hf-amd-mi250-ci-1gpu-1,hf-amd-mi300-ci-1gpu-1 --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}

  check_runners:
    name: Check Runners
    needs: check_runner_status
    strategy:
      matrix:
        machine_type: [single-gpu, multi-gpu]
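    # All of these labels must match for a runner to pick up the job: the
    # machine type (single-gpu or multi-gpu), the self-hosted and amd-gpu
    # pools, and the runner family passed in by the caller (e.g. mi210).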
    runs-on:
      [
        "${{ matrix.machine_type }}",
        self-hosted,
        amd-gpu,
        "${{ inputs.runner }}",
      ]
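    # /dev/kfd (the ROCm compute driver) and /dev/dri (the GPU render nodes)
    # must be passed through for the container to see the AMD GPUs; the volume
    # mount shares the host's Hugging Face cache (HF_HOME) with the container.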
    container:
      image: huggingface/transformers-pytorch-amd-gpu
      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: ROCM-SMI
        run: |
          rocm-smi
      - name: ROCM-INFO
        run: |
          rocminfo | grep "Agent" -A 14
      - name: Show ROCR environment
        run: |
          echo "ROCR: $ROCR_VISIBLE_DEVICES"

  setup:
    if: contains(fromJSON('["run_models_gpu"]'), inputs.job)
    name: Setup
    needs: check_runners
    strategy:
      matrix:
        machine_type: [single-gpu, multi-gpu]
    runs-on:
      [
        "${{ matrix.machine_type }}",
        self-hosted,
        amd-gpu,
        "${{ inputs.runner }}",
      ]
    container:
      image: huggingface/transformers-pytorch-amd-gpu
      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    outputs:
      folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
      slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
    steps:
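      # The Docker image ships with a clone of transformers at /transformers;
      # update it to the exact commit that triggered this workflow.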
      - name: Update clone
        working-directory: /transformers
        run: |
          git fetch && git checkout ${{ github.sha }}
      - name: Cleanup
        working-directory: /transformers
        run: |
          rm -rf tests/__pycache__
          rm -rf tests/models/__pycache__
          rm -rf reports
      - name: Show installed libraries and their versions
        working-directory: /transformers
        run: pip freeze
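      # `folder_slices` is the JSON list of model test folder slices produced
      # by utils/split_model_tests.py: with NUM_SLICES=2, something like
      # [["models/albert", ...], ["models/mbart", ...]] (folder names here are
      # illustrative). `slice_ids` is simply [0, 1].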
      - id: set-matrix
        name: Identify models to test
        working-directory: /transformers/tests
        run: |
          echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
          echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
      - name: ROCM-SMI
        run: |
          rocm-smi
      - name: ROCM-INFO
        run: |
          rocminfo | grep "Agent" -A 14
      - name: Show ROCR environment
        run: |
          echo "ROCR: $ROCR_VISIBLE_DEVICES"
      - name: Environment
        working-directory: /transformers
        run: |
          python3 utils/print_env.py

  run_models_gpu:
    if: ${{ inputs.job == 'run_models_gpu' }}
    name: Model tests
    needs: setup
    strategy:
      max-parallel: 1 # Do not parallelize for now; this can change later if it works well.
      fail-fast: false
      matrix:
        machine_type: [single-gpu, multi-gpu]
        slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
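    # Each (machine_type, slice_id) combination calls the reusable model-jobs
    # workflow below, which runs the model tests for the folders in its slice.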
    uses: ./.github/workflows/transformers_amd_model_jobs.yaml
    with:
      folder_slices: ${{ needs.setup.outputs.folder_slices }}
      machine_type: ${{ matrix.machine_type }}
      slice_id: ${{ matrix.slice_id }}
      runner: ${{ inputs.runner }}
      docker: ${{ inputs.docker }}
    secrets: inherit

  run_pipelines_torch_gpu:
    if: ${{ inputs.job == 'run_pipelines_torch_gpu' }}
    name: PyTorch pipelines
    needs: check_runners
    strategy:
      fail-fast: false
      matrix:
        machine_type: [single-gpu, multi-gpu]
    runs-on:
      [
        "${{ matrix.machine_type }}",
        self-hosted,
        amd-gpu,
        "${{ inputs.runner }}",
      ]
    container:
      image: ${{ inputs.docker }}
      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Update clone
        working-directory: /transformers
        run: git fetch && git checkout ${{ github.sha }}
      - name: Reinstall transformers in editable mode (remove the one installed during the docker image build)
        working-directory: /transformers
        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
      - name: ROCM-SMI
        run: |
          rocm-smi
      - name: ROCM-INFO
        run: |
          rocminfo | grep "Agent" -A 14
      - name: Show ROCR environment
        run: |
          echo "ROCR: $ROCR_VISIBLE_DEVICES"
      - name: Environment
        working-directory: /transformers
        run: |
          python3 utils/print_env.py
      - name: Show installed libraries and their versions
        working-directory: /transformers
        run: pip freeze
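      # -n 1 --dist=loadfile: a single pytest-xdist worker, grouping tests by
      # file. --make-reports is a custom pytest option (defined in transformers'
      # conftest.py) that writes the report files collected below.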
      - name: Run all pipeline tests on GPU
        working-directory: /transformers
        run: |
          python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines -m "not not_device_test"
      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: cat /transformers/reports/${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports/failures_short.txt
      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports"
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports
          path: /transformers/reports/${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports

  run_examples_gpu:
    if: ${{ inputs.job == 'run_examples_gpu' }}
    name: Examples directory
    needs: check_runners
    strategy:
      fail-fast: false
      matrix:
        machine_type: [single-gpu]
    runs-on:
      [
        "${{ matrix.machine_type }}",
        self-hosted,
        amd-gpu,
        "${{ inputs.runner }}",
      ]
    container:
      image: ${{ inputs.docker }}
      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Update clone
        working-directory: /transformers
        run: git fetch && git checkout ${{ github.sha }}
      - name: Reinstall transformers in editable mode (remove the one installed during the docker image build)
        working-directory: /transformers
        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
      - name: ROCM-SMI
        run: |
          rocm-smi
      - name: ROCM-INFO
        run: |
          rocminfo | grep "Agent" -A 14
      - name: Show ROCR environment
        run: |
          echo "ROCR: $ROCR_VISIBLE_DEVICES"
      - name: Environment
        working-directory: /transformers
        run: |
          python3 utils/print_env.py
      - name: Show installed libraries and their versions
        working-directory: /transformers
        run: pip freeze
      - name: Run examples tests on GPU
        working-directory: /transformers
        run: |
          pip install -r examples/pytorch/_tests_requirements.txt
          python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_examples_gpu_test_reports examples/pytorch -m "not not_device_test"
      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: cat /transformers/reports/${{ matrix.machine_type }}_run_examples_gpu_test_reports/failures_short.txt
      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_examples_gpu_test_reports"
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.machine_type }}_run_examples_gpu_test_reports
          path: /transformers/reports/${{ matrix.machine_type }}_run_examples_gpu_test_reports
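
  # Note: the job id below keeps `cuda` in its name (presumably for parity with
  # the NVIDIA CI and the Slack report), but on AMD it runs the DeepSpeed and
  # extended tests against ROCm.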
  run_torch_cuda_extensions_gpu:
    if: ${{ inputs.job == 'run_torch_cuda_extensions_gpu' }}
    name: Torch ROCm DeepSpeed tests
    needs: check_runners
    strategy:
      fail-fast: false
      matrix:
        machine_type: [single-gpu, multi-gpu]
    runs-on:
      [
        "${{ matrix.machine_type }}",
        self-hosted,
        amd-gpu,
        "${{ inputs.runner }}",
      ]
    container:
      image: ${{ inputs.docker }}
      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Update clone
        working-directory: /transformers
        run: git fetch && git checkout ${{ github.sha }}
      - name: Reinstall transformers in editable mode (remove the one installed during the docker image build)
        working-directory: /transformers
        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
      - name: ROCM-SMI
        run: |
          rocm-smi
      - name: ROCM-INFO
        run: |
          rocminfo | grep "Agent" -A 14
      - name: Show ROCR environment
        run: |
          echo "ROCR: $ROCR_VISIBLE_DEVICES"
      - name: Environment
        working-directory: /transformers
        run: |
          python3 utils/print_env.py
      - name: Show installed libraries and their versions
        working-directory: /transformers
        run: pip freeze
      - name: Run all tests on GPU
        working-directory: /transformers
        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended -m "not not_device_test"
      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: cat /transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt
      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports"
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
          path: /transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports

  send_results:
    name: Slack Report
    needs:
      [
        check_runner_status,
        check_runners,
        setup,
        run_models_gpu,
        run_pipelines_torch_gpu,
        run_examples_gpu,
        run_torch_cuda_extensions_gpu,
      ]
    if: ${{ always() }}
    # A reusable workflow referenced by full path must pin a ref (branch, tag, or SHA).
    uses: huggingface/transformers/.github/workflows/slack-report.yml@main
    with:
      job: ${{ inputs.job }}
      # This would be `skipped` if `setup` is skipped.
      setup_status: ${{ needs.setup.result }}
      slack_report_channel: ${{ inputs.slack_report_channel }}
      # This would be an empty string if `setup` is skipped.
      folder_slices: ${{ needs.setup.outputs.folder_slices }}
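      # `setup` does not define a `quantization_matrix` output in this
      # workflow, so this resolves to an empty string.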
      quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }}
      ci_event: ${{ inputs.ci_event }}
    secrets: inherit