Training and Validation e2e with Single Server

Training and Validation e2e with Single Server #115

Workflow file for this run

.github/workflows/equinix_metal_e2e.yml at 4d3760b

	# Manually triggered end-to-end run on a single server: create a runner,
	# run the setup playbooks and the train action, validate each requested
	# model, commit the resulting reports under docs/, then clean up the runner.
	name: Training and Validation e2e with Single Server

	on:
	  workflow_dispatch:
	    inputs:
	      target_models:
	        description: 'Comma Separated List of Models to Train. Format: model_type/feature_type/trainer_name'
	        required: false
	        default: 'AbsPower/BPFOnly/SGDRegressorTrainer,AbsPower/BPFOnly/ExponentialRegressionTrainer,AbsPower/BPFOnly/LogarithmicRegressionTrainer,AbsPower/BPFOnly/PolynomialRegressionTrainer,AbsPower/BPFOnly/XgboostFitTrainer'
	      model_server_image:
	        description: 'Model Server Image to use for validation'
	        required: false
	        default: 'quay.io/sustainable_computing_io/kepler_model_server:latest'

	# Write access is requested because the validation step commits and pushes
	# the generated reports.
	permissions:
	  pull-requests: write
	  contents: write
	  repository-projects: write
	  packages: write

	jobs:
	  # Delegates runner creation to a reusable workflow in this repository.
	  Create-runner:
	    name: "Create Runner"
	    uses: ./.github/workflows/create_equinix_runner.yml
	    secrets: inherit

	  Validate:
	    name: "Validate"
	    needs: Create-runner
	    runs-on: self-hosted
	    # NOTE(review): presumably set so that Cleanup still runs when this job
	    # fails - confirm.
	    continue-on-error: true
	    outputs:
	      # Consumed by the Cleanup job below.
	      runner-name: ${{ runner.name }}

	    steps:
	      - name: Checkout code
	        uses: actions/checkout@v4

	      - name: Run Setup Runner Action
	        uses: ./.github/actions/setup-action

	      # Runs the Ansible playbooks in order: VM, SSH tunnel, Prometheus,
	      # node exporter, Kepler, SSH tunnel again, model server. The three
	      # node-exporter checks end in `|| true`, so they never fail the step.
	      - name: Run Setup Playbooks
	        env:
	          MODEL_SERVER_IMAGE: ${{ github.event.inputs.model_server_image }}
	        run: |
	          cd ${GITHUB_WORKSPACE}/ansible
	          echo "Create VM"
	          ansible-playbook -i inventory.yml kvm_playbook.yml
	          echo "Install SSH tunnel"
	          ansible-playbook ssh_tunnel_playbook.yml
	          echo "Install Prometheus"
	          ansible-playbook -i inventory.yml metrics_playbook.yml
	          echo "Install Node Exporter"
	          ansible-playbook -i inventory.yml node_exporter_playbook.yml -vvv
	          echo "Verify node-exporter"
	          sudo systemctl status node_exporter || true
	          sudo ss -tuln | grep 9100 || true
	          curl -s localhost:9100/metrics | grep collector || true
	          echo "Install Kepler"
	          ansible-playbook -i inventory.yml -vvv kepler_playbook.yml
	          echo "Create ssh tunnel"
	          ansible-playbook -i inventory.yml ssh_tunnel_playbook.yml

	          echo "Install Model Server"
	          ansible-playbook -i inventory.yml -vvv model_server_playbook.yml

	      - name: Run Trainer Action
	        uses: ./.github/actions/train-action
	        with:
	          model_export_path: /tmp/trained-equinix-models
	          total_runtime_seconds: 300

	      - name: Checkout code
	        uses: actions/checkout@v4

	      # For every model in the comma-separated target_models input: restart
	      # the model server with that model's URL, run the validator playbook,
	      # move the validator output under docs/train-validate-e2e/ and append a
	      # row to docs/kepler-model-train-validate.md. Finally commit and push.
	      #
	      # The step carries a single `env` mapping. GITHUB_TOKEN used to sit in a
	      # second `env` key after `run`, which GitHub rejected as an invalid
	      # workflow file: "(Line: 120, Col: 9): 'env' is already defined".
	      #
	      # target_models is passed through TARGET_MODELS instead of being
	      # expanded into the script text, so its content is never parsed as shell.
	      #
	      # NOTE(review): VALIDATOTR_CURVE_TYPE looks like a typo of
	      # VALIDATOR_CURVE_TYPE; left unchanged until its consumer is confirmed.
	      # NOTE(review): FILE is the literal /tmp/validator-/ although KEPLER_TAG
	      # is derived from /tmp/validator-*; confirm whether ${KEPLER_TAG} was
	      # meant to be part of the path.
	      - name: Run Validation Playbooks
	        env:
	          TOTAL_RUNTIME_SECONDS: 300
	          VALIDATOTR_CURVE_TYPE: "default"
	          TARGET_MODELS: ${{ github.event.inputs.target_models }}
	          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
	        run: |
	          cd ${GITHUB_WORKSPACE}/ansible
	          echo "Pass Trained Models to VM"
	          ansible-playbook -i inventory.yml -v deploy_http_model_server.yml

	          export DATE_STR=$(date +%Y-%m-%d_%H-%M-%S)
	          export DATE_STRING=$(date +%Y-%m-%d)
	          mkdir -p ${GITHUB_WORKSPACE}/docs/train-validate-e2e/${DATE_STR}

	          target_models_list="${TARGET_MODELS}"
	          IFS=',' read -r -a models <<< "$target_models_list"
	          for model in "${models[@]}"
	          do
	            echo "Running Model Server Playbook: $model"
	            model_url="http://localhost:8080/${model}_0.zip"
	            echo "Model exists with sufficient accuracy: $model_url"
	            ansible-playbook -i inventory.yml -vvv model_server_restart.yml \
	              -e "node_components_init_url=$model_url"
	            echo "Run validation test"
	            ansible-playbook -vvv kepler_validator.yml
	            echo "Validation Finished"

	            export KEPLER_TAG=$(ls -d /tmp/validator-* |tail -1 | sed 's/.*validator-//g')

	            FILE="/tmp/validator-/"
	            mkdir -p ${GITHUB_WORKSPACE}/docs/train-validate-e2e/${DATE_STR}/${model}
	            mv $FILE ${GITHUB_WORKSPACE}/docs/train-validate-e2e/${DATE_STR}/${model}

	            echo "| " ${DATE_STRING} " | " ${model}-equinix " | [Report](train-validate-e2e/${DATE_STR}/${model}/report-${KEPLER_TAG}.md) |" \
	              >> ${GITHUB_WORKSPACE}/docs/kepler-model-train-validate.md

	          done

	          cd ${GITHUB_WORKSPACE}
	          git config user.email "dependabot[bot]@users.noreply.github.com"
	          git config user.name "dependabot[bot]"
	          git add docs/*
	          git commit -m "Add train-validate-e2e single server for ${DATE_STR}" -s
	          git pull --rebase
	          git push

	  # Hands the runner name reported by Validate to the clean-up workflow.
	  Cleanup:
	    name: "Cleanup"
	    needs: [Validate]
	    uses: ./.github/workflows/clean_equinix_runner.yml
	    secrets: inherit
	    with:
	      runner_name: ${{ needs.Validate.outputs.runner-name }}

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Training and Validation e2e with Single Server #115

Workflow file

Training and Validation e2e with Single Server #115

Jobs

Run details

Workflow file for this run

GitHub Actions / Training and Validation e2e with Single Server