Skip to content

Training and Validation e2e with Single Server #115

Training and Validation e2e with Single Server

Training and Validation e2e with Single Server #115

# Workflow header: display name, manual trigger with its inputs, and the
# token permissions granted to every job in this workflow.
name: Training and Validation e2e with Single Server

on:
  # Started manually only; both inputs are optional and fall back to defaults.
  workflow_dispatch:
    inputs:
      target_models:
        # Each entry has the form model_type/feature_type/trainer_name.
        description: "Comma Separated List of Models to Train. Format: model_type/feature_type/trainer_name"
        required: false
        default: "AbsPower/BPFOnly/SGDRegressorTrainer,AbsPower/BPFOnly/ExponentialRegressionTrainer,AbsPower/BPFOnly/LogarithmicRegressionTrainer,AbsPower/BPFOnly/PolynomialRegressionTrainer,AbsPower/BPFOnly/XgboostFitTrainer"
      model_server_image:
        description: "Model Server Image to use for validation"
        required: false
        default: "quay.io/sustainable_computing_io/kepler_model_server:latest"

# Write scopes for the workflow token.
permissions:
  pull-requests: write
  contents: write
  repository-projects: write
  packages: write
# Three jobs: Create-runner, then Validate (needs Create-runner), then
# Cleanup (needs Validate).
jobs:
  # Calls the reusable create_equinix_runner workflow, passing all secrets on.
  Create-runner:
    name: "Create Runner"
    uses: ./.github/workflows/create_equinix_runner.yml
    secrets: inherit

  # Runs on the self-hosted runner: sets up the environment with Ansible,
  # trains models, validates each requested model, and commits the reports.
  Validate:
    name: "Validate"
    needs: Create-runner
    runs-on: self-hosted
    # A failure in this job does not fail the workflow run.
    continue-on-error: true
    outputs:
      # Exposed so the Cleanup job can refer to the runner this job ran on.
      runner-name: ${{ runner.name }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Run Setup Runner Action
        uses: ./.github/actions/setup-action

      - name: Run Setup Playbooks
        env:
          MODEL_SERVER_IMAGE: ${{ github.event.inputs.model_server_image }}
        run: |
          cd ${GITHUB_WORKSPACE}/ansible
          echo "Create VM"
          ansible-playbook -i inventory.yml kvm_playbook.yml
          echo "Install SSH tunnel"
          ansible-playbook ssh_tunnel_playbook.yml
          echo "Install Prometheus"
          ansible-playbook -i inventory.yml metrics_playbook.yml
          echo "Install Node Exporter"
          ansible-playbook -i inventory.yml node_exporter_playbook.yml -vvv
          echo "Verify node-exporter"
          # Diagnostics only: "|| true" keeps a failed check from failing the step.
          sudo systemctl status node_exporter || true
          sudo ss -tuln | grep 9100 || true
          curl -s localhost:9100/metrics | grep collector || true
          echo "Install Kepler"
          ansible-playbook -i inventory.yml -vvv kepler_playbook.yml
          echo "Create ssh tunnel"
          ansible-playbook -i inventory.yml ssh_tunnel_playbook.yml
          echo "Install Model Server"
          ansible-playbook -i inventory.yml -vvv model_server_playbook.yml

      - name: Run Trainer Action
        uses: ./.github/actions/train-action
        with:
          model_export_path: /tmp/trained-equinix-models
          total_runtime_seconds: 300

      - name: Checkout code
        uses: actions/checkout@v4

      - name: Run Validation Playbooks
        # A step may define `env` only once; GitHub rejects the workflow file
        # with "'env' is already defined" otherwise, so every variable for
        # this step lives in this single mapping.
        env:
          TOTAL_RUNTIME_SECONDS: 300
          # NOTE(review): the name looks like a misspelling of
          # VALIDATOR_CURVE_TYPE. It is kept as-is because the consumer is
          # not visible here; confirm before renaming.
          VALIDATOTR_CURVE_TYPE: "default"
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # The user-supplied input reaches the script through the
          # environment instead of being expanded into the script text, so
          # its content cannot be interpreted as shell code.
          TARGET_MODELS: ${{ github.event.inputs.target_models }}
        run: |
          cd ${GITHUB_WORKSPACE}/ansible
          echo "Pass Trained Models to VM"
          ansible-playbook -i inventory.yml -v deploy_http_model_server.yml
          # DATE_STR names the report directory; DATE_STRING is the date
          # written into the summary table row.
          export DATE_STR=$(date +%Y-%m-%d_%H-%M-%S)
          export DATE_STRING=$(date +%Y-%m-%d)
          mkdir -p ${GITHUB_WORKSPACE}/docs/train-validate-e2e/${DATE_STR}
          target_models_list="${TARGET_MODELS}"
          # Split the comma separated list into the "models" array.
          IFS=',' read -r -a models <<< "$target_models_list"
          for model in "${models[@]}"
          do
            echo "Running Model Server Playbook: $model"
            model_url="http://localhost:8080/${model}_0.zip"
            echo "Model exists with sufficient accuracy: $model_url"
            ansible-playbook -i inventory.yml -vvv model_server_restart.yml \
              -e "node_components_init_url=$model_url"
            echo "Run validation test"
            ansible-playbook -vvv kepler_validator.yml
            echo "Validation Finished"
            # Tag is the suffix of the last /tmp/validator-* directory listed.
            export KEPLER_TAG=$(ls -d /tmp/validator-* |tail -1 | sed 's/.*validator-//g')
            # Left unquoted at use so the shell expands the glob for mv.
            FILE="/tmp/validator-*/*"
            mkdir -p ${GITHUB_WORKSPACE}/docs/train-validate-e2e/${DATE_STR}/${model}
            mv $FILE ${GITHUB_WORKSPACE}/docs/train-validate-e2e/${DATE_STR}/${model}
            # Append one table row per model linking to its report.
            echo "| " ${DATE_STRING} " | " ${model}-equinix " | [Report](train-validate-e2e/${DATE_STR}/${model}/report-${KEPLER_TAG}.md) |" \
              >> ${GITHUB_WORKSPACE}/docs/kepler-model-train-validate.md
          done
          cd ${GITHUB_WORKSPACE}
          git config user.email "dependabot[bot]@users.noreply.github.com"
          git config user.name "dependabot[bot]"
          git add docs/*
          git commit -m "Add train-validate-e2e single server for ${DATE_STR}" -s
          git pull --rebase
          git push

  # Calls the reusable clean_equinix_runner workflow with the runner name
  # reported by the Validate job.
  Cleanup:
    name: "Cleanup"
    needs: [Validate]
    uses: ./.github/workflows/clean_equinix_runner.yml
    secrets: inherit
    with:
      runner_name: ${{ needs.Validate.outputs.runner-name }}