# Training and Validation e2e with Single Server #115
# Workflow file for this run
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Manually triggered end-to-end workflow: a reusable workflow creates a
# runner, the Validate job runs Ansible playbooks on a self-hosted runner to
# set up the stack, train models and validate each requested model, and a
# final reusable workflow cleans the runner up.
name: Training and Validation e2e with Single Server

on:
  workflow_dispatch:
    inputs:
      target_models:
        description: 'Comma Separated List of Models to Train. Format: model_type/feature_type/trainer_name'
        required: false
        default: 'AbsPower/BPFOnly/SGDRegressorTrainer,AbsPower/BPFOnly/ExponentialRegressionTrainer,AbsPower/BPFOnly/LogarithmicRegressionTrainer,AbsPower/BPFOnly/PolynomialRegressionTrainer,AbsPower/BPFOnly/XgboostFitTrainer'
      model_server_image:
        description: 'Model Server Image to use for validation'
        required: false
        default: 'quay.io/sustainable_computing_io/kepler_model_server:latest'

permissions:
  pull-requests: write
  contents: write
  repository-projects: write
  packages: write

jobs:
  Create-runner:
    name: "Create Runner"
    uses: ./.github/workflows/create_equinix_runner.yml
    secrets: inherit

  Validate:
    name: "Validate"
    needs: Create-runner
    runs-on: self-hosted
    # Presumably set so that a failed validation does not prevent the Cleanup
    # job (which needs this job) from running - confirm.
    continue-on-error: true
    outputs:
      # Consumed by the Cleanup job as its runner_name input.
      runner-name: ${{ runner.name }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Run Setup Runner Action
        uses: ./.github/actions/setup-action

      - name: Run Setup Playbooks
        env:
          MODEL_SERVER_IMAGE: ${{ github.event.inputs.model_server_image }}
        run: |
          cd ${GITHUB_WORKSPACE}/ansible
          echo "Create VM"
          ansible-playbook -i inventory.yml kvm_playbook.yml
          echo "Install SSH tunnel"
          ansible-playbook ssh_tunnel_playbook.yml
          echo "Install Prometheus"
          ansible-playbook -i inventory.yml metrics_playbook.yml
          echo "Install Node Exporter"
          ansible-playbook -i inventory.yml node_exporter_playbook.yml -vvv
          echo "Verify node-exporter"
          # Diagnostics only: "|| true" keeps a failed check from failing the step.
          sudo systemctl status node_exporter || true
          sudo ss -tuln | grep 9100 || true
          curl -s localhost:9100/metrics | grep collector || true
          echo "Install Kepler"
          ansible-playbook -i inventory.yml -vvv kepler_playbook.yml
          echo "Create ssh tunnel"
          ansible-playbook -i inventory.yml ssh_tunnel_playbook.yml
          echo "Install Model Server"
          ansible-playbook -i inventory.yml -vvv model_server_playbook.yml

      - name: Run Trainer Action
        uses: ./.github/actions/train-action
        with:
          model_export_path: /tmp/trained-equinix-models
          total_runtime_seconds: 300

      - name: Checkout code
        uses: actions/checkout@v4

      - name: Run Validation Playbooks
        # A step may define "env" only once; all variables for this step,
        # including GITHUB_TOKEN, therefore live in this single mapping.
        env:
          TOTAL_RUNTIME_SECONDS: "300"
          # NOTE(review): the name looks like a typo of VALIDATOR_CURVE_TYPE.
          # It is kept as-is because its consumer is not visible in this file
          # - confirm against the playbooks before renaming.
          VALIDATOTR_CURVE_TYPE: "default"
          # The dispatch input is handed to the script through the environment
          # instead of being expanded into the script text, so its content
          # cannot be interpreted as shell code.
          TARGET_MODELS: ${{ github.event.inputs.target_models }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          cd ${GITHUB_WORKSPACE}/ansible
          echo "Pass Trained Models to VM"
          ansible-playbook -i inventory.yml -v deploy_http_model_server.yml
          export DATE_STR=$(date +%Y-%m-%d_%H-%M-%S)
          export DATE_STRING=$(date +%Y-%m-%d)
          mkdir -p ${GITHUB_WORKSPACE}/docs/train-validate-e2e/${DATE_STR}
          target_models_list="${TARGET_MODELS}"
          # Split the comma separated list into one entry per model.
          IFS=',' read -r -a models <<< "$target_models_list"
          for model in "${models[@]}"
          do
            echo "Running Model Server Playbook: $model"
            model_url="http://localhost:8080/${model}_0.zip"
            echo "Model exists with sufficient accuracy: $model_url"
            ansible-playbook -i inventory.yml -vvv model_server_restart.yml \
              -e "node_components_init_url=$model_url"
            echo "Run validation test"
            ansible-playbook -vvv kepler_validator.yml
            echo "Validation Finished"
            # The tag is the suffix of the last /tmp/validator-* directory listed.
            export KEPLER_TAG=$(ls -d /tmp/validator-* |tail -1 | sed 's/.*validator-//g')
            # Left unquoted at the point of use on purpose so the glob expands.
            FILE="/tmp/validator-*/*"
            mkdir -p ${GITHUB_WORKSPACE}/docs/train-validate-e2e/${DATE_STR}/${model}
            mv $FILE ${GITHUB_WORKSPACE}/docs/train-validate-e2e/${DATE_STR}/${model}
            # Append one table row per model that links to its report.
            echo "| " ${DATE_STRING} " | " ${model}-equinix " | [Report](train-validate-e2e/${DATE_STR}/${model}/report-${KEPLER_TAG}.md) |" \
              >> ${GITHUB_WORKSPACE}/docs/kepler-model-train-validate.md
          done
          cd ${GITHUB_WORKSPACE}
          git config user.email "dependabot[bot]@users.noreply.github.com"
          git config user.name "dependabot[bot]"
          git add docs/*
          git commit -m "Add train-validate-e2e single server for ${DATE_STR}" -s
          git pull --rebase
          git push

  Cleanup:
    name: "Cleanup"
    needs: [Validate]
    uses: ./.github/workflows/clean_equinix_runner.yml
    secrets: inherit
    with:
      runner_name: ${{ needs.Validate.outputs.runner-name }}