-
Notifications
You must be signed in to change notification settings - Fork 11
131 lines (110 loc) · 4.7 KB
/
equinix_metal_e2e.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# End-to-end workflow: train power models and validate them on one server.
name: Training and Validation e2e with Single Server

# Manual trigger only; both inputs are optional and fall back to the defaults.
on:
  workflow_dispatch:
    inputs:
      # Read by the Validate job, which splits it on ',' and validates each
      # entry in turn.
      target_models:
        description: "Comma Separated List of Models to Train. Format: model_type/feature_type/trainer_name"
        default: "AbsPower/BPFOnly/SGDRegressorTrainer,AbsPower/BPFOnly/ExponentialRegressionTrainer,AbsPower/BPFOnly/LogarithmicRegressionTrainer,AbsPower/BPFOnly/PolynomialRegressionTrainer,AbsPower/BPFOnly/XgboostFitTrainer"
        required: false
      # Exposed to the setup playbooks as the MODEL_SERVER_IMAGE env variable.
      model_server_image:
        description: "Model Server Image to use for validation"
        default: "quay.io/sustainable_computing_io/kepler_model_server:latest"
        required: false
# Scopes granted to the workflow's GITHUB_TOKEN.
# `contents: write` is required because the Validate job commits and pushes
# the generated reports.
# NOTE(review): the need for the other three write scopes is not visible in
# this file (they may be used by the called workflows/actions) - confirm.
permissions:
  contents: write
  packages: write
  pull-requests: write
  repository-projects: write
jobs:
  # Calls the reusable workflow that creates the runner used by Validate.
  Create-runner:
    name: "Create Runner"
    uses: ./.github/workflows/create_equinix_runner.yml
    secrets: inherit

  # Sets up the test environment, trains the models, validates each requested
  # model and pushes the resulting reports under docs/.
  Validate:
    name: "Validate"
    needs: Create-runner
    runs-on: self-hosted
    # A failure here does not fail the workflow run; presumably so that
    # Cleanup still executes and the runner is released.
    continue-on-error: true
    outputs:
      # Handed to the Cleanup job as its `runner_name` input.
      runner-name: ${{ runner.name }}
    # Job-level so that every step sees the token.
    # NOTE(review): confirm this was not meant to be scoped to the final
    # step only.
    env:
      GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Run Setup Runner Action
        uses: ./.github/actions/setup-action

      - name: Run Setup Playbooks
        env:
          MODEL_SERVER_IMAGE: ${{ github.event.inputs.model_server_image }}
        run: |
          cd ${GITHUB_WORKSPACE}/ansible
          echo "Create VM"
          ansible-playbook -i inventory.yml kvm_playbook.yml
          echo "Install SSH tunnel"
          ansible-playbook ssh_tunnel_playbook.yml
          echo "Install Prometheus"
          ansible-playbook -i inventory.yml metrics_playbook.yml
          echo "Install Node Exporter"
          ansible-playbook -i inventory.yml node_exporter_playbook.yml -vvv
          echo "Verify node-exporter"
          # Diagnostics only: `|| true` keeps a failed probe from failing the step.
          sudo systemctl status node_exporter || true
          sudo ss -tuln | grep 9100 || true
          curl -s localhost:9100/metrics | grep collector || true
          echo "Install Kepler"
          ansible-playbook -i inventory.yml -vvv kepler_playbook.yml
          echo "Create ssh tunnel"
          ansible-playbook -i inventory.yml ssh_tunnel_playbook.yml
          echo "Install Model Server"
          ansible-playbook -i inventory.yml -vvv model_server_playbook.yml

      - name: Run Trainer Action
        uses: ./.github/actions/train-action
        with:
          model_export_path: /tmp/trained-equinix-models
          total_runtime_seconds: 300

      # Second checkout of the repository before validation.
      # NOTE(review): presumably resets the workspace after training - confirm.
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Run Validation Playbooks
        env:
          TOTAL_RUNTIME_SECONDS: 300
          # NOTE(review): "VALIDATOTR" looks like a misspelling of "VALIDATOR".
          # Left unchanged because the consumer of this variable is not
          # visible here - confirm before renaming.
          VALIDATOTR_CURVE_TYPE: "default"
          # The user-supplied input is passed through the environment instead
          # of being expanded into the script text, so its content is never
          # interpreted as shell code.
          TARGET_MODELS_INPUT: ${{ github.event.inputs.target_models }}
        run: |
          cd ${GITHUB_WORKSPACE}/ansible
          echo "Pass Trained Models to VM"
          ansible-playbook -i inventory.yml -v deploy_http_model_server.yml
          # DATE_STR names the per-run report directory; DATE_STRING is the
          # day shown in the summary table.
          export DATE_STR=$(date +%Y-%m-%d_%H-%M-%S)
          export DATE_STRING=$(date +%Y-%m-%d)
          mkdir -p ${GITHUB_WORKSPACE}/docs/train-validate-e2e/${DATE_STR}
          target_models_list="${TARGET_MODELS_INPUT}"
          # Split the comma separated list into a bash array.
          IFS=',' read -r -a models <<< "$target_models_list"
          for model in "${models[@]}"
          do
            echo "Running Model Server Playbook: $model"
            model_url="http://localhost:8080/${model}_0.zip"
            echo "Model exists with sufficient accuracy: $model_url"
            ansible-playbook -i inventory.yml -vvv model_server_restart.yml \
              -e "node_components_init_url=$model_url"
            echo "Run validation test"
            ansible-playbook -vvv kepler_validator.yml
            echo "Validation Finished"
            # Tag = suffix of the last /tmp/validator-* directory listed.
            export KEPLER_TAG=$(ls -d /tmp/validator-* |tail -1 | sed 's/.*validator-//g')
            # Deliberately unquoted below so the glob expands at the `mv`.
            FILE="/tmp/validator-*/*"
            mkdir -p ${GITHUB_WORKSPACE}/docs/train-validate-e2e/${DATE_STR}/${model}
            mv $FILE ${GITHUB_WORKSPACE}/docs/train-validate-e2e/${DATE_STR}/${model}
            echo "| " ${DATE_STRING} " | " ${model}-equinix " | [Report](train-validate-e2e/${DATE_STR}/${model}/report-${KEPLER_TAG}.md) |" \
              >> ${GITHUB_WORKSPACE}/docs/kepler-model-train-validate.md
          done
          # Commit the generated reports and push them to the repository.
          cd ${GITHUB_WORKSPACE}
          git config user.email "dependabot[bot]@users.noreply.github.com"
          git config user.name "dependabot[bot]"
          git add docs/*
          git commit -m "Add train-validate-e2e single server for ${DATE_STR}" -s
          git pull --rebase
          git push

  # Calls the reusable cleanup workflow with the name of the runner that
  # executed the Validate job.
  Cleanup:
    name: "Cleanup"
    needs: [Validate]
    uses: ./.github/workflows/clean_equinix_runner.yml
    secrets: inherit
    with:
      runner_name: ${{ needs.Validate.outputs.runner-name }}