workflow_inference_gaudi2.yml
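# Reusable workflow: runs inference tests for a matrix of models on a Gaudi2
# (Habana HPU) self-hosted runner. It builds a Habana CI image, starts a test
# container and a Ray cluster inside it, serves each model with llm_on_ray-serve,
# and issues a streaming query against the OpenAI-compatible example server.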
name: Inference

on:
  workflow_call:
    inputs:
      ci_type:
        type: string
        default: 'pr'
      runner_container_image:
        type: string
        default: '127.0.0.1:5000/llmray-build'
      runner_config_path:
        type: string
        default: '/home/ci/llm-ray-actions-runner'
      code_checkout_path:
        type: string
        default: '/home/ci/actions-runner/_work/llm-on-ray/llm-on-ray'
      model_cache_path:
        type: string
        default: '/scratch-2/huggingface/cache'
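# This workflow is only triggered via `workflow_call`. A caller workflow would
# invoke it roughly as sketched below (hypothetical caller job name and file
# path; the inputs shown are the ones declared above, all of which have defaults):
#
#   jobs:
#     inference-gaudi2:
#       uses: ./.github/workflows/workflow_inference_gaudi2.yml
#       with:
#         ci_type: pr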
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-inf-gaudi2
  cancel-in-progress: true

permissions: # added using https://github.com/step-security/secure-repo
  contents: read
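# A single job fans out over the model matrix. The isPR dimension, combined with
# the exclude/include lists below, determines which model configurations are
# generated depending on whether the ci_type input is 'pr'.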
jobs:
  inference:
    name: inference
    strategy:
      matrix:
        model: [ bloom-7b1, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, mistral-7b-v0.1, mpt-7b, Qwen2-7B-Instruct, llama-2-7b-chat-hf-vllm ]
        isPR:
          - ${{inputs.ci_type == 'pr'}}
        exclude:
          - { isPR: true }
        include:
          - { model: "bloom-7b1"}
          - { model: "CodeLlama-7b-hf"}
          - { model: "falcon-7b"}
          - { model: "falcon-40b"}
          - { model: "gemma-2b"}
          - { model: "gpt-j-6b"}
          - { model: "gpt2"}
          - { model: "llama-2-7b-chat-hf"}
          - { model: "llama-2-70b-chat-hf"}
          - { model: "meta-llama-3-8b-instruct"}
          - { model: "meta-llama-3-70b-instruct"}
          - { model: "mpt-7b"}
          - { model: "mistral-7b-v0.1"}
          - { model: "Qwen2-7B-Instruct"}
          - { model: "llama-2-7b-chat-hf-vllm"}
    runs-on: gaudi2
    defaults:
      run:
        shell: bash
    container:
      image: ${{ inputs.runner_container_image }}
      env:
        SHELL: bash -eo pipefail
        http_proxy:
        https_proxy:
      volumes:
        - /var/run/docker.sock:/var/run/docker.sock
        - ${{ inputs.runner_config_path }}:/root/actions-runner-config

    steps:
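      # Derive a per-model target name: "inference_gaudi2" for most models,
      # "inference_vllm_gaudi2" for the vLLM variant. It is exposed as a step
      # output so later steps can reuse it for the image and container names.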
      - name: Determine Target
        id: "target"
        run: |
          target="inference"
          if [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
            target="${target}_vllm_gaudi2"
          else
            target="${target}_gaudi2"
          fi
          echo "target is ${target}"
          echo "target=$target" >> $GITHUB_OUTPUT
      - name: Checkout
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
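      # Build the Habana CI image; the vLLM model variant uses a dedicated Dockerfile.
      # Pruning keeps stopped containers and dangling images from piling up on the
      # self-hosted runner.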
      - name: Build Docker Image
        run: |
          DF_SUFFIX=".gaudi2"
          TARGET=${{steps.target.outputs.target}}
          if [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
            dockerfile="dev/docker/ci/Dockerfile.habana_vllm"
          else
            dockerfile="dev/docker/ci/Dockerfile.habana"
          fi
          docker build --build-arg CACHEBUST=1 -f ${dockerfile} -t ${TARGET}:habana .
          docker container prune -f
          docker image prune -f
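      # Remove any leftover container with the same name (running or exited), then
      # start a fresh one with the Habana runtime, host networking/IPC, and the code
      # checkout plus the HuggingFace model cache mounted in.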
      - name: Start Docker Container
        run: |
          TARGET=${{steps.target.outputs.target}}
          cid=$(docker ps -q --filter "name=${TARGET}")
          if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi
          # check and remove exited container
          cid=$(docker ps -a -q --filter "name=${TARGET}")
          if [[ ! -z "$cid" ]]; then docker rm $cid; fi
          docker run -tid --name="${TARGET}" --hostname="${TARGET}-container" --runtime=habana -v /home/yizhong/Model-References:/root/Model-References -v ${{ inputs.code_checkout_path }}:/root/llm-on-ray -v ${{ inputs.model_cache_path }}:/root/.cache/huggingface/hub/ -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --cap-add sys_ptrace --net=host --ipc=host ${TARGET}:habana
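      # Bring up a Ray cluster inside the test container; model serving and the
      # test query below run against this cluster.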
      - name: Start Ray Cluster
        run: |
          TARGET=${{steps.target.outputs.target}}
          docker exec "${TARGET}" bash -c "./dev/scripts/start-ray-cluster.sh"
      - name: Run Inference Test
        run: |
          TARGET=${{steps.target.outputs.target}}
          CMD=$(cat << EOF
          import yaml
          conf_path = "llm_on_ray/inference/models/hpu/" + "${{ matrix.model }}" + "-hpu.yaml"
          if ("${{ matrix.model }}" == "llama-2-7b-chat-hf-vllm"):
              conf_path = "llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml"
          with open(conf_path, encoding="utf-8") as reader:
              result = yaml.load(reader, Loader=yaml.FullLoader)
          with open(conf_path, 'w') as output:
              yaml.dump(result, output, sort_keys=False)
          EOF
          )
          docker exec "${TARGET}" python -c "$CMD"
          if [[ ${{ matrix.model }} == "llama-2-7b-chat-hf" ]]; then
            docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml --keep_serve_terminal"
          elif [[ ${{ matrix.model }} == "llama-2-70b-chat-hf" ]]; then
            docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml --keep_serve_terminal"
          elif [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
            docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml --keep_serve_terminal"
          else
            docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/${{ matrix.model }}-hpu.yaml --keep_serve_terminal"
          fi
          echo Streaming query:
          docker exec "${TARGET}" bash -c "python examples/inference/api_server_openai/query_http_requests.py --model_name ${{ matrix.model }} --streaming_response"
      - name: Stop Ray
        run: |
          TARGET=${{steps.target.outputs.target}}
          cid=$(docker ps -q --filter "name=${TARGET}")
          if [[ ! -z "$cid" ]]; then
            docker exec "${TARGET}" bash -c "ray stop"
          fi
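      # Always remove the test container, whether the previous steps passed or failed.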
      - name: Stop Container
        if: success() || failure()
        run: |
          TARGET=${{steps.target.outputs.target}}
          cid=$(docker ps -q --filter "name=${TARGET}")
          if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi