[Inference] Enable vllm on HPU (intel#232)
* enable vllm gaudi

* fix ci

* fix ci

* enforce eager
KepingYan authored May 24, 2024
1 parent b37f275 commit 0e2b8f8
Showing 6 changed files with 83 additions and 13 deletions.
32 changes: 20 additions & 12 deletions .github/workflows/workflow_inference_gaudi2.yml
@@ -28,7 +28,7 @@ jobs:
name: inference
strategy:
matrix:
model: [ llama-2-7b-chat-hf, llama-2-70b-chat-hf ]
model: [ llama-2-7b-chat-hf, llama-2-70b-chat-hf, llama-2-7b-chat-hf-vllm ]
isPR:
- ${{inputs.ci_type == 'pr'}}

@@ -38,6 +38,7 @@ jobs:
include:
- { model: "llama-2-7b-chat-hf"}
- { model: "llama-2-70b-chat-hf"}
- { model: "llama-2-7b-chat-hf-vllm"}

runs-on: gaudi2

@@ -63,6 +64,8 @@ jobs:
target="${target}_gaudi2"
elif [[ ${{ matrix.model }} == "llama-2-70b-chat-hf" ]]; then
target="${target}_gaudi2"
elif [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
target="${target}_vllm_gaudi2"
fi
echo "target is ${target}"
echo "target=$target" >> $GITHUB_OUTPUT
@@ -77,7 +80,12 @@ jobs:
run: |
DF_SUFFIX=".gaudi2"
TARGET=${{steps.target.outputs.target}}
docker build --build-arg CACHEBUST=1 -f dev/docker/Dockerfile.habana -t ${TARGET}:habana .
if [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
dockerfile="dev/docker/Dockerfile.habana_vllm"
else
dockerfile="dev/docker/Dockerfile.habana"
fi
docker build --build-arg CACHEBUST=1 -f ${dockerfile} -t ${TARGET}:habana .
docker container prune -f
docker image prune -f
@@ -103,25 +111,25 @@ jobs:
import yaml
if ("${{ matrix.model }}" == "llama-2-7b-chat-hf"):
conf_path = "llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml"
with open(conf_path, encoding="utf-8") as reader:
result = yaml.load(reader, Loader=yaml.FullLoader)
result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}"
with open(conf_path, 'w') as output:
yaml.dump(result, output, sort_keys=False)
elif ("${{ matrix.model }}" == "llama-2-70b-chat-hf"):
conf_path = "llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml"
with open(conf_path, encoding="utf-8") as reader:
result = yaml.load(reader, Loader=yaml.FullLoader)
result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}"
with open(conf_path, 'w') as output:
yaml.dump(result, output, sort_keys=False)
elif ("${{ matrix.model }}" == "llama-2-7b-chat-hf-vllm"):
conf_path = "llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml"
with open(conf_path, encoding="utf-8") as reader:
result = yaml.load(reader, Loader=yaml.FullLoader)
result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}"
with open(conf_path, 'w') as output:
yaml.dump(result, output, sort_keys=False)
EOF
)
docker exec "${TARGET}" python -c "$CMD"
if [[ ${{ matrix.model }} == "llama-2-7b-chat-hf" ]]; then
docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml --keep_serve_terminal"
elif [[ ${{ matrix.model }} == "llama-2-70b-chat-hf" ]]; then
docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml --keep_serve_terminal"
elif [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
docker exec "${TARGET}" bash -c "huggingface-cli login --token ${{ env.HF_ACCESS_TOKEN }}"
docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml --keep_serve_terminal"
fi
echo Streaming query:
docker exec "${TARGET}" bash -c "python examples/inference/api_server_openai/query_http_requests.py --model_name ${{ matrix.model }} --streaming_response"
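
The token-injection step above repeats the same YAML edit for each model. A minimal Python sketch of that edit as a standalone helper; the function name and the direct call are illustrative, not part of the commit:

import yaml

def inject_auth_token(conf_path: str, token: str) -> None:
    # Load the model config, set the HF token, and write it back -- the same
    # three steps the CI step performs inline for each model.
    with open(conf_path, encoding="utf-8") as reader:
        result = yaml.load(reader, Loader=yaml.FullLoader)
    result["model_description"]["config"]["use_auth_token"] = token
    with open(conf_path, "w") as output:
        yaml.dump(result, output, sort_keys=False)

inject_auth_token(
    "llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml",
    "<HF_ACCESS_TOKEN>",  # placeholder; the workflow reads it from CI secrets
)
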
38 changes: 38 additions & 0 deletions dev/docker/Dockerfile.habana_vllm
@@ -0,0 +1,38 @@
FROM vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest

ENV LANG=en_US.UTF-8

WORKDIR /root/llm-on-ray

COPY ./pyproject.toml .
COPY ./MANIFEST.in .

# Create an empty llm_on_ray package directory so the 'pip install -e .' below succeeds before the sources are copied in
RUN mkdir ./llm_on_ray

RUN pip install -e . && \
pip install --upgrade-strategy eager optimum[habana] && \
pip install git+https://github.com/HabanaAI/[email protected]

# Install the Habana vLLM fork
RUN pip install -v git+https://github.com/HabanaAI/vllm-fork.git@habana_main
# Reinstall Ray because vLLM downgrades the Ray version
RUN pip install "ray>=2.10" "ray[serve,tune]>=2.10"

# Optional. Comment out if you are not using the UI
COPY ./dev/scripts/install-ui.sh /tmp

RUN /tmp/install-ui.sh

RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
service ssh restart

ENV no_proxy=localhost,127.0.0.1

# Required by DeepSpeed
ENV RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES=1

ENV PT_HPU_LAZY_ACC_PAR_MODE=0

ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true

1 change: 1 addition & 0 deletions llm_on_ray/inference/inference_config.py
@@ -57,6 +57,7 @@ def _check_precision(cls, v: str):
class Vllm(BaseModel):
enabled: bool = False
precision: str = "bf16"
enforce_eager: bool = False

@validator("precision")
def _check_precision(cls, v: str):
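
For context, a minimal sketch of the Vllm config block after this change, assuming pydantic v1 and precision constants named PRECISION_BF16/PRECISION_FP32; the validator body is inferred, not shown in the hunk:

from pydantic import BaseModel, validator

PRECISION_BF16 = "bf16"   # assumed constant names
PRECISION_FP32 = "fp32"

class Vllm(BaseModel):
    enabled: bool = False
    precision: str = "bf16"
    enforce_eager: bool = False  # new field: forwarded to the vLLM engine

    @validator("precision")
    def _check_precision(cls, v: str):
        # Inferred behavior: only bf16/fp32 are accepted.
        assert v in [PRECISION_BF16, PRECISION_FP32]
        return v
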
22 changes: 22 additions & 0 deletions llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml
@@ -0,0 +1,22 @@
port: 8000
name: llama-2-7b-chat-hf-vllm
route_prefix: /llama-2-7b-chat-hf-vllm
num_replicas: 1
cpus_per_worker: 8
gpus_per_worker: 0
deepspeed: false
vllm:
enabled: true
precision: bf16
enforce_eager: true
workers_per_group: 2
device: hpu
hpus_per_worker: 1
ipex:
enabled: false
precision: bf16
model_description:
model_id_or_path: meta-llama/Llama-2-7b-chat-hf
tokenizer_name_or_path: meta-llama/Llama-2-7b-chat-hf
config:
use_auth_token: ''
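
A quick sanity-check sketch that the new YAML carries the fields the Vllm config block expects; the path and keys are those added in this commit, while the assertions themselves are illustrative:

import yaml

conf_path = "llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml"
with open(conf_path, encoding="utf-8") as reader:
    conf = yaml.safe_load(reader)

assert conf["device"] == "hpu"
assert conf["vllm"]["enabled"] is True
assert conf["vllm"]["enforce_eager"] is True   # maps to Vllm.enforce_eager
assert conf["model_description"]["model_id_or_path"] == "meta-llama/Llama-2-7b-chat-hf"
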
2 changes: 1 addition & 1 deletion llm_on_ray/inference/predictor_deployment.py
@@ -63,7 +63,7 @@ def __init__(
# Used to determine if openai backend is used
self.use_openai = False

if infer_conf.device == "hpu":
if infer_conf.device == "hpu" and not self.use_vllm:
from llm_on_ray.inference.predictors.hpu_predictor import HPUPredictor

self.predictor = HPUPredictor(infer_conf)
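
The HPU branch now steps aside when vLLM is enabled. A rough sketch of the resulting dispatch; only the first condition is visible in this hunk, and the vLLM branch and the class name VllmPredictor are assumptions based on the files touched by this commit:

def select_predictor(infer_conf, use_vllm: bool, max_num_seqs: int = 256):
    # HPU without vLLM keeps using the dedicated HPU predictor (shown in the diff).
    if infer_conf.device == "hpu" and not use_vllm:
        from llm_on_ray.inference.predictors.hpu_predictor import HPUPredictor
        return HPUPredictor(infer_conf)
    # Assumed: the vLLM path is handled by the predictor in
    # llm_on_ray/inference/predictors/vllm_predictor.py.
    if use_vllm:
        from llm_on_ray.inference.predictors.vllm_predictor import VllmPredictor
        return VllmPredictor(infer_conf, max_num_seqs)
    raise NotImplementedError("other devices follow the pre-existing branches")
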
1 change: 1 addition & 0 deletions llm_on_ray/inference/predictors/vllm_predictor.py
@@ -50,6 +50,7 @@ def __init__(self, infer_conf: InferenceConfig, max_num_seqs):
dtype=dtype,
disable_log_requests=True,
max_num_seqs=max_num_seqs,
enforce_eager=infer_conf.vllm.enforce_eager,
)

self.engine = AsyncLLMEngine.from_engine_args(args)
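
The flag ends up on the engine arguments. A minimal sketch of how enforce_eager reaches vLLM, with the model id and max_num_seqs as placeholders; the real predictor derives them from InferenceConfig:

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

args = AsyncEngineArgs(
    model="meta-llama/Llama-2-7b-chat-hf",  # placeholder model id
    dtype="bfloat16",
    disable_log_requests=True,
    max_num_seqs=256,            # placeholder; passed in by the deployment
    enforce_eager=True,          # mirrors vllm.enforce_eager in the YAML config
)
engine = AsyncLLMEngine.from_engine_args(args)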
