From 0e2b8f8a9ebaff3f8be55289a6b836ab08cab4ca Mon Sep 17 00:00:00 2001
From: KepingYan
Date: Fri, 24 May 2024 15:24:07 +0800
Subject: [PATCH] [Inference] Enable vllm on HPU (#232)

* enable vllm gaudi
* fix ci
* fix ci
* enforce eager
---
 .../workflows/workflow_inference_gaudi2.yml   | 32 ++++++++++------
 dev/docker/Dockerfile.habana_vllm             | 38 +++++++++++++++++++
 llm_on_ray/inference/inference_config.py      |  1 +
 .../hpu/llama-2-7b-chat-hf-vllm-hpu.yaml      | 22 +++++++++++
 llm_on_ray/inference/predictor_deployment.py  |  2 +-
 .../inference/predictors/vllm_predictor.py    |  1 +
 6 files changed, 83 insertions(+), 13 deletions(-)
 create mode 100644 dev/docker/Dockerfile.habana_vllm
 create mode 100644 llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml

diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml
index 794b83181..588e8dab0 100644
--- a/.github/workflows/workflow_inference_gaudi2.yml
+++ b/.github/workflows/workflow_inference_gaudi2.yml
@@ -28,7 +28,7 @@ jobs:
     name: inference
     strategy:
       matrix:
-        model: [ llama-2-7b-chat-hf, llama-2-70b-chat-hf ]
+        model: [ llama-2-7b-chat-hf, llama-2-70b-chat-hf, llama-2-7b-chat-hf-vllm ]
         isPR:
           - ${{inputs.ci_type == 'pr'}}
 
@@ -38,6 +38,7 @@ jobs:
         include:
           - { model: "llama-2-7b-chat-hf"}
           - { model: "llama-2-70b-chat-hf"}
+          - { model: "llama-2-7b-chat-hf-vllm"}
 
     runs-on: gaudi2
 
@@ -63,6 +64,8 @@
             target="${target}_gaudi2"
           elif [[ ${{ matrix.model }} == "llama-2-70b-chat-hf" ]]; then
             target="${target}_gaudi2"
+          elif [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
+            target="${target}_vllm_gaudi2"
           fi
           echo "target is ${target}"
           echo "target=$target" >> $GITHUB_OUTPUT
@@ -77,7 +80,12 @@
         run: |
           DF_SUFFIX=".gaudi2"
           TARGET=${{steps.target.outputs.target}}
-          docker build --build-arg CACHEBUST=1 -f dev/docker/Dockerfile.habana -t ${TARGET}:habana .
+          if [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
+            dockerfile="dev/docker/Dockerfile.habana_vllm"
+          else
+            dockerfile="dev/docker/Dockerfile.habana"
+          fi
+          docker build --build-arg CACHEBUST=1 -f ${dockerfile} -t ${TARGET}:habana .
           docker container prune -f
           docker image prune -f
@@ -103,18 +111,15 @@
           import yaml
           if ("${{ matrix.model }}" == "llama-2-7b-chat-hf"):
               conf_path = "llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml"
-              with open(conf_path, encoding="utf-8") as reader:
-                  result = yaml.load(reader, Loader=yaml.FullLoader)
-                  result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}"
-              with open(conf_path, 'w') as output:
-                  yaml.dump(result, output, sort_keys=False)
           elif ("${{ matrix.model }}" == "llama-2-70b-chat-hf"):
               conf_path = "llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml"
-              with open(conf_path, encoding="utf-8") as reader:
-                  result = yaml.load(reader, Loader=yaml.FullLoader)
-                  result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}"
-              with open(conf_path, 'w') as output:
-                  yaml.dump(result, output, sort_keys=False)
+          elif ("${{ matrix.model }}" == "llama-2-7b-chat-hf-vllm"):
+              conf_path = "llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml"
+          with open(conf_path, encoding="utf-8") as reader:
+              result = yaml.load(reader, Loader=yaml.FullLoader)
+              result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}"
+          with open(conf_path, 'w') as output:
+              yaml.dump(result, output, sort_keys=False)
           EOF
           )
           docker exec "${TARGET}" python -c "$CMD"
@@ -122,6 +127,9 @@
             docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml --keep_serve_terminal"
           elif [[ ${{ matrix.model }} == "llama-2-70b-chat-hf" ]]; then
             docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml --keep_serve_terminal"
+          elif [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
+            docker exec "${TARGET}" bash -c "huggingface-cli login --token ${{ env.HF_ACCESS_TOKEN }}"
+            docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml --keep_serve_terminal"
           fi
           echo Streaming query:
           docker exec "${TARGET}" bash -c "python examples/inference/api_server_openai/query_http_requests.py --model_name ${{ matrix.model }} --streaming_response"
diff --git a/dev/docker/Dockerfile.habana_vllm b/dev/docker/Dockerfile.habana_vllm
new file mode 100644
index 000000000..4a2021411
--- /dev/null
+++ b/dev/docker/Dockerfile.habana_vllm
@@ -0,0 +1,38 @@
+FROM vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
+
+ENV LANG=en_US.UTF-8
+
+WORKDIR /root/llm-on-ray
+
+COPY ./pyproject.toml .
+COPY ./MANIFEST.in .
+
+# Create llm_on_ray package directory to bypass the following 'pip install -e' command
+RUN mkdir ./llm_on_ray
+
+RUN pip install -e . && \
+    pip install --upgrade-strategy eager optimum[habana] && \
+    pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.1
+
+# Install vllm habana env
+RUN pip install -v git+https://github.com/HabanaAI/vllm-fork.git@habana_main
+# Reinstall ray because vllm downgrades the ray version
+RUN pip install "ray>=2.10" "ray[serve,tune]>=2.10"
+
+# Optional. Comment out if you are not using UI
+COPY ./dev/scripts/install-ui.sh /tmp
+
+RUN /tmp/install-ui.sh
+
+RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
+    service ssh restart
+
+ENV no_proxy=localhost,127.0.0.1
+
+# Required by DeepSpeed
+ENV RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES=1
+
+ENV PT_HPU_LAZY_ACC_PAR_MODE=0
+
+ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
+
diff --git a/llm_on_ray/inference/inference_config.py b/llm_on_ray/inference/inference_config.py
index a731af55f..9d54d87a2 100644
--- a/llm_on_ray/inference/inference_config.py
+++ b/llm_on_ray/inference/inference_config.py
@@ -57,6 +57,7 @@ def _check_precision(cls, v: str):
 class Vllm(BaseModel):
     enabled: bool = False
     precision: str = "bf16"
+    enforce_eager: bool = False
 
     @validator("precision")
     def _check_precision(cls, v: str):
diff --git a/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml b/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml
new file mode 100644
index 000000000..869f41d7a
--- /dev/null
+++ b/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml
@@ -0,0 +1,22 @@
+port: 8000
+name: llama-2-7b-chat-hf-vllm
+route_prefix: /llama-2-7b-chat-hf-vllm
+num_replicas: 1
+cpus_per_worker: 8
+gpus_per_worker: 0
+deepspeed: false
+vllm:
+  enabled: true
+  precision: bf16
+  enforce_eager: true
+workers_per_group: 2
+device: hpu
+hpus_per_worker: 1
+ipex:
+  enabled: false
+  precision: bf16
+model_description:
+  model_id_or_path: meta-llama/Llama-2-7b-chat-hf
+  tokenizer_name_or_path: meta-llama/Llama-2-7b-chat-hf
+  config:
+    use_auth_token: ''
diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py
index f5ac35d80..a1055915d 100644
--- a/llm_on_ray/inference/predictor_deployment.py
+++ b/llm_on_ray/inference/predictor_deployment.py
@@ -63,7 +63,7 @@ def __init__(
         # Used to determine if openai backend is used
         self.use_openai = False
 
-        if infer_conf.device == "hpu":
+        if infer_conf.device == "hpu" and not self.use_vllm:
             from llm_on_ray.inference.predictors.hpu_predictor import HPUPredictor
 
             self.predictor = HPUPredictor(infer_conf)
diff --git a/llm_on_ray/inference/predictors/vllm_predictor.py b/llm_on_ray/inference/predictors/vllm_predictor.py
index 4d0f6389a..d3d09414a 100644
--- a/llm_on_ray/inference/predictors/vllm_predictor.py
+++ b/llm_on_ray/inference/predictors/vllm_predictor.py
@@ -50,6 +50,7 @@ def __init__(self, infer_conf: InferenceConfig, max_num_seqs):
             dtype=dtype,
             disable_log_requests=True,
             max_num_seqs=max_num_seqs,
+            enforce_eager=infer_conf.vllm.enforce_eager,
         )
 
         self.engine = AsyncLLMEngine.from_engine_args(args)
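
Below is an illustrative sketch (not part of the patch) of how the model served from the new llama-2-7b-chat-hf-vllm-hpu.yaml config could be queried once llm_on_ray-serve is running, mirroring the CI step that calls examples/inference/api_server_openai/query_http_requests.py. The OpenAI-compatible base URL, the api_key placeholder, and the prompt are assumptions; adjust them to the actual deployment.

# Sketch only: assumes the OpenAI-compatible endpoint of the Ray Serve app is
# reachable at http://localhost:8000/v1 (the YAML above uses port 8000) and
# that no real API key is required for a local deployment.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")

# Stream a short chat completion from the vLLM-on-HPU backend.
stream = client.chat.completions.create(
    model="llama-2-7b-chat-hf-vllm",
    messages=[{"role": "user", "content": "What is Ray Serve?"}],
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
print()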