From 0e2b8f8a9ebaff3f8be55289a6b836ab08cab4ca Mon Sep 17 00:00:00 2001
From: KepingYan
Date: Fri, 24 May 2024 15:24:07 +0800
Subject: [PATCH] [Inference] Enable vllm on HPU (#232)

* enable vllm gaudi
* fix ci
* fix ci
* enforce eager
---
 .../workflows/workflow_inference_gaudi2.yml   | 32 ++++++++++------
 dev/docker/Dockerfile.habana_vllm             | 38 +++++++++++++++++++
 llm_on_ray/inference/inference_config.py      |  1 +
 .../hpu/llama-2-7b-chat-hf-vllm-hpu.yaml      | 22 +++++++++++
 llm_on_ray/inference/predictor_deployment.py  |  2 +-
 .../inference/predictors/vllm_predictor.py    |  1 +
 6 files changed, 83 insertions(+), 13 deletions(-)
 create mode 100644 dev/docker/Dockerfile.habana_vllm
 create mode 100644 llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml

diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml
index 794b83181..588e8dab0 100644
--- a/.github/workflows/workflow_inference_gaudi2.yml
+++ b/.github/workflows/workflow_inference_gaudi2.yml
@@ -28,7 +28,7 @@ jobs:
     name: inference
     strategy:
       matrix:
-        model: [ llama-2-7b-chat-hf, llama-2-70b-chat-hf ]
+        model: [ llama-2-7b-chat-hf, llama-2-70b-chat-hf, llama-2-7b-chat-hf-vllm ]
         isPR:
           - ${{inputs.ci_type == 'pr'}}
 
@@ -38,6 +38,7 @@ jobs:
         include:
           - { model: "llama-2-7b-chat-hf"}
           - { model: "llama-2-70b-chat-hf"}
+          - { model: "llama-2-7b-chat-hf-vllm"}
 
     runs-on: gaudi2
 
@@ -63,6 +64,8 @@
             target="${target}_gaudi2"
           elif [[ ${{ matrix.model }} == "llama-2-70b-chat-hf" ]]; then
             target="${target}_gaudi2"
+          elif [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
+            target="${target}_vllm_gaudi2"
           fi
           echo "target is ${target}"
           echo "target=$target" >> $GITHUB_OUTPUT
@@ -77,7 +80,12 @@
         run: |
           DF_SUFFIX=".gaudi2"
           TARGET=${{steps.target.outputs.target}}
-          docker build --build-arg CACHEBUST=1 -f dev/docker/Dockerfile.habana -t ${TARGET}:habana .
+          if [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
+            dockerfile="dev/docker/Dockerfile.habana_vllm"
+          else
+            dockerfile="dev/docker/Dockerfile.habana"
+          fi
+          docker build --build-arg CACHEBUST=1 -f ${dockerfile} -t ${TARGET}:habana .
           docker container prune -f
           docker image prune -f
@@ -103,18 +111,15 @@
           import yaml
           if ("${{ matrix.model }}" == "llama-2-7b-chat-hf"):
               conf_path = "llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml"
-              with open(conf_path, encoding="utf-8") as reader:
-                  result = yaml.load(reader, Loader=yaml.FullLoader)
-                  result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}"
-              with open(conf_path, 'w') as output:
-                  yaml.dump(result, output, sort_keys=False)
           elif ("${{ matrix.model }}" == "llama-2-70b-chat-hf"):
               conf_path = "llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml"
-              with open(conf_path, encoding="utf-8") as reader:
-                  result = yaml.load(reader, Loader=yaml.FullLoader)
-                  result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}"
-              with open(conf_path, 'w') as output:
-                  yaml.dump(result, output, sort_keys=False)
+          elif ("${{ matrix.model }}" == "llama-2-7b-chat-hf-vllm"):
+              conf_path = "llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml"
+          with open(conf_path, encoding="utf-8") as reader:
+              result = yaml.load(reader, Loader=yaml.FullLoader)
+              result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}"
+          with open(conf_path, 'w') as output:
+              yaml.dump(result, output, sort_keys=False)
           EOF
           )
           docker exec "${TARGET}" python -c "$CMD"
@@ -122,6 +127,9 @@
             docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml --keep_serve_terminal"
           elif [[ ${{ matrix.model }} == "llama-2-70b-chat-hf" ]]; then
             docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml --keep_serve_terminal"
+          elif [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
+            docker exec "${TARGET}" bash -c "huggingface-cli login --token ${{ env.HF_ACCESS_TOKEN }}"
+            docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml --keep_serve_terminal"
           fi
           echo Streaming query:
           docker exec "${TARGET}" bash -c "python examples/inference/api_server_openai/query_http_requests.py --model_name ${{ matrix.model }} --streaming_response"
diff --git a/dev/docker/Dockerfile.habana_vllm b/dev/docker/Dockerfile.habana_vllm
new file mode 100644
index 000000000..4a2021411
--- /dev/null
+++ b/dev/docker/Dockerfile.habana_vllm
@@ -0,0 +1,38 @@
+FROM vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
+
+ENV LANG=en_US.UTF-8
+
+WORKDIR /root/llm-on-ray
+
+COPY ./pyproject.toml .
+COPY ./MANIFEST.in .
+
+# Create llm_on_ray package directory to bypass the following 'pip install -e' command
+RUN mkdir ./llm_on_ray
+
+RUN pip install -e . && \
+    pip install --upgrade-strategy eager optimum[habana] && \
+    pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.1
+
+# Install vllm habana env
+RUN pip install -v git+https://github.com/HabanaAI/vllm-fork.git@habana_main
+# Reinstall ray because vllm downgrades the ray version
+RUN pip install "ray>=2.10" "ray[serve,tune]>=2.10"
+
+# Optional. Comment out if you are not using UI
+COPY ./dev/scripts/install-ui.sh /tmp
+
+RUN /tmp/install-ui.sh
+
+RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
+    service ssh restart
+
+ENV no_proxy=localhost,127.0.0.1
+
+# Required by DeepSpeed
+ENV RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES=1
+
+ENV PT_HPU_LAZY_ACC_PAR_MODE=0
+
+ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
+
diff --git a/llm_on_ray/inference/inference_config.py b/llm_on_ray/inference/inference_config.py
index a731af55f..9d54d87a2 100644
--- a/llm_on_ray/inference/inference_config.py
+++ b/llm_on_ray/inference/inference_config.py
@@ -57,6 +57,7 @@ def _check_precision(cls, v: str):
 class Vllm(BaseModel):
     enabled: bool = False
     precision: str = "bf16"
+    enforce_eager: bool = False
 
     @validator("precision")
     def _check_precision(cls, v: str):
diff --git a/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml b/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml
new file mode 100644
index 000000000..869f41d7a
--- /dev/null
+++ b/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml
@@ -0,0 +1,22 @@
+port: 8000
+name: llama-2-7b-chat-hf-vllm
+route_prefix: /llama-2-7b-chat-hf-vllm
+num_replicas: 1
+cpus_per_worker: 8
+gpus_per_worker: 0
+deepspeed: false
+vllm:
+  enabled: true
+  precision: bf16
+  enforce_eager: true
+workers_per_group: 2
+device: hpu
+hpus_per_worker: 1
+ipex:
+  enabled: false
+  precision: bf16
+model_description:
+  model_id_or_path: meta-llama/Llama-2-7b-chat-hf
+  tokenizer_name_or_path: meta-llama/Llama-2-7b-chat-hf
+  config:
+    use_auth_token: ''
diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py
index f5ac35d80..a1055915d 100644
--- a/llm_on_ray/inference/predictor_deployment.py
+++ b/llm_on_ray/inference/predictor_deployment.py
@@ -63,7 +63,7 @@ def __init__(
         # Used to determine if openai backend is used
         self.use_openai = False
 
-        if infer_conf.device == "hpu":
+        if infer_conf.device == "hpu" and not self.use_vllm:
             from llm_on_ray.inference.predictors.hpu_predictor import HPUPredictor
 
             self.predictor = HPUPredictor(infer_conf)
diff --git a/llm_on_ray/inference/predictors/vllm_predictor.py b/llm_on_ray/inference/predictors/vllm_predictor.py
index 4d0f6389a..d3d09414a 100644
--- a/llm_on_ray/inference/predictors/vllm_predictor.py
+++ b/llm_on_ray/inference/predictors/vllm_predictor.py
@@ -50,6 +50,7 @@ def __init__(self, infer_conf: InferenceConfig, max_num_seqs):
             dtype=dtype,
             disable_log_requests=True,
             max_num_seqs=max_num_seqs,
+            enforce_eager=infer_conf.vllm.enforce_eager,
         )
 
         self.engine = AsyncLLMEngine.from_engine_args(args)
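
Below is an illustrative sketch (not part of the patch) of how the model served from the new llama-2-7b-chat-hf-vllm-hpu.yaml config could be queried once llm_on_ray-serve is running, mirroring the CI step that calls examples/inference/api_server_openai/query_http_requests.py. The OpenAI-compatible base URL, the api_key placeholder, and the prompt are assumptions; adjust them to the actual deployment.

# Sketch only: assumes the OpenAI-compatible endpoint of the Ray Serve app is
# reachable at http://localhost:8000/v1 (the YAML above uses port 8000) and
# that no real API key is required for a local deployment.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")

# Stream a short chat completion from the vLLM-on-HPU backend.
stream = client.chat.completions.create(
    model="llama-2-7b-chat-hf-vllm",
    messages=[{"role": "user", "content": "What is Ray Serve?"}],
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
print()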