Build docker files for both CI and User #219

Merged · 47 commits · Jul 18, 2024
Changes from 25 commits

Commits
813c8ef
add docker start for user
yutianchen666 May 13, 2024
17e6a24
add docker start for user
yutianchen666 May 13, 2024
b6b5f10
Merge branch 'intel:main' into docker_user
yutianchen666 May 14, 2024
84e1136
merge docker file
yutianchen666 May 14, 2024
8bb9659
fix
yutianchen666 May 17, 2024
37538ba
fix deepspeed
yutianchen666 May 17, 2024
c52372c
fix
yutianchen666 May 17, 2024
0d23dd4
fix
yutianchen666 May 17, 2024
ff0f0d9
fix
yutianchen666 May 17, 2024
d690eff
Merge branch 'intel:main' into docker_user
yutianchen666 May 20, 2024
879b0c6
add ray user
yutianchen666 May 21, 2024
9692a59
Merge branch 'main' into docker_user
yutianchen666 May 27, 2024
c804587
add git
yutianchen666 May 28, 2024
5b62d3c
add git
yutianchen666 May 28, 2024
bb0d36d
fix
yutianchen666 May 28, 2024
4bc41ce
fix
yutianchen666 May 28, 2024
39c9259
fix
yutianchen666 May 28, 2024
7d79a70
fix
yutianchen666 May 28, 2024
79ef0fa
Merge branch 'intel:main' into docker_user
yutianchen666 Jun 3, 2024
d234f53
fix
yutianchen666 Jun 3, 2024
064056d
Merge branch 'docker_user' of https://github.com/yutianchen666/llm-on…
yutianchen666 Jun 3, 2024
6d31487
fix
yutianchen666 Jun 3, 2024
36259b0
fix reademe
yutianchen666 Jun 4, 2024
18ed87c
Merge branch 'intel:main' into docker_user
yutianchen666 Jun 4, 2024
a2816fc
fix dockerfile path
yutianchen666 Jun 4, 2024
e266865
Merge branch 'intel:main' into docker_user
yutianchen666 Jun 11, 2024
1e2db9a
fix re
yutianchen666 Jun 11, 2024
acd43aa
Update README.md
xwu99 Jun 19, 2024
0c5cc08
Update README.md
xwu99 Jun 19, 2024
bd6a905
Merge branch 'intel:main' into docker_user
yutianchen666 Jun 21, 2024
7be7d5a
fix
yutianchen666 Jun 21, 2024
4f921c3
fix docker file
yutianchen666 Jun 25, 2024
fdf6769
fix docker file
yutianchen666 Jun 25, 2024
52ecb41
Merge branch 'intel:main' into docker_user
yutianchen666 Jun 28, 2024
780b275
fix
yutianchen666 Jun 28, 2024
52b1a60
fix
yutianchen666 Jun 28, 2024
1b984da
fix review
yutianchen666 Jul 5, 2024
1c9f96b
fix md
yutianchen666 Jul 5, 2024
3f88633
fix md
yutianchen666 Jul 5, 2024
8ed1728
fix md
yutianchen666 Jul 5, 2024
d65d085
fix md
yutianchen666 Jul 5, 2024
77bb907
fix rebase
yutianchen666 Jul 18, 2024
02826aa
fix rebase
yutianchen666 Jul 18, 2024
5ccb4e4
fix rebase
yutianchen666 Jul 18, 2024
98f518e
fix rebase
yutianchen666 Jul 18, 2024
e0f5fad
Merge branch 'intel:main' into docker_user
yutianchen666 Jul 18, 2024
c1018bb
fix
yutianchen666 Jul 18, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/workflow_inference_gaudi2.yml
@@ -81,9 +81,9 @@ jobs:
DF_SUFFIX=".gaudi2"
TARGET=${{steps.target.outputs.target}}
if [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
-dockerfile="dev/docker/Dockerfile.habana_vllm"
+dockerfile="dev/docker/ci/Dockerfile.habana_vllm"
else
-dockerfile="dev/docker/Dockerfile.habana"
+dockerfile="dev/docker/ci/Dockerfile.habana"
fi
docker build --build-arg CACHEBUST=1 -f ${dockerfile} -t ${TARGET}:habana .
docker container prune -f
2 changes: 1 addition & 1 deletion .github/workflows/workflow_test_benchmark.yml
@@ -68,7 +68,7 @@ jobs:
run: |
DF_SUFFIX=".vllm"
TARGET=${{steps.target.outputs.target}}
-docker build ./ --build-arg CACHEBUST=1 --build-arg http_proxy=${{ inputs.http_proxy }} --build-arg https_proxy=${{ inputs.https_proxy }} -f dev/docker/Dockerfile${DF_SUFFIX} -t ${TARGET}:latest
+docker build ./ --build-arg CACHEBUST=1 --build-arg http_proxy=${{ inputs.http_proxy }} --build-arg https_proxy=${{ inputs.https_proxy }} -f dev/docker/ci/Dockerfile${DF_SUFFIX} -t ${TARGET}:latest
docker container prune -f
docker image prune -f

57 changes: 56 additions & 1 deletion README.md
@@ -33,7 +33,7 @@ LLM-on-Ray's modular workflow structure is designed to comprehensively cater to
![llm-on-ray](./docs/assets/solution_technical_overview.png)


-## Getting Started
+## Getting Started Locally With Source Code
This guide will assist you in setting up LLM-on-Ray on Intel CPU locally, covering the initial setup, finetuning models, and deploying them for serving.
### Setup

@@ -102,6 +102,61 @@ After deploying the model endpoint, you can access and test it by using the scri
python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/gpt2
```

## Getting Started With Docker
This guide will assist you in setting up LLM-on-Ray with Docker.
```bash
git clone https://github.com/intel/llm-on-ray.git
cd llm-on-ray
```
The user-facing Dockerfile is dev/docker/Dockerfile.user.
Detailed Docker parameters can be configured in dev/scripts/start_with_docker.sh.
```bash
##Set Your proxy and cache path here
HTTP_PROXY='Your proxy'
HTTPS_PROXY='Your proxy'
HF_TOKEN='Your hf_token'
code_checkout_path='If you need to use the modified llm-on-ray repository, define your path here'
model_cache_path='If you need to use huggingface model cache, define your path here'
```

#### 1. Build Docker Image
Software requirements: Ubuntu and Docker
```bash
## If you need to use a proxy, change the corresponding settings in 'dev/scripts/start_with_docker.sh'
source dev/scripts/start_with_docker.sh
## The Dockerfile path is 'dev/docker/Dockerfile.user'.
build_docker ## Use the default cpu and deepspeed environment for LLM serving
```

Change the build_docker function's arguments for a different environment:
```bash
build_docker vllm ## use vLLM for LLM serving
build_docker ipex-llm ## use ipex-llm for LLM serving
```
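For reference, a minimal sketch of roughly what `build_docker vllm` expands to, based on the `build_docker` function in dev/scripts/start_with_docker.sh shown later in this PR (proxy build args omitted):
```bash
docker build ./ \
  --build-arg=CACHEBUST=1 \
  --build-arg=DOCKER_NAME=".vllm" \
  --build-arg=PYPJ="vllm" \
  -f dev/docker/Dockerfile.user -t serving:latest
```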

#### 2. Start Docker
```bash
## If you need to use a modified llm-on-ray repository or a model cache path,
## please change the corresponding settings in 'dev/scripts/start_with_docker.sh'

start_docker ## Run the container and serve the default model (gpt2)
start_docker {a supported model, e.g. gpt-j-6b/llama-2-7b-chat-hf/gemma-2b} ## Run the container and serve another model

## You can mount your own repositories and modify the model config file to support more models
```
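Under the hood, `start_docker llama-2-7b-chat-hf` issues roughly the following `docker run` command, a sketch based on the `start_docker` function in dev/scripts/start_with_docker.sh (proxy and volume-mount options omitted):
```bash
docker run -tid --name=serving \
  -e=hf_token=${HF_TOKEN} \
  -e=model_name=llama-2-7b-chat-hf \
  serving:latest
```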

#### 3. Start LLM-on-Ray
```bash
## Access and test the model, same as when starting from source code
# using requests library
docker exec serving bash -c "python examples/inference/api_server_openai/query_http_requests.py"
# using OpenAI SDK (run in one shell so the exported variables are visible to the query script)
docker exec serving bash -c "pip install 'openai>=1.0' && export OPENAI_BASE_URL=http://localhost:8000/v1 && export OPENAI_API_KEY='not_a_real_key' && python examples/inference/api_server_openai/query_openai_sdk.py"
```
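A sketch of a direct query with curl from inside the container; the /v1/chat/completions route and the gpt2 model id follow standard OpenAI API conventions and the default config, and are assumptions rather than something spelled out in this PR:
```bash
docker exec serving curl -s http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "gpt2", "messages": [{"role": "user", "content": "Hello!"}]}'
```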

## Documents
The following are detailed guidelines for pretraining, finetuning and serving LLMs in various computing environment.

52 changes: 52 additions & 0 deletions dev/docker/Dockerfile.user
@@ -0,0 +1,52 @@
# syntax=docker/dockerfile:1
FROM ubuntu:22.04

# Define build arguments
ARG DOCKER_NAME=default
ARG PYPJ=default
ENV LANG C.UTF-8

WORKDIR /root/

RUN --mount=type=cache,target=/var/cache/apt apt-get update -y \
&& apt-get install -y build-essential cmake wget curl git vim htop ssh net-tools \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

ENV CONDA_DIR /opt/conda
RUN wget --quiet https://github.com/conda-forge/miniforge/releases/download/23.3.1-1/Miniforge3-Linux-x86_64.sh -O ~/miniforge.sh && \
/bin/bash ~/miniforge.sh -b -p /opt/conda
ENV PATH $CONDA_DIR/bin:$PATH

# setup env
SHELL ["/bin/bash", "--login", "-c"]

RUN --mount=type=cache,target=/opt/conda/pkgs conda init bash && \
unset -f conda && \
export PATH=$CONDA_DIR/bin/:${PATH} && \
mamba config --add channels intel && \
mamba install -y -c conda-forge python==3.9 gxx=12.3 gxx_linux-64=12.3 libxcrypt

# Used to invalidate docker build cache with --build-arg CACHEBUST=$(date +%s)
ARG CACHEBUST=1

RUN git clone https://github.com/intel/llm-on-ray.git
RUN if [ -d "llm-on-ray" ]; then echo "Clone successful"; else echo "Clone failed" && exit 1; fi
WORKDIR /root/llm-on-ray

RUN git fetch origin pull/219/head:pr-219 && \
git checkout pr-219

RUN ls -la

RUN --mount=type=cache,target=/root/.cache/pip pip install -e .[${PYPJ}] --extra-index-url https://download.pytorch.org/whl/cpu \
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/

# Use shell scripting to conditionally install packages
RUN if [ "${DOCKER_NAME}" = ".cpu_and_deepspeed" ]; then ds_report && ./dev/scripts/install-oneapi.sh;fi
RUN if [ "${DOCKER_NAME}" = ".ipex-llm" ]; then ./dev/scripts/install-oneapi.sh; fi
RUN if [ "${DOCKER_NAME}" = ".vllm" ]; then ./dev/scripts/install-vllm-cpu.sh; fi


RUN chmod +x ./dev/scripts/entrypoint_user.sh
ENTRYPOINT ["./dev/scripts/entrypoint_user.sh"]
9 changes: 8 additions & 1 deletion dev/docker/README.md
@@ -1 +1,8 @@
-Dockerfiles for CI tests. There could be one Dockerfile with ARG declared to distinguish different pip extras. However, ARG will bust cache of 'pip install', which usually takes long time, when build docker image. Instead, we have two almost identical Dockerfiles here to improve CI efficiency.
+Dockerfiles for users to conveniently build containers:
+1. Dockerfile.user is for users to build llm-on-ray with Docker on Intel CPU.
+2. Dockerfile.habana is for users to build llm-on-ray with Docker on Intel Gaudi (Habana) accelerators.
+
+Dockerfiles for CI tests are under 'ci/*'.
+In CI, the environments required by different models are kept separate, and the Dockerfiles for different purposes are distinguished by suffixes.
+
+There could be one Dockerfile with ARG declared to distinguish different pip extras. However, ARG would bust the cache of 'pip install', which usually takes a long time when building a Docker image. Instead, we have two almost identical Dockerfiles here to improve CI efficiency.
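For example, the CI workflows select a Dockerfile by suffix roughly like this (a sketch mirroring the `build_and_prune` function in dev/scripts/ci-functions.sh and the workflow changes above):
```bash
DF_SUFFIX=".vllm"   # e.g. picks dev/docker/ci/Dockerfile.vllm
docker build ./ --build-arg CACHEBUST=1 -f dev/docker/ci/Dockerfile${DF_SUFFIX} -t ${TARGET}:latest
```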
32 changes: 32 additions & 0 deletions dev/docker/ci/Dockerfile.habana
@@ -0,0 +1,32 @@
FROM vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest

ENV LANG=en_US.UTF-8

WORKDIR /root/llm-on-ray

COPY ./pyproject.toml .
COPY ./MANIFEST.in .

# create llm_on_ray package directory to bypass the following 'pip install -e' command
RUN mkdir ./llm_on_ray

RUN pip install -e . && \
pip install --upgrade-strategy eager optimum[habana] && \
pip install git+https://github.com/HabanaAI/[email protected]

# Optional. Comment out if you are not using the UI
COPY ./dev/scripts/install-ui.sh /tmp

RUN /tmp/install-ui.sh

RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
service ssh restart

ENV no_proxy=localhost,127.0.0.1

# Required by DeepSpeed
ENV RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES=1

ENV PT_HPU_LAZY_ACC_PAR_MODE=0

ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
4 files renamed without changes.
4 changes: 2 additions & 2 deletions dev/scripts/ci-functions.sh
@@ -26,10 +26,10 @@ build_and_prune() {
fi

echo "Build Docker image and perform cleaning operation"
-echo "docker build ./ ${docker_args[@]} -f dev/docker/Dockerfile${DF_SUFFIX} -t ${TARGET}:latest && yes | docker container prune && yes | docker image prune -f"
+echo "docker build ./ ${docker_args[@]} -f dev/docker/ci/Dockerfile${DF_SUFFIX} -t ${TARGET}:latest && yes | docker container prune && yes | docker image prune -f"

# Build Docker image and perform cleaning operation
-docker build ./ "${docker_args[@]}" -f dev/docker/Dockerfile${DF_SUFFIX} -t ${TARGET}:latest && yes | docker container prune && yes
+docker build ./ "${docker_args[@]}" -f dev/docker/ci/Dockerfile${DF_SUFFIX} -t ${TARGET}:latest && yes | docker container prune && yes
docker image prune -f

}
32 changes: 32 additions & 0 deletions dev/scripts/entrypoint_user.sh
@@ -0,0 +1,32 @@
#!/bin/bash
set -e

# Check if an environment variable exists and print its value
if [ -n "$hf_token" ]; then
echo "The hf_token environment variable is: $hf_token"
# Execute Hugging Face CLI login command
huggingface-cli login --token "${hf_token}"
else
echo "Environment variable 'hf_token' is not set."
fi

# Default serve cmd
if ! pgrep -f 'ray'; then
echo "Ray is not running. Starting Ray..."
# start Ray
ray start --head
echo "Ray started."
else
echo "Ray is already running."
fi

if [ -n "$model_name" ]; then
echo "Using User Model: $model_name"
llm_on_ray-serve --models $model_name
else
echo "Using Default Model: gpt2"
llm_on_ray-serve --config_file llm_on_ray/inference/models/gpt2.yaml
fi

# Keep the container running so that the service does not exit
tail -f /dev/null
66 changes: 66 additions & 0 deletions dev/scripts/start_with_docker.sh
@@ -0,0 +1,66 @@
#!/usr/bin/env bash
set -eo pipefail

##Set Your proxy and cache path here
HTTP_PROXY='Your proxy'
HTTPS_PROXY='Your proxy'
HF_TOKEN='Your hf_token'
code_checkout_path='If you need to use the modified llm-on-ray repository, define your path here'
model_cache_path='If you need to use huggingface model cache, define your path here'
MODEL_CACHE_PATH_LOACL='/root/.cache/huggingface/hub'
CODE_CHECKOUT_PATH_LOCAL='/root/llm-on-ray'


build_docker() {
local DOCKER_NAME=$1

docker_args=()
docker_args+=("--build-arg=CACHEBUST=1")
if [ "$DOCKER_NAME" == "vllm" ]; then
docker_args+=("--build-arg=DOCKER_NAME=".vllm"")
docker_args+=("--build-arg=PYPJ="vllm"")
elif [ "$DOCKER_NAME" == "ipex-llm" ]; then
docker_args+=("--build-arg=DOCKER_NAME=".ipex-llm"")
docker_args+=("--build-arg=PYPJ="ipex-llm"")
else
docker_args+=("--build-arg=DOCKER_NAME=".cpu_and_deepspeed"")
docker_args+=("--build-arg=PYPJ="cpu,deepspeed"")
fi

# # If you need to use proxy,activate the following two lines
# docker_args+=("--build-arg=http_proxy=${HTTP_PROXY}")
# docker_args+=("--build-arg=https_proxy=${HTTPS_PROXY}")


echo "Build Docker image and perform cleaning operation"
echo "docker build ./ ${docker_args[@]} -f dev/docker/Dockerfile.user -t serving:latest"

# Build Docker image and perform cleaning operation
docker build ./ "${docker_args[@]}" -f dev/docker/Dockerfile.user -t serving:latest

}

start_docker() {
local MODEL_NAME=$1

docker_args=()
docker_args+=("--name=serving" )
docker_args+=("-e=hf_token=${HF_TOKEN}")
if [ -z "$MODEL_NAME" ]; then
echo "use default model"
else
docker_args+=("-e=model_name=${MODEL_NAME}")
fi

# # If you need to use proxy,activate the following two lines
# docker_args+=("-e=http_proxy=${HTTP_PROXY}")
# docker_args+=("-e=https_proxy=${HTTPS_PROXY}")

# # If you need to use the modified llm-on-ray repository or huggingface model cache, activate the corresponding row
# docker_args+=("-v=${code_checkout_path}:${CODE_CHECKOUT_PATH_LOCAL}")
# docker_args+=("-v=${model_cache_path}:${MODEL_CACHE_PATH_LOACL}")

echo "docker run -tid "${docker_args[@]}" "serving:latest""
docker run -tid "${docker_args[@]}" "serving:latest"

}
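A typical end-to-end use of this script, matching the "Getting Started With Docker" section of the README above (the model name is just one example of a supported model):
```bash
source dev/scripts/start_with_docker.sh
build_docker vllm                 # build the serving image with vLLM
start_docker llama-2-7b-chat-hf   # run the container and serve the model
```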
67 changes: 67 additions & 0 deletions dev/scripts/start_with_docker_test.sh
@@ -0,0 +1,67 @@
#!/usr/bin/env bash
set -eo pipefail

##Set Your proxy and cache path here
HTTP_PROXY='http://10.24.221.169:911'
HTTPS_PROXY='http://10.24.221.169:911'
HF_TOKEN='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'
code_checkout_path='/home/yutianchen/Project/pr_lib/llm-on-ray'
model_cache_path='/home/yutianchen/.cache/huggingface/hub'
MODEL_CACHE_PATH_LOACL='/root/.cache/huggingface/hub'
CODE_CHECKOUT_PATH_LOCAL='/root/llm-on-ray'


build_docker() {
local DOCKER_NAME=$1

docker_args=()
docker_args+=("--build-arg=CACHEBUST=1")
if [ "$DOCKER_NAME" == "vllm" ]; then
docker_args+=("--build-arg=DOCKER_NAME=".vllm"")
docker_args+=("--build-arg=PYPJ="vllm"")
elif [ "$DOCKER_NAME" == "ipex-llm" ]; then
docker_args+=("--build-arg=DOCKER_NAME=".ipex-llm"")
docker_args+=("--build-arg=PYPJ="ipex-llm"")
else
docker_args+=("--build-arg=DOCKER_NAME=".cpu_and_deepspeed"")
docker_args+=("--build-arg=PYPJ="cpu,deepspeed"")
fi

# # If you need to use proxy,activate the following two lines
docker_args+=("--build-arg=http_proxy=${HTTP_PROXY}")
docker_args+=("--build-arg=https_proxy=${HTTPS_PROXY}")


echo "Build Docker image and perform cleaning operation"
echo "docker build ./ ${docker_args[@]} -f dev/docker/Dockerfile.user -t serving:latest"

# Build Docker image and perform cleaning operation
# docker build ./ "${docker_args[@]}" -f dev/docker/Dockerfile.user -t serving:latest

}

start_docker() {
local MODEL_NAME=$1

docker_args=()
docker_args+=("--name=serving" )

docker_args+=("-e=hf_token=${HF_TOKEN}")
if [ -z "$MODEL_NAME" ]; then
echo "use default model"
else
docker_args+=("-e=model_name=${MODEL_NAME}")
fi

# # If you need to use proxy,activate the following two lines
docker_args+=("-e=http_proxy=${HTTP_PROXY}")
docker_args+=("-e=https_proxy=${HTTPS_PROXY}")

# # If you need to use the modified llm-on-ray repository or huggingface model cache, activate the corresponding row
docker_args+=("-v=${code_checkout_path}:${CODE_CHECKOUT_PATH_LOCAL}")
docker_args+=("-v=${model_cache_path}:${MODEL_CACHE_PATH_LOACL}")

echo "docker run -tid "${docker_args[@]}" "serving:latest""
docker run -tid "${docker_args[@]}" "serving:latest"

}