[Bigdl] Change all "bigdl" to "ipexllm" (#189)
* bigdl2ipexllm

* bigdl2ipexllm

* bigdl2ipexllm

* fix original name
yutianchen666 authored Apr 17, 2024
1 parent 3670edf commit 69f9824
Showing 13 changed files with 148 additions and 30 deletions.
22 changes: 11 additions & 11 deletions .github/workflows/workflow_inference.yml
@@ -34,7 +34,7 @@ jobs:
name: inference
strategy:
matrix:
model: [ gpt-j-6b, gpt2, bloom-560m, opt-125m, mpt-7b, mistral-7b-v0.1, mpt-7b-bigdl, neural-chat-7b-v3-1, CodeLlama-7b-hf, falcon-7b, starcoder, llama-2-7b-chat-hf, llama-2-7b-chat-hf-vllm, gemma-2b]
model: [ gpt-j-6b, gpt2, bloom-560m, opt-125m, mpt-7b, mistral-7b-v0.1, mpt-7b-ipex-llm, neural-chat-7b-v3-1, CodeLlama-7b-hf, falcon-7b, starcoder, llama-2-7b-chat-hf, llama-2-7b-chat-hf-vllm, gemma-2b]
isPR:
- ${{inputs.ci_type == 'pr'}}

@@ -44,7 +44,7 @@ jobs:
include:
- { model: "gpt-j-6b"}
- { model: "mistral-7b-v0.1"}
- { model: "mpt-7b-bigdl"}
- { model: "mpt-7b-ipex-llm"}
- { model: "llama-2-7b-chat-hf-vllm"}
- { model: "gemma-2b"}
- dtuner_model: nathan0/mpt-7b-deltatuner-model
@@ -76,8 +76,8 @@ jobs:
id: "target"
run: |
target="inference"
if [[ ${{ matrix.model }} == "mpt-7b-bigdl" ]]; then
target="${target}_bigdl_cpu"
if [[ ${{ matrix.model }} == "mpt-7b-ipex-llm" ]]; then
target="${target}_ipex-llm"
elif [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
target="${target}_vllm"
fi
@@ -86,8 +86,8 @@ jobs:
- name: Build Docker Image
run: |
if [[ ${{ matrix.model }} == "mpt-7b-bigdl" ]]; then
DF_SUFFIX=".bigdl-cpu"
if [[ ${{ matrix.model }} == "mpt-7b-ipex-llm" ]]; then
DF_SUFFIX=".ipex-llm"
elif [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
DF_SUFFIX=".vllm"
elif [[ ${{ matrix.model }} == "gpt-j-6b" ]]; then
@@ -144,8 +144,8 @@ jobs:
EOF
)
docker exec "${TARGET}" python -c "$CMD"
if [[ ${{ matrix.model }} == "mpt-7b-bigdl" ]]; then
docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/bigdl/mpt-7b-bigdl.yaml --simple"
if [[ ${{ matrix.model }} == "mpt-7b-ipex-llm" ]]; then
docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/ipex-llm/mpt-7b-ipex-llm.yaml --simple"
elif [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file .github/workflows/config/llama-2-7b-chat-hf-vllm-fp32.yaml --simple"
else
@@ -191,8 +191,8 @@ jobs:
- name: Run Inference Test with REST API
run: |
TARGET=${{steps.target.outputs.target}}
if [[ ${{ matrix.model }} == "mpt-7b-bigdl" ]]; then
docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/bigdl/mpt-7b-bigdl.yaml"
if [[ ${{ matrix.model }} == "mpt-7b-ipex-llm" ]]; then
docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/ipex-llm/mpt-7b-ipex-llm.yaml"
elif [[ ! ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
docker exec "${TARGET}" bash -c "llm_on_ray-serve --models ${{ matrix.model }}"
docker exec "${TARGET}" bash -c "python examples/inference/api_server_openai/query_http_requests.py --model_name ${{ matrix.model }}"
@@ -219,4 +219,4 @@ jobs:
run: |
TARGET=${{steps.target.outputs.target}}
cid=$(docker ps -q --filter "name=${TARGET}")
if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi
if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi
4 changes: 2 additions & 2 deletions README.md
@@ -5,7 +5,7 @@ LLM-on-Ray is a comprehensive solution designed to empower users in building, cu

LLM-on-Ray harnesses the power of Ray, an industry-leading framework for distributed computing, to scale your AI workloads efficiently. This integration ensures robust fault tolerance and cluster resource management, making your LLM projects more resilient and scalable.

LLM-on-Ray is built to operate across various hardware setups, including Intel CPU, Intel GPU and Intel Gaudi2. It incorporates several industry and Intel optimizations to maximize performance, including [vLLM](https://github.com/vllm-project/vllm), [llama.cpp](https://github.com/ggerganov/llama.cpp), [Intel Extension for PyTorch](https://github.com/intel/intel-extension-for-pytorch)/[DeepSpeed](https://github.com/intel/intel-extension-for-deepspeed), [BigDL-LLM](https://github.com/intel-analytics/BigDL), [RecDP-LLM](https://github.com/intel/e2eAIOK/tree/main/RecDP/pyrecdp/LLM), [NeuralChat](https://huggingface.co/Intel/neural-chat-7b-v3-1) and more.
LLM-on-Ray is built to operate across various hardware setups, including Intel CPU, Intel GPU and Intel Gaudi2. It incorporates several industry and Intel optimizations to maximize performance, including [vLLM](https://github.com/vllm-project/vllm), [llama.cpp](https://github.com/ggerganov/llama.cpp), [Intel Extension for PyTorch](https://github.com/intel/intel-extension-for-pytorch)/[DeepSpeed](https://github.com/intel/intel-extension-for-deepspeed), [IPEX-LLM](https://github.com/intel-analytics/ipex-llm), [RecDP-LLM](https://github.com/intel/e2eAIOK/tree/main/RecDP/pyrecdp/LLM), [NeuralChat](https://huggingface.co/Intel/neural-chat-7b-v3-1) and more.

## Solution Technical Overview
LLM-on-Ray's modular workflow structure is designed to comprehensively cater to the various stages of LLM development, from pretraining and finetuning to serving. These workflows are intuitive, highly configurable, and tailored to meet the specific needs of each phase in the LLM lifecycle:
@@ -115,7 +115,7 @@ The following are detailed guidelines for pretraining, finetuning and serving LL
### Serving
* [Deploy and Serve LLMs on Intel CPU/GPU/Gaudi](docs/serve.md)
* [Deploy and Serve LLMs with DeepSpeed](docs/serve_deepspeed.md)
* [Deploy and Serve LLMs with BigDL-LLM](docs/serve_bigdl.md)
* [Deploy and Serve LLMs with IPEX-LLM](docs/serve_ipex-llm.md)

### Web UI
* [Finetune and Deploy LLMs through Web UI](docs/web_ui.md)
39 changes: 39 additions & 0 deletions dev/docker/Dockerfile.ipex-llm
@@ -0,0 +1,39 @@
# syntax=docker/dockerfile:1
FROM ubuntu:22.04

ENV LANG C.UTF-8

WORKDIR /root/llm-on-ray

RUN --mount=type=cache,target=/var/cache/apt apt-get update -y \
&& apt-get install -y build-essential cmake wget curl git vim htop ssh net-tools \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

ENV CONDA_DIR /opt/conda
RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \
/bin/bash ~/miniconda.sh -b -p /opt/conda
ENV PATH $CONDA_DIR/bin:$PATH

# setup env
SHELL ["/bin/bash", "--login", "-c"]

RUN --mount=type=cache,target=/opt/conda/pkgs conda init bash && \
unset -f conda && \
export PATH=$CONDA_DIR/bin/:${PATH} && \
conda config --add channels intel && \
conda install python==3.9

COPY ./pyproject.toml .
COPY ./MANIFEST.in .

# create llm_on_ray package directory to bypass the following 'pip install -e' command
RUN mkdir ./llm_on_ray

RUN --mount=type=cache,target=/root/.cache/pip pip install -e .[ipex-llm] --extra-index-url https://download.pytorch.org/whl/cpu \
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/

# Used to invalidate docker build cache with --build-arg CACHEBUST=$(date +%s)
ARG CACHEBUST=1
COPY ./dev/scripts/install-oneapi.sh /tmp
RUN /tmp/install-oneapi.sh
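For local experimentation, a minimal sketch of building this image follows, assuming the repository root as the build context; the image tag is an arbitrary choice, not one used by the CI (which derives its target from `DF_SUFFIX=".ipex-llm"` above).

```bash
# Hypothetical local build of the new IPEX-LLM Dockerfile; the tag name is an assumption.
docker build -f dev/docker/Dockerfile.ipex-llm -t llm-on-ray:ipex-llm \
  --build-arg CACHEBUST=$(date +%s) .
```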
2 changes: 1 addition & 1 deletion docs/serve.md
@@ -24,7 +24,7 @@ To deploy on Gaudi, please make sure `device` is set to hpu and `hpus_per_worker
hpus_per_worker: 1
device: hpu
```
LLM-on-Ray also supports serving with [Deepspeed](serve_deepspeed.md) for AutoTP and [BigDL-LLM](serve_bigdl.md) for INT4/FP4/INT8/FP8 to reduce latency. You can follow the corresponding documents to enable them.
LLM-on-Ray also supports serving with [Deepspeed](serve_deepspeed.md) for AutoTP and [IPEX-LLM](serve_ipex-llm.md) for INT4/FP4/INT8/FP8 to reduce latency. You can follow the corresponding documents to enable them.

## Serving
We support three methods to specify the models to be served, and they have the following priorities.
22 changes: 22 additions & 0 deletions docs/serve_ipex-llm.md
@@ -0,0 +1,22 @@
## Deploying and Serving LLMs with IPEX-LLM
[IPEX-LLM](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/index.html) is a library for running LLMs (large language models) on Intel XPU (from laptop to GPU to cloud) using INT4 with very low latency (for any PyTorch model).

The integration with IPEX-LLM currently only supports running on Intel CPU.

## Setup
Please follow [setup.md](setup.md) to set up the environment first. Additionally, you will need to install the IPEX-LLM dependencies as shown below.
```bash
pip install .[ipex-llm] --extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
```

## Configure Serving Parameters
Please follow the serving [document](serve.md#configure-deploying-parameters) for configuring the parameters. In the configuration file, you need to set `ipexllm` and `load_in_4bit` to true. Example configuration files for enabling IPEX-LLM are available [here](../inference/models/ipex-llm).

```yaml
ipexllm: true
config:
  load_in_4bit: true
```

## Deploy and Test
Please follow the serving [document](serve.md#deploy-the-model) for deploying and testing.
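
For a concrete end-to-end check, the sketch below strings together the same commands the CI workflow above runs inside its container, using the MPT-7B IPEX-LLM config added in this commit; it assumes you run from the repository root with the `ipex-llm` extra installed, and the model name matches the `name` field of that config.

```bash
# Serve the MPT-7B IPEX-LLM config in OpenAI-compatible mode.
llm_on_ray-serve --config_file llm_on_ray/inference/models/ipex-llm/mpt-7b-ipex-llm.yaml

# Query it; the script and --model_name flag are the ones used by the workflow above.
python examples/inference/api_server_openai/query_http_requests.py --model_name mpt-7b-ipex-llm
```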
8 changes: 4 additions & 4 deletions llm_on_ray/inference/inference_config.py
@@ -65,8 +65,8 @@ def _check_precision(cls, v: str):
return v


# for bigdl model
class BigDLModelConfig(BaseModel):
# for IPEX-LLM model
class IpexllmModelConfig(BaseModel):
load_in_low_bit: str = ""

@validator("load_in_low_bit")
@@ -98,8 +98,8 @@ class ModelDescription(BaseModel):
peft_model_id_or_path: Union[str, None] = None
peft_type: Union[str, None] = None

bigdl: bool = False
bigdl_config: BigDLModelConfig = BigDLModelConfig()
ipexllm: bool = False
ipexllm_config: IpexllmModelConfig = IpexllmModelConfig()

# only effective when device is hpu
use_hpu_graphs: bool = True
25 changes: 25 additions & 0 deletions llm_on_ray/inference/models/ipex-llm/mistral-7b-v0.1-ipex-llm.yaml
@@ -0,0 +1,25 @@
port: 8000
name: mistral-7b-v0.1-ipex-llm
route_prefix: /mistral-7b-v0.1-ipex-llm
num_replicas: 1
cpus_per_worker: 24
gpus_per_worker: 0
deepspeed: false
workers_per_group: 2
device: cpu
ipex:
enabled: false
precision: bf16
model_description:
model_id_or_path: mistralai/Mistral-7B-v0.1
ipexllm: true
tokenizer_name_or_path: mistralai/Mistral-7B-v0.1
chat_processor: ChatModelLLama
prompt:
intro: ''
human_id: '<s>[INST] {msg} [/INST]'
bot_id: ''
stop_words: []
config:
trust_remote_code: true
load_in_4bit: true
32 changes: 32 additions & 0 deletions llm_on_ray/inference/models/ipex-llm/mpt-7b-ipex-llm.yaml
@@ -0,0 +1,32 @@
port: 8000
name: mpt-7b-ipex-llm
route_prefix: /mpt-7b-ipex-llm
num_replicas: 1
cpus_per_worker: 24
gpus_per_worker: 0
deepspeed: false
workers_per_group: 2
device: cpu
ipex:
enabled: false
precision: bf16
model_description:
model_id_or_path: mosaicml/mpt-7b-chat
ipexllm: true
tokenizer_name_or_path: EleutherAI/gpt-neox-20b
chat_processor: ChatModelGptJ
prompt:
intro: 'Below is an instruction that describes a task, paired with an input that
provides further context. Write a response that appropriately completes the request.
'
human_id: '
### Instruction'
bot_id: '
### Response'
stop_words: []
config:
trust_remote_code: true
load_in_4bit: true
2 changes: 1 addition & 1 deletion llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml
@@ -11,7 +11,7 @@ ipex:
precision: bf16
model_description:
model_id_or_path: mistralai/Mistral-7B-Instruct-v0.2
bigdl: false
ipexllm: false
tokenizer_name_or_path: mistralai/Mistral-7B-Instruct-v0.2
chat_processor: ChatModelLLama
prompt:
2 changes: 1 addition & 1 deletion llm_on_ray/inference/models/mistral-7b-v0.1.yaml
@@ -12,7 +12,7 @@ ipex:
precision: bf16
model_description:
model_id_or_path: mistralai/Mistral-7B-v0.1
bigdl: false
ipexllm: false
tokenizer_name_or_path: mistralai/Mistral-7B-v0.1
chat_processor: ChatModelLLama
prompt:
Original file line number Diff line number Diff line change
@@ -13,7 +13,7 @@ ipex:
precision: bf16
model_description:
model_id_or_path: null
bigdl:: false
ipexllm: false
tokenizer_name_or_path: null
chat_processor: null
gpt_base_model: false
@@ -31,5 +31,5 @@ model_description:
trust_remote_code: false
use_auth_token: null
load_in_4bit: false
bigdl_config:
ipexllm_config:
load_in_low_bit: ''
12 changes: 6 additions & 6 deletions llm_on_ray/inference/transformer_predictor.py
@@ -36,16 +36,16 @@ def __init__(self, infer_conf: InferenceConfig):

# decide correct torch type for loading HF model
decide_torch_dtype(infer_conf, hf_config)
if model_desc.bigdl:
from bigdl.llm.transformers import (
AutoModelForCausalLM as BigDLAutoModelForCLM,
if model_desc.ipexllm:
from ipex_llm.transformers import (
AutoModelForCausalLM as IpexllmAutoModelForCLM,
)

bmodel_config = {}
bmodel_config.update(model_config.dict())
if model_desc.bigdl_config.load_in_low_bit:
bmodel_config.update(model_desc.bigdl_config.dict())
model = BigDLAutoModelForCLM.from_pretrained(
if model_desc.ipexllm_config.load_in_low_bit:
bmodel_config.update(model_desc.ipexllm_config.dict())
model = IpexllmAutoModelForCLM.from_pretrained(
model_desc.model_id_or_path,
config=hf_config,
low_cpu_mem_usage=True,
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -64,8 +64,8 @@ deepspeed = [
"deepspeed>=0.10.2, <0.11.2"
]

bigdl-cpu = [
"bigdl-llm[all]==2.5.0b20240222"
ipex-llm = [
"ipex-llm[all]==2.1.0b20240408"
]


