diff --git a/.github/workflows/config/mpt_deltatuner.yaml b/.github/workflows/config/mpt_deltatuner.yaml
index d9a41398d..9671329b4 100644
--- a/.github/workflows/config/mpt_deltatuner.yaml
+++ b/.github/workflows/config/mpt_deltatuner.yaml
@@ -5,7 +5,7 @@ cpus_per_worker: 24
 gpus_per_worker: 0
 deepspeed: false
 workers_per_group: 2
-device: "cpu"
+device: CPU
 ipex:
   enabled: true
   precision: bf16
diff --git a/.github/workflows/config/mpt_deltatuner_deepspeed.yaml b/.github/workflows/config/mpt_deltatuner_deepspeed.yaml
index 227f79cc1..bbe168cb2 100644
--- a/.github/workflows/config/mpt_deltatuner_deepspeed.yaml
+++ b/.github/workflows/config/mpt_deltatuner_deepspeed.yaml
@@ -5,7 +5,7 @@ cpus_per_worker: 24
 gpus_per_worker: 0
 deepspeed: true
 workers_per_group: 2
-device: "cpu"
+device: CPU
 ipex:
   enabled: false
   precision: bf16
diff --git a/docs/serve.md b/docs/serve.md
index 28a8c9acf..6bf655b19 100644
--- a/docs/serve.md
+++ b/docs/serve.md
@@ -12,17 +12,17 @@ We provide preconfigured yaml files in [inference/models](../inference/models) f
 To deploy on CPU, please make sure `device` is set to CPU and `cpus_per_worker` is set to a correct number.
 ```
 cpus_per_worker: 24
-device: "cpu"
+device: CPU
 ```
 To deploy on GPU, please make sure `device` is set to GPU and `gpus_per_worker` is set to 1.
 ```
 gpus_per_worker: 1
-device: "gpu"
+device: GPU
 ```
 To deploy on Gaudi, please make sure `device` is set to hpu and `hpus_per_worker` is set to 1.
 ```
 hpus_per_worker: 1
-device: "hpu"
+device: HPU
 ```
 
 LLM-on-Ray also supports serving with [Deepspeed](serve_deepspeed.md) for AutoTP and [BigDL-LLM](serve_bigdl.md) for INT4/FP4/INT8/FP8 to reduce latency. You can follow the corresponding documents to enable them.
diff --git a/inference/inference_config.py b/inference/inference_config.py
index 9b2a434df..812d579f4 100644
--- a/inference/inference_config.py
+++ b/inference/inference_config.py
@@ -106,8 +106,8 @@ def _check_port(cls, v: int):
     @validator('device')
     def _check_device(cls, v: str):
         if v:
-            assert v in [DEVICE_CPU, DEVICE_XPU, DEVICE_CUDA, DEVICE_HPU]
-            return v
+            assert v.lower() in [DEVICE_CPU, DEVICE_XPU, DEVICE_CUDA, DEVICE_HPU]
+            return v.lower()
 
     @validator('workers_per_group')
     def _check_workers_per_group(cls, v: int):
diff --git a/inference/models/bigdl/mistral-7b-v0.1-bigdl.yaml b/inference/models/bigdl/mistral-7b-v0.1-bigdl.yaml
index 6da907c24..4e367efcb 100644
--- a/inference/models/bigdl/mistral-7b-v0.1-bigdl.yaml
+++ b/inference/models/bigdl/mistral-7b-v0.1-bigdl.yaml
@@ -5,7 +5,7 @@ cpus_per_worker: 24
 gpus_per_worker: 0
 deepspeed: false
 workers_per_group: 2
-device: "cpu"
+device: CPU
 ipex:
   enabled: false
   precision: bf16
diff --git a/inference/models/bigdl/mpt-7b-bigdl.yaml b/inference/models/bigdl/mpt-7b-bigdl.yaml
index f306c0507..db5316b98 100644
--- a/inference/models/bigdl/mpt-7b-bigdl.yaml
+++ b/inference/models/bigdl/mpt-7b-bigdl.yaml
@@ -5,7 +5,7 @@ cpus_per_worker: 24
 gpus_per_worker: 0
 deepspeed: false
 workers_per_group: 2
-device: "cpu"
+device: CPU
 ipex:
   enabled: false
   precision: bf16
diff --git a/inference/models/bloom-560m.yaml b/inference/models/bloom-560m.yaml
index 43f63cb62..90e5e14af 100644
--- a/inference/models/bloom-560m.yaml
+++ b/inference/models/bloom-560m.yaml
@@ -5,7 +5,7 @@ cpus_per_worker: 10
 gpus_per_worker: 0
 deepspeed: false
 workers_per_group: 2
-device: "cpu"
+device: CPU
 ipex:
   enabled: true
   precision: bf16
diff --git a/inference/models/gpt2.yaml b/inference/models/gpt2.yaml
index 617c8a64d..17b354d39 100644
--- a/inference/models/gpt2.yaml
+++ b/inference/models/gpt2.yaml
@@ -5,7 +5,7 @@ cpus_per_worker: 24
 gpus_per_worker: 0
 deepspeed: false
 workers_per_group: 2
-device: "cpu"
+device: CPU
 ipex:
   enabled: true
   precision: bf16
diff --git a/inference/models/mistral-7b-v0.1.yaml b/inference/models/mistral-7b-v0.1.yaml
index 60ad1c602..8cd3750a7 100644
--- a/inference/models/mistral-7b-v0.1.yaml
+++ b/inference/models/mistral-7b-v0.1.yaml
@@ -5,7 +5,7 @@ cpus_per_worker: 24
 gpus_per_worker: 0
 deepspeed: false
 workers_per_group: 2
-device: "cpu"
+device: CPU
 ipex:
   enabled: true
   precision: bf16
diff --git a/inference/models/mpt-7b.yaml b/inference/models/mpt-7b.yaml
index b0b2ac7b9..979762313 100644
--- a/inference/models/mpt-7b.yaml
+++ b/inference/models/mpt-7b.yaml
@@ -5,7 +5,7 @@ cpus_per_worker: 24
 gpus_per_worker: 0
 deepspeed: false
 workers_per_group: 2
-device: "cpu"
+device: CPU
 ipex:
   enabled: true
   precision: bf16
diff --git a/inference/models/neural-chat-7b-v3-1.yaml b/inference/models/neural-chat-7b-v3-1.yaml
index db213b3c8..25427b660 100644
--- a/inference/models/neural-chat-7b-v3-1.yaml
+++ b/inference/models/neural-chat-7b-v3-1.yaml
@@ -5,7 +5,7 @@ cpus_per_worker: 24
 gpus_per_worker: 0
 deepspeed: false
 workers_per_group: 2
-device: "cpu"
+device: CPU
 ipex:
   enabled: false
   precision: bf16
diff --git a/inference/models/opt-125m.yaml b/inference/models/opt-125m.yaml
index c8f40aa04..5ff98cb4d 100644
--- a/inference/models/opt-125m.yaml
+++ b/inference/models/opt-125m.yaml
@@ -5,7 +5,7 @@ cpus_per_worker: 24
 gpus_per_worker: 0
 deepspeed: false
 workers_per_group: 2
-device: "cpu"
+device: CPU
 ipex:
   enabled: false
   precision: bf16
diff --git a/inference/models/template/inference_config_template.yaml b/inference/models/template/inference_config_template.yaml
index 7a8a18507..f8b02843c 100644
--- a/inference/models/template/inference_config_template.yaml
+++ b/inference/models/template/inference_config_template.yaml
@@ -6,7 +6,7 @@ gpus_per_worker: 0
 hpus_per_worker: 0
 deepspeed: false
 workers_per_group: 2
-device: cpu
+device: CPU
 ipex:
   enabled: true
   precision: bf16
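
Below is a minimal sketch, not part of the change set, of how the updated `_check_device` validator behaves once this diff is applied: `device` values written in the YAML configs (e.g. `CPU`, `GPU`, `HPU`) are lowercased before being checked and stored. The trimmed-down config class and the `DEVICE_*` constant values here are assumptions for illustration; the real definitions live in `inference/inference_config.py`.

```python
# Illustration only: a trimmed-down config model mirroring the validator
# changed in inference/inference_config.py. The constant values below are
# assumed; the repository defines its own DEVICE_* constants.
from pydantic import BaseModel, validator

DEVICE_CPU = "cpu"
DEVICE_XPU = "xpu"
DEVICE_CUDA = "cuda"
DEVICE_HPU = "hpu"

class DemoInferenceConfig(BaseModel):
    device: str = DEVICE_CPU

    @validator("device")
    def _check_device(cls, v: str):
        if v:
            # Accept any casing from the YAML (e.g. "CPU", "cpu", "Cpu")
            # and normalize to lowercase for downstream comparisons.
            assert v.lower() in [DEVICE_CPU, DEVICE_XPU, DEVICE_CUDA, DEVICE_HPU]
            return v.lower()
        return v

print(DemoInferenceConfig(device="CPU").device)  # -> cpu
print(DemoInferenceConfig(device="HPU").device)  # -> hpu
```

With this normalization in place, the preconfigured YAML files can use the uppercase spelling (`device: CPU`) shown in docs/serve.md while internal comparisons continue to use the lowercase constants.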