[Inference] Fix auth token and add models starcoder and llama2 #39

Merged: 33 commits (Feb 7, 2024). Changes shown from 18 commits.

Commits (33):
dc4895f  add starcoder and enable llama2 (Deegue, Jan 8, 2024)
cd1a0ef  nit (Deegue, Jan 8, 2024)
dacbab3  nit (Deegue, Jan 8, 2024)
35e4288  revert (Deegue, Jan 9, 2024)
e73cf55  add token (Deegue, Jan 9, 2024)
f809782  dedup (Deegue, Jan 9, 2024)
5a55e87  add token to from_pretrained (Deegue, Jan 9, 2024)
7c2f004  pass auth token to from_pretrained (Deegue, Jan 10, 2024)
1c48886  nit (Deegue, Jan 10, 2024)
d2651f8  add auth tokens (Deegue, Jan 10, 2024)
9ee82ff  Merge branch 'main' into add_starcoder (Deegue, Jan 15, 2024)
9f552ba  lint (Deegue, Jan 15, 2024)
462164e  Merge branch 'add_starcoder' of https://github.com/Deegue/llm-on-ray … (Deegue, Jan 15, 2024)
562913e  fix lint (Deegue, Jan 15, 2024)
2d3f7c6  Merge branch 'main' into add_starcoder (Deegue, Jan 15, 2024)
836b7f4  nit (Deegue, Jan 16, 2024)
0cb47c7  deepspeed not support starcoder (Deegue, Jan 16, 2024)
23e8b63  nit (Deegue, Jan 16, 2024)
77cedb1  remove from ci (Deegue, Jan 17, 2024)
85cb34d  remove direct auth token (Deegue, Jan 19, 2024)
8ebbcad  add back ci workflow temporarily (Deegue, Jan 19, 2024)
fee7f30  Merge branch 'main' into add_starcoder (Deegue, Jan 19, 2024)
ea9e0cc  Merge branch 'add_starcoder' of https://github.com/Deegue/llm-on-ray … (Deegue, Jan 19, 2024)
a3be1cd  remove from ci (Deegue, Jan 19, 2024)
9b7bff6  add load environment and enable 2 models again (Deegue, Jan 23, 2024)
1ede3bb  add dir (Deegue, Jan 24, 2024)
469acb5  add load environment and enable 2 models again (Deegue, Jan 23, 2024)
2e3b4e2  Merge branch 'add_starcoder' of https://github.com/Deegue/llm-on-ray … (Deegue, Jan 24, 2024)
7356b39  change proxy (Deegue, Jan 24, 2024)
f171099  revert proxy (Deegue, Jan 24, 2024)
5ff00ee  change proxy (Deegue, Jan 29, 2024)
c8a53e2  revert proxy (Deegue, Feb 5, 2024)
b1e78ad  remove 2 models from ci (Deegue, Feb 5, 2024)

8 changes: 5 additions & 3 deletions .github/workflows/workflow_inference.yml
@@ -34,7 +34,7 @@ jobs:
name: inference test
strategy:
matrix:
model: [ gpt-j-6b, gpt2, bloom-560m, opt-125m, mpt-7b, mistral-7b-v0.1, mpt-7b-bigdl, neural-chat-7b-v3-1, CodeLlama-7b-hf, falcon-7b ]
model: [ gpt-j-6b, gpt2, bloom-560m, opt-125m, mpt-7b, mistral-7b-v0.1, mpt-7b-bigdl, neural-chat-7b-v3-1, CodeLlama-7b-hf, falcon-7b, starcoder, llama-2-7b-chat-hf ]
isPR:
- ${{inputs.ci_type == 'pr'}}

@@ -45,6 +45,8 @@ jobs:
- { model: "gpt-j-6b"}
- { model: "mistral-7b-v0.1"}
- { model: "mpt-7b-bigdl"}
- { model: "starcoder"}
- { model: "llama-2-7b-chat-hf"}
- dtuner_model: nathan0/mpt-7b-deltatuner-model
model: mpt-7b

@@ -123,7 +125,7 @@ jobs:
- name: Run Inference Test with DeepSpeed
run: |
TARGET=${{steps.target.outputs.target}}
if [[ ${{ matrix.model }} =~ ^(gpt2|falcon-7b|mpt-7b.*)$ ]]; then
if [[ ${{ matrix.model }} =~ ^(gpt2|falcon-7b|starcoder|mpt-7b.*)$ ]]; then
echo ${{ matrix.model }} is not supported!
else
docker exec "${TARGET}" bash -c "python .github/workflows/config/update_inference_config.py --config_file inference/models/\"${{ matrix.model }}\".yaml --output_file \"${{ matrix.model }}\".yaml.deepspeed --deepspeed"
@@ -136,7 +138,7 @@ jobs:
if: ${{ matrix.dtuner_model }}
run: |
TARGET=${{steps.target.outputs.target}}
if [[ ${{ matrix.model }} =~ ^(gpt2|mpt-7b.*)$ ]]; then
if [[ ${{ matrix.model }} =~ ^(gpt2|falcon-7b|starcoder|mpt-7b.*)$ ]]; then
echo ${{ matrix.model }} is not supported!
else
docker exec "${TARGET}" bash -c "python inference/serve.py --config_file .github/workflows/config/mpt_deltatuner_deepspeed.yaml --serve_simple"
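For reference, both workflow steps above gate DeepSpeed on a bash regex over the model name. A minimal Python sketch of the same exclusion check, for readers who want to trace the pattern; the standalone function here is illustrative and not part of the repository:

import re

# Mirrors the bash pattern used in the "Run Inference Test with DeepSpeed" step above.
DEEPSPEED_UNSUPPORTED = re.compile(r"^(gpt2|falcon-7b|starcoder|mpt-7b.*)$")

def deepspeed_supported(model: str) -> bool:
    # Models matching the pattern are skipped (echoed as "not supported") in CI.
    return DEEPSPEED_UNSUPPORTED.match(model) is None

assert not deepspeed_supported("starcoder")       # newly excluded by this PR
assert deepspeed_supported("llama-2-7b-chat-hf")  # not matched by the pattern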
7 changes: 6 additions & 1 deletion inference/deepspeed_predictor.py
@@ -34,6 +34,7 @@ def __init__(self, infer_conf: InferenceConfig, pad_token_id, stopping_criteria)
model_desc.model_id_or_path,
torchscript=True,
trust_remote_code=model_config.trust_remote_code,
use_auth_token=infer_conf.model_description.config.use_auth_token,
)

# get correct torch type for loading HF model
@@ -49,7 +50,11 @@ def __init__(self, infer_conf: InferenceConfig, pad_token_id, stopping_criteria)
if model_desc.peft_model_id_or_path:
from peft import PeftModel

self.model = PeftModel.from_pretrained(self.model, model_desc.peft_model_id_or_path)
self.model = PeftModel.from_pretrained(
self.model,
model_desc.peft_model_id_or_path,
use_auth_token=infer_conf.model_description.config.use_auth_token,
)
if model_desc.peft_type == "deltatuner":
from deltatuner import DeltaTunerModel

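Both hunks above follow the same pattern: the token from the model YAML's config section is forwarded to each Hugging Face from_pretrained call so that gated repositories can be downloaded. A minimal standalone sketch of that flow, with placeholder values in place of the repo's InferenceConfig object (the base-model load is not part of the hunks above and is included here only to make the sketch complete):

from transformers import AutoConfig, AutoModelForCausalLM
from peft import PeftModel

# Placeholders standing in for infer_conf.model_description.* values.
model_id = "bigcode/starcoder"
peft_id = None          # set when peft_model_id_or_path points at an adapter repo
auth_token = "hf_..."   # value of model_description.config.use_auth_token

config = AutoConfig.from_pretrained(
    model_id, torchscript=True, trust_remote_code=False, use_auth_token=auth_token
)
model = AutoModelForCausalLM.from_pretrained(
    model_id, config=config, use_auth_token=auth_token
)
if peft_id:
    # Same idea as the second hunk: the adapter download also needs the token.
    model = PeftModel.from_pretrained(model, peft_id, use_auth_token=auth_token)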
2 changes: 1 addition & 1 deletion inference/models/llama-2-7b-chat-hf.yaml
@@ -7,7 +7,7 @@ deepspeed: false
workers_per_group: 2
device: "cpu"
ipex:
enabled: true
enabled: false
precision: bf16
model_description:
model_id_or_path: meta-llama/Llama-2-7b-chat-hf
22 changes: 22 additions & 0 deletions inference/models/starcoder.yaml
@@ -0,0 +1,22 @@
port: 8000
name: starcoder
route_prefix: /starcoder
cpus_per_worker: 24
gpus_per_worker: 0
deepspeed: false
workers_per_group: 2
ipex:
  enabled: false
  precision: bf16
device: "cpu"
model_description:
  model_id_or_path: bigcode/starcoder
  tokenizer_name_or_path: bigcode/starcoder
  chat_processor: ChatModelGptJ
  prompt:
    intro: ''
    human_id: ''
    bot_id: ''
    stop_words: []
  config:
    use_auth_token: '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'
Contributor:
use_auth_token cannot be written directly in the config YAML; it needs to be set in the CI file. @jiafuzha, please help confirm this.

Contributor:
Yes, strictly speaking we cannot. But it's only a read-only key; if it can pass the GitHub security check, I think we can leave it for now.

Contributor (author):
If our CI nodes have env.HF_ACCESS_TOKEN configured, I think I can read use_auth_token from the environment at runtime instead of passing it in the YAML directly.
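A hedged sketch of that approach; HF_ACCESS_TOKEN is the variable named above, while the helper and its fallback logic are illustrative rather than the repo's actual code:

import os
from typing import Optional

def resolve_auth_token(model_config: dict) -> Optional[str]:
    # Prefer an explicit use_auth_token from the model YAML, otherwise fall back to
    # the CI node's HF_ACCESS_TOKEN environment variable, so no secret has to be
    # committed to the repository.
    return model_config.get("use_auth_token") or os.environ.get("HF_ACCESS_TOKEN")

# Example: the starcoder config above, once the hard-coded token is removed.
token = resolve_auth_token({})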

Contributor:
Yes, that's better.

Maybe we can add a unit test later to verify that use_auth_token is passed correctly. This ticket exposed and fixed the token bug in several places, which shows the real value of CI.

Contributor (author):
OK, removed them from CI. Let's merge this first, and I will create a follow-up PR to hide the auth token.

3 changes: 2 additions & 1 deletion inference/predictor.py
@@ -9,7 +9,8 @@ class Predictor:
def __init__(self, infer_conf: InferenceConfig) -> None:
self.infer_conf = infer_conf
self.tokenizer = AutoTokenizer.from_pretrained(
infer_conf.model_description.tokenizer_name_or_path
infer_conf.model_description.tokenizer_name_or_path,
**infer_conf.model_description.config.dict(),
)
self.device = torch.device(infer_conf.device)
# now deepspeed predictor don't have the model
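The **infer_conf.model_description.config.dict() expansion above forwards every key of the YAML's config section as a keyword argument, so use_auth_token (and any key added later) reaches the tokenizer without further code changes. A small illustrative equivalent; the dictionary contents are assumptions based on the config sections used in this PR:

from transformers import AutoTokenizer

# Stand-in for infer_conf.model_description.config.dict().
config_kwargs = {
    "use_auth_token": "hf_...",   # or omit/None when the model is not gated
    "trust_remote_code": False,
}
tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoder", **config_kwargs)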
7 changes: 6 additions & 1 deletion inference/transformer_predictor.py
@@ -16,6 +16,7 @@ def __init__(self, infer_conf: InferenceConfig):
model_desc.model_id_or_path,
torchscript=True,
trust_remote_code=model_config.trust_remote_code,
use_auth_token=infer_conf.model_description.config.use_auth_token,
)

if self.device.type == "hpu":
@@ -53,7 +54,11 @@ def __init__(self, infer_conf: InferenceConfig):
if model_desc.peft_model_id_or_path:
from peft import PeftModel

model = PeftModel.from_pretrained(model, model_desc.peft_model_id_or_path)
model = PeftModel.from_pretrained(
model,
model_desc.peft_model_id_or_path,
use_auth_token=infer_conf.model_description.config.use_auth_token,
)
if model_desc.peft_type == "deltatuner":
from deltatuner import DeltaTunerModel
