[CI] Add llama2-70b inference workflow (#208)
* add llama-2-70b

* nit

* fix vllm inference ci

* Revert "fix vllm inference ci"

This reverts commit 36062bd.
Deegue authored May 10, 2024
1 parent e093eb8 commit cc1556d
Showing 1 changed file with 13 additions and 1 deletion: .github/workflows/workflow_inference_gaudi2.yml
@@ -28,7 +28,7 @@ jobs:
     name: inference
     strategy:
       matrix:
-        model: [ llama-2-7b-chat-hf ]
+        model: [ llama-2-7b-chat-hf, llama-2-70b-chat-hf ]
         isPR:
           - ${{inputs.ci_type == 'pr'}}

@@ -37,6 +37,7 @@ jobs:

         include:
           - { model: "llama-2-7b-chat-hf"}
+          - { model: "llama-2-70b-chat-hf"}

     runs-on: gaudi2

@@ -60,6 +61,8 @@ jobs:
           target="inference"
           if [[ ${{ matrix.model }} == "llama-2-7b-chat-hf" ]]; then
             target="${target}_gaudi2"
+          elif [[ ${{ matrix.model }} == "llama-2-70b-chat-hf" ]]; then
+            target="${target}_gaudi2"
           fi
           echo "target is ${target}"
           echo "target=$target" >> $GITHUB_OUTPUT
@@ -105,11 +108,20 @@ jobs:
     result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}"
 with open(conf_path, 'w') as output:
     yaml.dump(result, output, sort_keys=False)
+elif ("${{ matrix.model }}" == "llama-2-70b-chat-hf"):
+    conf_path = "llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml"
+    with open(conf_path, encoding="utf-8") as reader:
+        result = yaml.load(reader, Loader=yaml.FullLoader)
+        result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}"
+    with open(conf_path, 'w') as output:
+        yaml.dump(result, output, sort_keys=False)
 EOF
           )
           docker exec "${TARGET}" python -c "$CMD"
           if [[ ${{ matrix.model }} == "llama-2-7b-chat-hf" ]]; then
             docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml --simple"
+          elif [[ ${{ matrix.model }} == "llama-2-70b-chat-hf" ]]; then
+            docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml --simple"
           fi
           echo Non-streaming query:
           docker exec "${TARGET}" bash -c "python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/${{ matrix.model }}"