From f9d0107b99a9e051e27e099f1bad290d7e970dad Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Tue, 23 Apr 2024 00:30:05 -0700
Subject: [PATCH 01/57] update unit test

---
 .github/actions/llm/setup-llm-env/action.yml  |  1 +
 .github/workflows/llm_unit_tests.yml          | 51 +++++++++++--------
 .../test/inference/test_transformers_api.py   |  4 +-
 .../test/langchain/test_transformers_api.py   |  2 +-
 .../llm/test/run-llm-inference-tests-gpu.sh   |  6 +--
 python/llm/test/run-llm-inference-tests.sh    |  4 --
 6 files changed, 38 insertions(+), 30 deletions(-)

diff --git a/.github/actions/llm/setup-llm-env/action.yml b/.github/actions/llm/setup-llm-env/action.yml
index 4b25ea0c401..4d0b7550f74 100644
--- a/.github/actions/llm/setup-llm-env/action.yml
+++ b/.github/actions/llm/setup-llm-env/action.yml
@@ -42,3 +42,4 @@ runs:
           pip install pytest
           bash python/llm/test/run-llm-install-tests.sh
         fi
+        pip install transformers==4.36.2
diff --git a/.github/workflows/llm_unit_tests.yml b/.github/workflows/llm_unit_tests.yml
index 5eb5b55e31a..5233a7f879c 100644
--- a/.github/workflows/llm_unit_tests.yml
+++ b/.github/workflows/llm_unit_tests.yml
@@ -77,6 +77,7 @@ jobs:
         run: |
           echo "DATASET_DIR=${{ github.workspace }}/../llm/datasets" >> "$GITHUB_ENV"
           echo "ORIGIN_DIR=${{ github.workspace }}/../llm/origin-models" >> "$GITHUB_ENV"
+          echo "ORIGIN_DIR_436=${{ github.workspace }}/../llm/origin-models-4.36" >> "$GITHUB_ENV"
           echo "INT4_CKPT_DIR=${{ github.workspace }}/../llm/converted-models" >> "$GITHUB_ENV"
       - name: Create model directories
         shell: bash
@@ -87,6 +88,9 @@ jobs:
           if [ ! -d $ORIGIN_DIR ]; then
             mkdir -p $ORIGIN_DIR
           fi
+          if [ ! -d ORIGIN_DIR_436 ]; then
+            mkdir -p ORIGIN_DIR_436
+          fi
           if [ ! -d $INT4_CKPT_DIR ]; then
             mkdir -p $INT4_CKPT_DIR
           fi
@@ -98,7 +102,7 @@ jobs:
 
           echo "LLAMA_ORIGIN_PATH=${ORIGIN_DIR}/llama-7b-hf" >> "$GITHUB_ENV"
           echo "BLOOM_ORIGIN_PATH=${ORIGIN_DIR}/bloom-7b1" >> "$GITHUB_ENV"
-          echo "ORIGINAL_CHATGLM2_6B_PATH=${ORIGIN_DIR}/chatglm2-6b" >> "$GITHUB_ENV"
+          echo "ORIGINAL_CHATGLM2_6B_PATH=${ORIGIN_DIR_436}/chatglm2-6b" >> "$GITHUB_ENV"
           echo "ORIGINAL_REPLIT_CODE_PATH=${ORIGIN_DIR}/replit-code-v1-3b" >> "$GITHUB_ENV"
           echo "ORIGINAL_WHISPER_TINY_PATH=${ORIGIN_DIR}/whisper-tiny" >> "$GITHUB_ENV"
           echo "MISTRAL_ORIGIN_PATH=${ORIGIN_DIR}/Mistral-7B-v0.1" >> "$GITHUB_ENV"
@@ -157,8 +161,8 @@ jobs:
           # fi
           if [ ! -d $ORIGINAL_CHATGLM2_6B_PATH ]; then
             echo "Directory $ORIGINAL_CHATGLM2_6B_PATH not found. Downloading from FTP server..."
-            echo "wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/chatglm2-6b -P $ORIGIN_DIR"            
-            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/chatglm2-6b -P $ORIGIN_DIR
+            echo "wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/updated_for_4.36/chatglm2-6b -P $ORIGIN_DIR"
+            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/updated_for_4.36/chatglm2-6b -P $ORIGIN_DIR_436
           fi
           if [ ! -d $ORIGINAL_REPLIT_CODE_PATH ]; then
             echo "Directory $ORIGINAL_REPLIT_CODE_PATH not found. Downloading from FTP server..."
@@ -210,23 +214,28 @@ jobs:
       - name: Run LLM cli test (Windows)
         if: runner.os == 'Windows' 
         uses: ./.github/actions/llm/cli-test-windows
-      - name: Run LLM inference test
+#      - name: Run LLM inference test
+#        shell: bash
+#        run: |
+#          python -m pip install einops datasets librosa openai-whisper
+#          bash python/llm/test/run-llm-inference-tests.sh
+      - name: Run LLM inference test for 4.36
         shell: bash
         run: |
           python -m pip install einops datasets librosa openai-whisper
           bash python/llm/test/run-llm-inference-tests.sh
-      - name: Run LLM langchain test
-        shell: bash
-        run: |
-          pip install -U langchain==0.0.184
-          pip install -U chromadb==0.3.25
-          pip install -U pandas==2.0.3
-          bash python/llm/test/run-llm-langchain-tests.sh
+#      - name: Run LLM langchain test
+#        shell: bash
+#        run: |
+#          pip install -U langchain==0.0.184
+#          pip install -U chromadb==0.3.25
+#          pip install -U pandas==2.0.3
+#          bash python/llm/test/run-llm-langchain-tests.sh
       - name: Run LLM llamaindex test
         shell: bash
         run: |
           pip install llama-index-readers-file llama-index-vector-stores-postgres llama-index-embeddings-huggingface
-          pip install transformers==4.36.0
+#          pip install transformers==4.36.0
           pip install "pydantic>=2.0.0"
           bash python/llm/test/run-llm-llamaindex-tests.sh
       - name: Run sentence-transformers uninstallation
@@ -255,12 +264,12 @@ jobs:
           echo "SPEECH_DATASET_PATH=${ORIGIN_DIR}/../datasets/librispeech_asr_dummy" >> "$GITHUB_ENV"
 
           echo "LLAMA2_7B_ORIGIN_PATH=${ORIGIN_DIR}/Llama-2-7b-chat-hf" >> "$GITHUB_ENV"
-          echo "CHATGLM2_6B_ORIGIN_PATH=${ORIGIN_DIR}/chatglm2-6b" >> "$GITHUB_ENV"
+          echo "CHATGLM2_6B_ORIGIN_PATH=${ORIGIN_DIR_436}/chatglm2-6b" >> "$GITHUB_ENV"
           echo "FALCON_7B_ORIGIN_PATH=${ORIGIN_DIR}/falcon-7b-instruct-with-patch" >> "$GITHUB_ENV"
-          echo "MPT_7B_ORIGIN_PATH=${ORIGIN_DIR}/mpt-7b-chat" >> "$GITHUB_ENV"
+          echo "MPT_7B_ORIGIN_PATH=${ORIGIN_DIR_436}/mpt-7b-chat" >> "$GITHUB_ENV"
           echo "WHISPER_TINY_ORIGIN_PATH=${ORIGIN_DIR}/whisper-tiny" >> "$GITHUB_ENV"
           echo "MISTRAL_7B_INSTRUCT_V0_1_ORIGIN_PATH=${ORIGIN_DIR}/Mistral-7B-Instruct-v0.1" >> "$GITHUB_ENV"
-          echo "BAICHUAN2_7B_ORIGIN_PATH=${ORIGIN_DIR}/Baichuan2-7B-Chat" >> "$GITHUB_ENV"
+          echo "BAICHUAN2_7B_ORIGIN_PATH=${ORIGIN_DIR_436}/Baichuan2-7B-Chat" >> "$GITHUB_ENV"
           echo "QWEN_7B_ORIGIN_PATH=${ORIGIN_DIR}/Qwen-7B-Chat" >> "$GITHUB_ENV"
           echo "VICUNA_7B_1_3_ORIGIN_PATH=${ORIGIN_DIR}/vicuna-7b-v1.3" >> "$GITHUB_ENV"
       - name: Checkout repo
@@ -310,7 +319,7 @@ jobs:
           fi
           if [ ! -d $CHATGLM2_6B_ORIGIN_PATH ]; then
             echo "Directory $CHATGLM2_6B_ORIGIN_PATH not found. Downloading from FTP server..."
-            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/chatglm2-6b -P $ORIGIN_DIR
+            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/updated_for_4.36//chatglm2-6b -P $ORIGIN_DIR_436
           fi
           if [ ! -d $FALCON_7B_ORIGIN_PATH ]; then
             echo "Directory $FALCON_7B_ORIGIN_PATH not found. Downloading from FTP server..."
@@ -318,7 +327,7 @@ jobs:
           fi
           if [ ! -d $MPT_7B_ORIGIN_PATH ]; then
             echo "Directory $MPT_7B_ORIGIN_PATH not found. Downloading from FTP server..."
-            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/mpt-7b-chat -P $ORIGIN_DIR
+            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/updated_for_4.36//mpt-7b-chat -P $ORIGIN_DIR_436
           fi
           if [ ! -d $WHISPER_TINY_ORIGIN_PATH ]; then
             echo "Directory $WHISPER_TINY_ORIGIN_PATH not found. Downloading from FTP server..."
@@ -345,7 +354,7 @@ jobs:
           fi
           if [ ! -d $BAICHUAN2_7B_ORIGIN_PATH ]; then
             echo "Directory $BAICHUAN2_7B_ORIGIN_PATH not found. Downloading from FTP server..."
-            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/Baichuan2-7B-Chat -P $ORIGIN_DIR
+            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/updated_for_4.36//Baichuan2-7B-Chat -P $ORIGIN_DIR_436
           fi
           if [ ! -d $VICUNA_7B_1_3_ORIGIN_PATH ]; then
             echo "Directory $VICUNA_7B_1_3_ORIGIN_PATH not found. Downloading from FTP server..."
@@ -363,8 +372,8 @@ jobs:
           fi
           python -m pip install datasets librosa soundfile einops tiktoken transformers_stream_generator
           bash python/llm/test/run-llm-inference-tests-gpu.sh
-          python -m pip install transformers==4.34.0 
-          bash python/llm/test/run-llm-inference-tests-gpu-434.sh
+#          python -m pip install transformers==4.34.0
+#          bash python/llm/test/run-llm-inference-tests-gpu-434.sh
 
       - name: Run LLM example tests
         shell: bash
@@ -410,7 +419,7 @@ jobs:
             pip install --pre --upgrade ipex-llm[xpu_2.0] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
             source /home/arda/intel/oneapi/setvars.sh
           fi
-          pip install transformers==4.36.0
+#          pip install transformers==4.36.0
           pip install "pydantic>=2.0.0"
           bash python/llm/test/run-llm-llamaindex-tests-gpu.sh
       - name: Run sentence-transformers uninstallation
diff --git a/python/llm/test/inference/test_transformers_api.py b/python/llm/test/inference/test_transformers_api.py
index 1a72801cc1a..ea54c6a29cc 100644
--- a/python/llm/test/inference/test_transformers_api.py
+++ b/python/llm/test/inference/test_transformers_api.py
@@ -116,6 +116,7 @@ def test_transformers_chatglm_for_causallm(self):
     ])
 @pytest.mark.parametrize('Model, Tokenizer, model_path',[
     (AutoModel, AutoTokenizer, os.environ.get('ORIGINAL_CHATGLM2_6B_PATH')),
+    (AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_ORIGIN_PATH')),
     ])
 def test_load_low_bit_completion(Model, Tokenizer, model_path, prompt, answer):
     tokenizer = Tokenizer.from_pretrained(model_path, trust_remote_code=True)
@@ -143,7 +144,8 @@ def test_load_low_bit_completion(Model, Tokenizer, model_path, prompt, answer):
     (AutoModelForCausalLM, LlamaTokenizer, os.environ.get('LLAMA_ORIGIN_PATH'), prompt),
     (AutoModelForCausalLM, AutoTokenizer, os.environ.get('BLOOM_ORIGIN_PATH'), prompt),
     (AutoModel, AutoTokenizer, os.environ.get('ORIGINAL_CHATGLM2_6B_PATH'), prompt),
-    (AutoModelForCausalLM, AutoTokenizer, os.environ.get('ORIGINAL_REPLIT_CODE_PATH'), prompt)
+    (AutoModelForCausalLM, AutoTokenizer, os.environ.get('ORIGINAL_REPLIT_CODE_PATH'), prompt),
+    (AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_ORIGIN_PATH'), prompt)
 ])
     
 def test_optimize_model(Model, Tokenizer, model_path, prompt):
diff --git a/python/llm/test/langchain/test_transformers_api.py b/python/llm/test/langchain/test_transformers_api.py
index cbaaa1e0ba7..61d30051b6f 100644
--- a/python/llm/test/langchain/test_transformers_api.py
+++ b/python/llm/test/langchain/test_transformers_api.py
@@ -38,7 +38,7 @@
 class Test_Langchain_Transformers_API(TestCase):
     def setUp(self):
         self.auto_model_path = os.environ.get('ORIGINAL_CHATGLM2_6B_PATH')
-        self.auto_causal_model_path = os.environ.get('ORIGINAL_REPLIT_CODE_PATH')
+        # self.auto_causal_model_path = os.environ.get('ORIGINAL_REPLIT_CODE_PATH')
         self.llama_model_path = os.environ.get('LLAMA_ORIGIN_PATH')
         self.bloom_model_path = os.environ.get('BLOOM_ORIGIN_PATH')
         thread_num = os.environ.get('THREAD_NUM')
diff --git a/python/llm/test/run-llm-inference-tests-gpu.sh b/python/llm/test/run-llm-inference-tests-gpu.sh
index ea1abb519f4..5e48c0df876 100644
--- a/python/llm/test/run-llm-inference-tests-gpu.sh
+++ b/python/llm/test/run-llm-inference-tests-gpu.sh
@@ -21,9 +21,9 @@ pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api.py -v -s
 pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_layernorm.py -v -s
 export BIGDL_LLM_XMX_DISABLED=1
 pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_final_logits.py -v -s
-pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_attention.py -v -s -k "not Mistral"
-pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_mlp.py -v -s -k "not Mistral"
-pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_RMSNorm.py -v -s -k "not Mistral"
+pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_attention.py -v -s
+pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_mlp.py -v -s
+pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_RMSNorm.py -v -s
 unset BIGDL_LLM_XMX_DISABLED
 
 now=$(date "+%s")
diff --git a/python/llm/test/run-llm-inference-tests.sh b/python/llm/test/run-llm-inference-tests.sh
index e53528dbb56..d3c3c0690ef 100644
--- a/python/llm/test/run-llm-inference-tests.sh
+++ b/python/llm/test/run-llm-inference-tests.sh
@@ -18,10 +18,6 @@ export OMP_NUM_THREADS=$THREAD_NUM
 python -m pytest -s ${LLM_INFERENCE_TEST_DIR}/test_transformers_api.py -v
 python -m pytest -s ${LLM_INFERENCE_TEST_DIR}/test_optimize_model_api.py -v
 
-python -m pip install transformers==4.34.0
-python -m pytest -s ${LLM_INFERENCE_TEST_DIR}/test_transformesr_api_434.py -v
-python -m pip install transformers==4.31.0
-
 now=$(date "+%s")
 time=$((now-start))
 

From a86c35fd610dbb6f2947ddb0d4619fd3f3c2c886 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Tue, 23 Apr 2024 00:33:28 -0700
Subject: [PATCH 02/57] update

---
 .github/workflows/llm_unit_tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/llm_unit_tests.yml b/.github/workflows/llm_unit_tests.yml
index 5233a7f879c..7e2e796ba87 100644
--- a/.github/workflows/llm_unit_tests.yml
+++ b/.github/workflows/llm_unit_tests.yml
@@ -161,7 +161,7 @@ jobs:
           # fi
           if [ ! -d $ORIGINAL_CHATGLM2_6B_PATH ]; then
             echo "Directory $ORIGINAL_CHATGLM2_6B_PATH not found. Downloading from FTP server..."
-            echo "wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/updated_for_4.36/chatglm2-6b -P $ORIGIN_DIR"
+            echo "wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/updated_for_4.36/chatglm2-6b -P $ORIGIN_DIR_436"
             wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/updated_for_4.36/chatglm2-6b -P $ORIGIN_DIR_436
           fi
           if [ ! -d $ORIGINAL_REPLIT_CODE_PATH ]; then

From 8a6e0f24f6fea51add9c39fd7b3c6a47c8ba48cc Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Tue, 23 Apr 2024 00:38:39 -0700
Subject: [PATCH 03/57] update

---
 .../test/inference/test_transformers_api.py   | 36 +++++++++----------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/python/llm/test/inference/test_transformers_api.py b/python/llm/test/inference/test_transformers_api.py
index ea54c6a29cc..8b208bdd3c1 100644
--- a/python/llm/test/inference/test_transformers_api.py
+++ b/python/llm/test/inference/test_transformers_api.py
@@ -52,23 +52,23 @@ def test_transformers_auto_model_int4(self):
         res = 'Paris' in output_str        
         self.assertTrue(res)
 
-    def test_transformers_auto_model_for_causal_lm_int4(self):
-        model_path = os.environ.get('ORIGINAL_REPLIT_CODE_PATH')
-        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-        input_str = 'def hello():\n  print("hello world")\n'
-        model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, load_in_4bit=True)
-        with torch.inference_mode():
-            
-            st = time.time()
-            input_ids = tokenizer.encode(input_str, return_tensors="pt")
-            output = model.generate(input_ids, do_sample=False, max_new_tokens=32)
-            output_str = tokenizer.decode(output[0], skip_special_tokens=True)
-            end = time.time()
-        print('Prompt:', input_str)
-        print('Output:', output_str)
-        print(f'Inference time: {end-st} s')
-        res = '\nhello()' in output_str        
-        self.assertTrue(res)
+    # def test_transformers_auto_model_for_causal_lm_int4(self):
+    #     model_path = os.environ.get('ORIGINAL_REPLIT_CODE_PATH')
+    #     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    #     input_str = 'def hello():\n  print("hello world")\n'
+    #     model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, load_in_4bit=True)
+    #     with torch.inference_mode():
+    #
+    #         st = time.time()
+    #         input_ids = tokenizer.encode(input_str, return_tensors="pt")
+    #         output = model.generate(input_ids, do_sample=False, max_new_tokens=32)
+    #         output_str = tokenizer.decode(output[0], skip_special_tokens=True)
+    #         end = time.time()
+    #     print('Prompt:', input_str)
+    #     print('Output:', output_str)
+    #     print(f'Inference time: {end-st} s')
+    #     res = '\nhello()' in output_str
+    #     self.assertTrue(res)
         
 
     def test_transformers_auto_model_for_speech_seq2seq_int4(self):
@@ -144,7 +144,7 @@ def test_load_low_bit_completion(Model, Tokenizer, model_path, prompt, answer):
     (AutoModelForCausalLM, LlamaTokenizer, os.environ.get('LLAMA_ORIGIN_PATH'), prompt),
     (AutoModelForCausalLM, AutoTokenizer, os.environ.get('BLOOM_ORIGIN_PATH'), prompt),
     (AutoModel, AutoTokenizer, os.environ.get('ORIGINAL_CHATGLM2_6B_PATH'), prompt),
-    (AutoModelForCausalLM, AutoTokenizer, os.environ.get('ORIGINAL_REPLIT_CODE_PATH'), prompt),
+    # (AutoModelForCausalLM, AutoTokenizer, os.environ.get('ORIGINAL_REPLIT_CODE_PATH'), prompt),
     (AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_ORIGIN_PATH'), prompt)
 ])
     

From d658968e07dca5ad768a118a8405b724cc5cf327 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Tue, 23 Apr 2024 00:49:59 -0700
Subject: [PATCH 04/57] update

---
 .github/workflows/llm_unit_tests.yml | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/llm_unit_tests.yml b/.github/workflows/llm_unit_tests.yml
index 7e2e796ba87..c85cba9091f 100644
--- a/.github/workflows/llm_unit_tests.yml
+++ b/.github/workflows/llm_unit_tests.yml
@@ -224,18 +224,18 @@ jobs:
         run: |
           python -m pip install einops datasets librosa openai-whisper
           bash python/llm/test/run-llm-inference-tests.sh
-#      - name: Run LLM langchain test
-#        shell: bash
-#        run: |
-#          pip install -U langchain==0.0.184
-#          pip install -U chromadb==0.3.25
-#          pip install -U pandas==2.0.3
-#          bash python/llm/test/run-llm-langchain-tests.sh
+      # - name: Run LLM langchain test
+        # shell: bash
+        # run: |
+          # pip install -U langchain==0.0.184
+          # pip install -U chromadb==0.3.25
+          # pip install -U pandas==2.0.3
+          # bash python/llm/test/run-llm-langchain-tests.sh
       - name: Run LLM llamaindex test
         shell: bash
         run: |
           pip install llama-index-readers-file llama-index-vector-stores-postgres llama-index-embeddings-huggingface
-#          pip install transformers==4.36.0
+          # pip install transformers==4.36.0
           pip install "pydantic>=2.0.0"
           bash python/llm/test/run-llm-llamaindex-tests.sh
       - name: Run sentence-transformers uninstallation
@@ -372,8 +372,8 @@ jobs:
           fi
           python -m pip install datasets librosa soundfile einops tiktoken transformers_stream_generator
           bash python/llm/test/run-llm-inference-tests-gpu.sh
-#          python -m pip install transformers==4.34.0
-#          bash python/llm/test/run-llm-inference-tests-gpu-434.sh
+          # python -m pip install transformers==4.34.0
+          # bash python/llm/test/run-llm-inference-tests-gpu-434.sh
 
       - name: Run LLM example tests
         shell: bash
@@ -419,7 +419,7 @@ jobs:
             pip install --pre --upgrade ipex-llm[xpu_2.0] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
             source /home/arda/intel/oneapi/setvars.sh
           fi
-#          pip install transformers==4.36.0
+          # pip install transformers==4.36.0
           pip install "pydantic>=2.0.0"
           bash python/llm/test/run-llm-llamaindex-tests-gpu.sh
       - name: Run sentence-transformers uninstallation

From 66639dce4202e1f092a2c78c42c8e5846ac70471 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Tue, 23 Apr 2024 00:53:22 -0700
Subject: [PATCH 05/57] update

---
 .github/workflows/llm_unit_tests.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/llm_unit_tests.yml b/.github/workflows/llm_unit_tests.yml
index c85cba9091f..23b061b585f 100644
--- a/.github/workflows/llm_unit_tests.yml
+++ b/.github/workflows/llm_unit_tests.yml
@@ -214,11 +214,11 @@ jobs:
       - name: Run LLM cli test (Windows)
         if: runner.os == 'Windows' 
         uses: ./.github/actions/llm/cli-test-windows
-#      - name: Run LLM inference test
-#        shell: bash
-#        run: |
-#          python -m pip install einops datasets librosa openai-whisper
-#          bash python/llm/test/run-llm-inference-tests.sh
+      # - name: Run LLM inference test
+        # shell: bash
+        # run: |
+          # python -m pip install einops datasets librosa openai-whisper
+          # bash python/llm/test/run-llm-inference-tests.sh
       - name: Run LLM inference test for 4.36
         shell: bash
         run: |

From e77cee4a7b8b7dd5a9c258abe997666f5e3c76eb Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Tue, 23 Apr 2024 11:40:32 -0700
Subject: [PATCH 06/57] update

---
 .github/workflows/llm_unit_tests.yml | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/llm_unit_tests.yml b/.github/workflows/llm_unit_tests.yml
index 23b061b585f..2e776a3d8e9 100644
--- a/.github/workflows/llm_unit_tests.yml
+++ b/.github/workflows/llm_unit_tests.yml
@@ -161,8 +161,8 @@ jobs:
           # fi
           if [ ! -d $ORIGINAL_CHATGLM2_6B_PATH ]; then
             echo "Directory $ORIGINAL_CHATGLM2_6B_PATH not found. Downloading from FTP server..."
-            echo "wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/updated_for_4.36/chatglm2-6b -P $ORIGIN_DIR_436"
-            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/updated_for_4.36/chatglm2-6b -P $ORIGIN_DIR_436
+            echo "wget -r -nH --no-verbose --cut-dirs=2 $LLM_FTP_URL/llm/updated_for_4.36/chatglm2-6b -P $ORIGIN_DIR_436"
+            wget -r -nH --no-verbose --cut-dirs=2 $LLM_FTP_URL/llm/updated_for_4.36/chatglm2-6b -P $ORIGIN_DIR_436
           fi
           if [ ! -d $ORIGINAL_REPLIT_CODE_PATH ]; then
             echo "Directory $ORIGINAL_REPLIT_CODE_PATH not found. Downloading from FTP server..."
@@ -256,6 +256,16 @@ jobs:
       # THREAD_NUM: 16
       ANALYTICS_ZOO_ROOT: ${{ github.workspace }}
     steps:
+      - name: Set model directories for 4.36
+        shell: bash
+        run: |
+          echo "ORIGIN_DIR_436=${{ github.workspace }}/../llm/origin-models-4.36" >> "$GITHUB_ENV"
+      - name: Create model directories
+        shell: bash
+        run: |
+          if [ ! -d ORIGIN_DIR_436 ]; then
+            mkdir -p ORIGIN_DIR_436
+          fi
       - name: Set environment variables
         shell: bash
         run: |
@@ -319,7 +329,7 @@ jobs:
           fi
           if [ ! -d $CHATGLM2_6B_ORIGIN_PATH ]; then
             echo "Directory $CHATGLM2_6B_ORIGIN_PATH not found. Downloading from FTP server..."
-            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/updated_for_4.36//chatglm2-6b -P $ORIGIN_DIR_436
+            wget -r -nH --no-verbose --cut-dirs=2 $LLM_FTP_URL/llm/updated_for_4.36/chatglm2-6b -P $ORIGIN_DIR_436
           fi
           if [ ! -d $FALCON_7B_ORIGIN_PATH ]; then
             echo "Directory $FALCON_7B_ORIGIN_PATH not found. Downloading from FTP server..."
@@ -327,7 +337,7 @@ jobs:
           fi
           if [ ! -d $MPT_7B_ORIGIN_PATH ]; then
             echo "Directory $MPT_7B_ORIGIN_PATH not found. Downloading from FTP server..."
-            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/updated_for_4.36//mpt-7b-chat -P $ORIGIN_DIR_436
+            wget -r -nH --no-verbose --cut-dirs=2 $LLM_FTP_URL/llm/updated_for_4.36/mpt-7b-chat -P $ORIGIN_DIR_436
           fi
           if [ ! -d $WHISPER_TINY_ORIGIN_PATH ]; then
             echo "Directory $WHISPER_TINY_ORIGIN_PATH not found. Downloading from FTP server..."
@@ -354,7 +364,7 @@ jobs:
           fi
           if [ ! -d $BAICHUAN2_7B_ORIGIN_PATH ]; then
             echo "Directory $BAICHUAN2_7B_ORIGIN_PATH not found. Downloading from FTP server..."
-            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/updated_for_4.36//Baichuan2-7B-Chat -P $ORIGIN_DIR_436
+            wget -r -nH --no-verbose --cut-dirs=2 $LLM_FTP_URL/llm/updated_for_4.36/Baichuan2-7B-Chat -P $ORIGIN_DIR_436
           fi
           if [ ! -d $VICUNA_7B_1_3_ORIGIN_PATH ]; then
             echo "Directory $VICUNA_7B_1_3_ORIGIN_PATH not found. Downloading from FTP server..."

From f1d694469c68ebd74bbcaca7dc1d8d242473051e Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Wed, 24 Apr 2024 15:24:42 -0700
Subject: [PATCH 07/57] fix gpu attention test

---
 .../test/inference_gpu/test_transformers_api_attention.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/python/llm/test/inference_gpu/test_transformers_api_attention.py b/python/llm/test/inference_gpu/test_transformers_api_attention.py
index 0990f8ad4b9..83f7aaebfc8 100644
--- a/python/llm/test/inference_gpu/test_transformers_api_attention.py
+++ b/python/llm/test/inference_gpu/test_transformers_api_attention.py
@@ -104,8 +104,8 @@ def replace_forward_hook(module, input, output, layer_name):
                     if isinstance(t1, torch.Tensor) and isinstance(t2, torch.Tensor):
                         # 'attn_output' is of type torch.Tensor.
                         attn_output_diff.append(t1 - t2)
-                    else:
-                        # 'past_key_value'is of type tuple as default.
+                    elif isinstance(t1, tuple) and isinstance(t2, tuple):
+                        # if 'past_key_value'is of type tuple
                         for i, (t3, t4) in enumerate(zip(t1, t2)):
                             if model.config.architectures[0] == "ChatGLMModel" and \
                                     hasattr(model.config, 'padded_vocab_size') and \
@@ -114,6 +114,10 @@ def replace_forward_hook(module, input, output, layer_name):
                                 # We need to narrow it here.
                                 t4 = t4[:, :, 15:17, :]
                             attn_output_diff.append(t3 - t4)
+                    else:
+                        # if 'past_key_value'is of type Cache, get last layer cache pair (key, value)
+                        attn_output_diff.append(t1[-1][0] - t2[-1][0])
+                        attn_output_diff.append(t1[-1][1] - t2[-1][1])
 
             max_diff_tensor = [torch.max(item).item() for item in attn_output_diff]
             print(max_diff_tensor)

From c2fa88b43e8cfa49406f4454508734fd6852e175 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Wed, 24 Apr 2024 15:32:35 -0700
Subject: [PATCH 08/57] update

---
 .github/workflows/llm_unit_tests.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/llm_unit_tests.yml b/.github/workflows/llm_unit_tests.yml
index 2e776a3d8e9..f228d9f45eb 100644
--- a/.github/workflows/llm_unit_tests.yml
+++ b/.github/workflows/llm_unit_tests.yml
@@ -362,6 +362,10 @@ jobs:
             echo "Directory $QWEN_7B_ORIGIN_PATH not found. Downloading from FTP server..."
             wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/Qwen-7B-Chat -P $ORIGIN_DIR
           fi
+          if [ -d $BAICHUAN2_7B_ORIGIN_PATH ]; then
+            echo "Directory $BAICHUAN2_7B_ORIGIN_PATH not found. Downloading from FTP server..."
+            rm -rf $BAICHUAN2_7B_ORIGIN_PATH
+          fi
           if [ ! -d $BAICHUAN2_7B_ORIGIN_PATH ]; then
             echo "Directory $BAICHUAN2_7B_ORIGIN_PATH not found. Downloading from FTP server..."
             wget -r -nH --no-verbose --cut-dirs=2 $LLM_FTP_URL/llm/updated_for_4.36/Baichuan2-7B-Chat -P $ORIGIN_DIR_436

From b255ac53353261ba395a57cca7faeb19c2156740 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Wed, 24 Apr 2024 16:13:06 -0700
Subject: [PATCH 09/57] update

---
 python/llm/test/inference_gpu/test_transformers_api_mlp.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/python/llm/test/inference_gpu/test_transformers_api_mlp.py b/python/llm/test/inference_gpu/test_transformers_api_mlp.py
index e3273ad574e..70ba2e7b9f6 100644
--- a/python/llm/test/inference_gpu/test_transformers_api_mlp.py
+++ b/python/llm/test/inference_gpu/test_transformers_api_mlp.py
@@ -96,9 +96,14 @@ def replace_forward_hook(module, input, output, layer_name):
             for i, (t1, t2) in enumerate(zip(layer_tensor, opt_layer_tensor)):
                 if isinstance(t1, torch.Tensor) and isinstance(t2, torch.Tensor):
                     MLP_output_diff.append(t1 - t2)
-                else:
+                elif isinstance(t1, tuple) and isinstance(t2, tuple):
+                    # if 'past_key_value'is of type tuple
                     for i, (t3, t4) in enumerate(zip(t1, t2)):
                         MLP_output_diff.append(t3 - t4)
+                else:
+                    # if 'past_key_value'is of type Cache, get last layer cache pair (key, value)
+                    MLP_output_diff.append(t1[-1][0] - t2[-1][0])
+                    MLP_output_diff.append(t1[-1][1] - t2[-1][1])
 
             max_diff_tensor = [torch.max(item).item() for item in MLP_output_diff]
             print(max_diff_tensor)

From a82199ae6db63fe3c5bc020350b18c83f7e777ec Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Wed, 24 Apr 2024 16:16:13 -0700
Subject: [PATCH 10/57] update

---
 .github/workflows/llm_unit_tests.yml | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/.github/workflows/llm_unit_tests.yml b/.github/workflows/llm_unit_tests.yml
index f228d9f45eb..2e776a3d8e9 100644
--- a/.github/workflows/llm_unit_tests.yml
+++ b/.github/workflows/llm_unit_tests.yml
@@ -362,10 +362,6 @@ jobs:
             echo "Directory $QWEN_7B_ORIGIN_PATH not found. Downloading from FTP server..."
             wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/Qwen-7B-Chat -P $ORIGIN_DIR
           fi
-          if [ -d $BAICHUAN2_7B_ORIGIN_PATH ]; then
-            echo "Directory $BAICHUAN2_7B_ORIGIN_PATH not found. Downloading from FTP server..."
-            rm -rf $BAICHUAN2_7B_ORIGIN_PATH
-          fi
           if [ ! -d $BAICHUAN2_7B_ORIGIN_PATH ]; then
             echo "Directory $BAICHUAN2_7B_ORIGIN_PATH not found. Downloading from FTP server..."
             wget -r -nH --no-verbose --cut-dirs=2 $LLM_FTP_URL/llm/updated_for_4.36/Baichuan2-7B-Chat -P $ORIGIN_DIR_436

From 7e7d09c94869e0cd0af2c1bf1c90c592ccad28d0 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Thu, 25 Apr 2024 12:39:39 -0700
Subject: [PATCH 11/57] update

---
 python/llm/test/inference/test_transformers_api.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/llm/test/inference/test_transformers_api.py b/python/llm/test/inference/test_transformers_api.py
index 8b208bdd3c1..7a11dfe6e8a 100644
--- a/python/llm/test/inference/test_transformers_api.py
+++ b/python/llm/test/inference/test_transformers_api.py
@@ -141,7 +141,7 @@ def test_load_low_bit_completion(Model, Tokenizer, model_path, prompt, answer):
 prompt = "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun"
 
 @pytest.mark.parametrize("Model, Tokenizer, model_path, prompt", [
-    (AutoModelForCausalLM, LlamaTokenizer, os.environ.get('LLAMA_ORIGIN_PATH'), prompt),
+    # (AutoModelForCausalLM, LlamaTokenizer, os.environ.get('LLAMA_ORIGIN_PATH'), prompt),
     (AutoModelForCausalLM, AutoTokenizer, os.environ.get('BLOOM_ORIGIN_PATH'), prompt),
     (AutoModel, AutoTokenizer, os.environ.get('ORIGINAL_CHATGLM2_6B_PATH'), prompt),
     # (AutoModelForCausalLM, AutoTokenizer, os.environ.get('ORIGINAL_REPLIT_CODE_PATH'), prompt),

From 8f1c35571a2d64cd527c1330fe325d08c93e7043 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Thu, 25 Apr 2024 13:39:28 -0700
Subject: [PATCH 12/57] update

---
 .github/workflows/llm_unit_tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/llm_unit_tests.yml b/.github/workflows/llm_unit_tests.yml
index 2e776a3d8e9..815d523e9a6 100644
--- a/.github/workflows/llm_unit_tests.yml
+++ b/.github/workflows/llm_unit_tests.yml
@@ -389,7 +389,7 @@ jobs:
         shell: bash
         run: |
           python -m pip uninstall datasets -y
-          python -m pip install transformers==4.34.0 datasets peft==0.5.0 accelerate==0.23.0
+          python -m pip install datasets peft==0.5.0 accelerate==0.23.0
           python -m pip install bitsandbytes scipy
           # Specific oneapi position on arc ut test machines
           if [[ '${{ matrix.pytorch-version }}' == '2.1' ]]; then

From e0c4407cfd5b1f466e3a5fc533d7329d9357cb7a Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Thu, 25 Apr 2024 15:29:21 -0700
Subject: [PATCH 13/57] update

---
 .github/workflows/llm_unit_tests.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/llm_unit_tests.yml b/.github/workflows/llm_unit_tests.yml
index 815d523e9a6..85f79492597 100644
--- a/.github/workflows/llm_unit_tests.yml
+++ b/.github/workflows/llm_unit_tests.yml
@@ -235,7 +235,7 @@ jobs:
         shell: bash
         run: |
           pip install llama-index-readers-file llama-index-vector-stores-postgres llama-index-embeddings-huggingface
-          # pip install transformers==4.36.0
+          pip install transformers==4.36.2
           pip install "pydantic>=2.0.0"
           bash python/llm/test/run-llm-llamaindex-tests.sh
       - name: Run sentence-transformers uninstallation
@@ -429,7 +429,7 @@ jobs:
             pip install --pre --upgrade ipex-llm[xpu_2.0] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
             source /home/arda/intel/oneapi/setvars.sh
           fi
-          # pip install transformers==4.36.0
+          pip install transformers==4.36.2
           pip install "pydantic>=2.0.0"
           bash python/llm/test/run-llm-llamaindex-tests-gpu.sh
       - name: Run sentence-transformers uninstallation

From c51b7ea789517f8a6afb913b7234ca7d5d10dd4f Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Thu, 25 Apr 2024 17:17:53 -0700
Subject: [PATCH 14/57] update example test

---
 python/llm/dev/test/run-example-tests.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/llm/dev/test/run-example-tests.sh b/python/llm/dev/test/run-example-tests.sh
index edf767c8719..f099091418a 100644
--- a/python/llm/dev/test/run-example-tests.sh
+++ b/python/llm/dev/test/run-example-tests.sh
@@ -68,7 +68,7 @@ fi
 export ORIGINAL_CHATGLM2_PATH=./llm/chatglm2-6b/
 if [ ! -d $ORIGINAL_CHATGLM2_PATH ]; then
     echo "Directory $ORIGINAL_CHATGLM2_PATH not found. Downloading from FTP server..."
-    wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/${ORIGINAL_CHATGLM2_PATH:2} -P $LLM_DIR
+    wget -r -nH --no-verbose --cut-dirs=2 $LLM_FTP_URL/updated_for_4.36/${ORIGINAL_CHATGLM2_PATH:2} -P $LLM_DIR
 fi
 
 echo ">>> Testing ChatGLM2 transformers API"

From a4427683e39b8c82e9dd6fe219e507d268186f82 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Fri, 26 Apr 2024 14:52:19 -0700
Subject: [PATCH 15/57] replace replit code

---
 .github/workflows/llm_unit_tests.yml          | 24 +++++-----
 .../test/inference/test_transformers_api.py   | 48 +++++++++----------
 .../test/langchain/test_transformers_api.py   | 26 +++++-----
 3 files changed, 49 insertions(+), 49 deletions(-)

diff --git a/.github/workflows/llm_unit_tests.yml b/.github/workflows/llm_unit_tests.yml
index 85f79492597..663fcb11da7 100644
--- a/.github/workflows/llm_unit_tests.yml
+++ b/.github/workflows/llm_unit_tests.yml
@@ -103,7 +103,7 @@ jobs:
           echo "LLAMA_ORIGIN_PATH=${ORIGIN_DIR}/llama-7b-hf" >> "$GITHUB_ENV"
           echo "BLOOM_ORIGIN_PATH=${ORIGIN_DIR}/bloom-7b1" >> "$GITHUB_ENV"
           echo "ORIGINAL_CHATGLM2_6B_PATH=${ORIGIN_DIR_436}/chatglm2-6b" >> "$GITHUB_ENV"
-          echo "ORIGINAL_REPLIT_CODE_PATH=${ORIGIN_DIR}/replit-code-v1-3b" >> "$GITHUB_ENV"
+          echo "ORIGINAL_CODESHELL_7B_PATH=${ORIGIN_DIR}/CodeShell-7B-Chat" >> "$GITHUB_ENV"
           echo "ORIGINAL_WHISPER_TINY_PATH=${ORIGIN_DIR}/whisper-tiny" >> "$GITHUB_ENV"
           echo "MISTRAL_ORIGIN_PATH=${ORIGIN_DIR}/Mistral-7B-v0.1" >> "$GITHUB_ENV"
           echo "LLAMA2_7B_ORIGIN_PATH=${ORIGIN_DIR}/Llama-2-7b-chat-hf" >> "$GITHUB_ENV"
@@ -164,10 +164,10 @@ jobs:
             echo "wget -r -nH --no-verbose --cut-dirs=2 $LLM_FTP_URL/llm/updated_for_4.36/chatglm2-6b -P $ORIGIN_DIR_436"
             wget -r -nH --no-verbose --cut-dirs=2 $LLM_FTP_URL/llm/updated_for_4.36/chatglm2-6b -P $ORIGIN_DIR_436
           fi
-          if [ ! -d $ORIGINAL_REPLIT_CODE_PATH ]; then
-            echo "Directory $ORIGINAL_REPLIT_CODE_PATH not found. Downloading from FTP server..."
-            echo "wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/replit-code-v1-3b -P $ORIGIN_DIR"
-            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/replit-code-v1-3b -P $ORIGIN_DIR
+          if [ ! -d $ORIGINAL_CODESHELL_7B_PATH ]; then
+            echo "Directory $ORIGINAL_CODESHELL_7B_PATH not found. Downloading from FTP server..."
+            echo "wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/CodeShell-7B-Chat -P $ORIGIN_DIR"
+            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/CodeShell-7B-Chat -P $ORIGIN_DIR
           fi
           if [ ! -d $ORIGINAL_WHISPER_TINY_PATH ]; then
             echo "Directory $ORIGINAL_WHISPER_TINY_PATH not found. Downloading from FTP server..."
@@ -224,13 +224,13 @@ jobs:
         run: |
           python -m pip install einops datasets librosa openai-whisper
           bash python/llm/test/run-llm-inference-tests.sh
-      # - name: Run LLM langchain test
-        # shell: bash
-        # run: |
-          # pip install -U langchain==0.0.184
-          # pip install -U chromadb==0.3.25
-          # pip install -U pandas==2.0.3
-          # bash python/llm/test/run-llm-langchain-tests.sh
+       - name: Run LLM langchain test
+         shell: bash
+         run: |
+           pip install -U langchain==0.0.184
+           pip install -U chromadb==0.3.25
+           pip install -U pandas==2.0.3
+           bash python/llm/test/run-llm-langchain-tests.sh
       - name: Run LLM llamaindex test
         shell: bash
         run: |
diff --git a/python/llm/test/inference/test_transformers_api.py b/python/llm/test/inference/test_transformers_api.py
index 7a11dfe6e8a..3fadd390aa3 100644
--- a/python/llm/test/inference/test_transformers_api.py
+++ b/python/llm/test/inference/test_transformers_api.py
@@ -35,7 +35,7 @@ def setUp(self):
             self.n_threads = 2
 
     def test_transformers_auto_model_int4(self):
-        model_path = os.environ.get('ORIGINAL_CHATGLM2_6B_PATH')
+        model_path = os.environ.get('ORIGINAL_CODESHELL_7B_PATH')
         model = AutoModel.from_pretrained(model_path, trust_remote_code=True, load_in_4bit=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
         input_str = "Tell me the capital of France.\n\n"
@@ -49,26 +49,26 @@ def test_transformers_auto_model_int4(self):
         print('Prompt:', input_str)
         print('Output:', output_str)
         print(f'Inference time: {end-st} s')
-        res = 'Paris' in output_str        
+        res = 'Paris' in output_str
         self.assertTrue(res)
 
-    # def test_transformers_auto_model_for_causal_lm_int4(self):
-    #     model_path = os.environ.get('ORIGINAL_REPLIT_CODE_PATH')
-    #     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-    #     input_str = 'def hello():\n  print("hello world")\n'
-    #     model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, load_in_4bit=True)
-    #     with torch.inference_mode():
-    #
-    #         st = time.time()
-    #         input_ids = tokenizer.encode(input_str, return_tensors="pt")
-    #         output = model.generate(input_ids, do_sample=False, max_new_tokens=32)
-    #         output_str = tokenizer.decode(output[0], skip_special_tokens=True)
-    #         end = time.time()
-    #     print('Prompt:', input_str)
-    #     print('Output:', output_str)
-    #     print(f'Inference time: {end-st} s')
-    #     res = '\nhello()' in output_str
-    #     self.assertTrue(res)
+    def test_transformers_auto_model_for_causal_lm_int4(self):
+        model_path = os.environ.get('ORIGINAL_CODESHELL_7B_PATH')
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        input_str = 'def hello():\n  print("hello world")\n'
+        model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, load_in_4bit=True)
+        with torch.inference_mode():
+
+            st = time.time()
+            input_ids = tokenizer.encode(input_str, return_tensors="pt")
+            output = model.generate(input_ids, do_sample=False, max_new_tokens=32)
+            output_str = tokenizer.decode(output[0], skip_special_tokens=True)
+            end = time.time()
+        print('Prompt:', input_str)
+        print('Output:', output_str)
+        print(f'Inference time: {end-st} s')
+        res = '\nhello()' in output_str
+        self.assertTrue(res)
         
 
     def test_transformers_auto_model_for_speech_seq2seq_int4(self):
@@ -86,7 +86,7 @@ def test_transformers_auto_model_for_speech_seq2seq_int4(self):
             predicted_ids = model.generate(input_features)
             # decode token ids to text
             transcription = processor.batch_decode(predicted_ids, skip_special_tokens=False)
-            end = time.time()        
+            end = time.time()
         print('Output:', transcription)
         print(f'Inference time: {end-st} s')
         res = 'Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.' in transcription[0]
@@ -108,7 +108,7 @@ def test_transformers_chatglm_for_causallm(self):
         print('Prompt:', input_str)
         print('Output:', output_str)
         print(f'Inference time: {end-st} s')
-        res = 'Paris' in output_str        
+        res = 'Paris' in output_str
         self.assertTrue(res)
 
 @pytest.mark.parametrize('prompt, answer', [
@@ -124,7 +124,7 @@ def test_load_low_bit_completion(Model, Tokenizer, model_path, prompt, answer):
                                   load_in_4bit=True,
                                   optimize_model=True,
                                   trust_remote_code=True)
-    
+
     with tempfile.TemporaryDirectory() as tempdir:
         model.save_low_bit(tempdir)
         loaded_model = Model.load_low_bit(tempdir,
@@ -144,10 +144,10 @@ def test_load_low_bit_completion(Model, Tokenizer, model_path, prompt, answer):
     # (AutoModelForCausalLM, LlamaTokenizer, os.environ.get('LLAMA_ORIGIN_PATH'), prompt),
     (AutoModelForCausalLM, AutoTokenizer, os.environ.get('BLOOM_ORIGIN_PATH'), prompt),
     (AutoModel, AutoTokenizer, os.environ.get('ORIGINAL_CHATGLM2_6B_PATH'), prompt),
-    # (AutoModelForCausalLM, AutoTokenizer, os.environ.get('ORIGINAL_REPLIT_CODE_PATH'), prompt),
+    (AutoModelForCausalLM, AutoTokenizer, os.environ.get('ORIGINAL_CODESHELL_7B_PATH'), prompt),
     (AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_ORIGIN_PATH'), prompt)
 ])
-    
+
 def test_optimize_model(Model, Tokenizer, model_path, prompt):
     tokenizer = Tokenizer.from_pretrained(model_path, trust_remote_code=True)
     input_ids = tokenizer.encode(prompt, return_tensors="pt")
diff --git a/python/llm/test/langchain/test_transformers_api.py b/python/llm/test/langchain/test_transformers_api.py
index 61d30051b6f..ad139c74dc6 100644
--- a/python/llm/test/langchain/test_transformers_api.py
+++ b/python/llm/test/langchain/test_transformers_api.py
@@ -38,7 +38,7 @@
 class Test_Langchain_Transformers_API(TestCase):
     def setUp(self):
         self.auto_model_path = os.environ.get('ORIGINAL_CHATGLM2_6B_PATH')
-        # self.auto_causal_model_path = os.environ.get('ORIGINAL_REPLIT_CODE_PATH')
+        self.auto_causal_model_path = os.environ.get('ORIGINAL_CODESHELL_7B_PATH')
         self.llama_model_path = os.environ.get('LLAMA_ORIGIN_PATH')
         self.bloom_model_path = os.environ.get('BLOOM_ORIGIN_PATH')
         thread_num = os.environ.get('THREAD_NUM')
@@ -79,12 +79,12 @@ def test_transformers_llama_embeddings(self):
 
     def test_qa_chain(self):
         texts = '''
-            AI is a machine’s ability to perform the cognitive functions 
-            we associate with human minds, such as perceiving, reasoning, 
+            AI is a machine’s ability to perform the cognitive functions
+            we associate with human minds, such as perceiving, reasoning,
             learning, interacting with an environment, problem solving,
-            and even exercising creativity. You’ve probably interacted 
-            with AI even if you didn’t realize it—voice assistants like Siri 
-            and Alexa are founded on AI technology, as are some customer 
+            and even exercising creativity. You’ve probably interacted
+            with AI even if you didn’t realize it—voice assistants like Siri
+            and Alexa are founded on AI technology, as are some customer
             service chatbots that pop up to help you navigate websites.
             '''
         text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
@@ -102,16 +102,16 @@ def test_qa_chain(self):
         res = "AI" in output
         self.assertTrue(res)
 
-    
+
     """
     def test_qa_chain_causalLM(self):
         texts = '''
-            AI is a machine’s ability to perform the cognitive functions 
-            we associate with human minds, such as perceiving, reasoning, 
+            AI is a machine’s ability to perform the cognitive functions
+            we associate with human minds, such as perceiving, reasoning,
             learning, interacting with an environment, problem solving,
-            and even exercising creativity. You’ve probably interacted 
-            with AI even if you didn’t realize it—voice assistants like Siri 
-            and Alexa are founded on AI technology, as are some customer 
+            and even exercising creativity. You’ve probably interacted
+            with AI even if you didn’t realize it—voice assistants like Siri
+            and Alexa are founded on AI technology, as are some customer
             service chatbots that pop up to help you navigate websites.
             '''
         text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
@@ -129,7 +129,7 @@ def test_qa_chain_causalLM(self):
         res = "AI" in output
         self.assertTrue(res)
     """
-    
+
     def test_embed_kwargs(self):
         embeddings = TransformersEmbeddings.from_model_id(model_id=self.llama_model_path)
         encode_kwargs =  {"truncation": True, "max_length": 512}

From 5563f288b68ae200bc563b807d5bc04170893208 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Fri, 26 Apr 2024 15:05:07 -0700
Subject: [PATCH 16/57] update

---
 .github/workflows/llm_unit_tests.yml | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/llm_unit_tests.yml b/.github/workflows/llm_unit_tests.yml
index 663fcb11da7..c77987ec783 100644
--- a/.github/workflows/llm_unit_tests.yml
+++ b/.github/workflows/llm_unit_tests.yml
@@ -224,13 +224,13 @@ jobs:
         run: |
           python -m pip install einops datasets librosa openai-whisper
           bash python/llm/test/run-llm-inference-tests.sh
-       - name: Run LLM langchain test
-         shell: bash
-         run: |
-           pip install -U langchain==0.0.184
-           pip install -U chromadb==0.3.25
-           pip install -U pandas==2.0.3
-           bash python/llm/test/run-llm-langchain-tests.sh
+      - name: Run LLM langchain test
+        shell: bash
+        run: |
+          pip install -U langchain==0.0.184
+          pip install -U chromadb==0.3.25
+          pip install -U pandas==2.0.3
+          bash python/llm/test/run-llm-langchain-tests.sh
       - name: Run LLM llamaindex test
         shell: bash
         run: |

From b575c48734b4aeae4eb31d633d438635cb0179b5 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Fri, 26 Apr 2024 15:13:19 -0700
Subject: [PATCH 17/57] update

---
 python/llm/dev/test/run-example-tests.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/llm/dev/test/run-example-tests.sh b/python/llm/dev/test/run-example-tests.sh
index f099091418a..e5b3d125add 100644
--- a/python/llm/dev/test/run-example-tests.sh
+++ b/python/llm/dev/test/run-example-tests.sh
@@ -68,7 +68,7 @@ fi
 export ORIGINAL_CHATGLM2_PATH=./llm/chatglm2-6b/
 if [ ! -d $ORIGINAL_CHATGLM2_PATH ]; then
     echo "Directory $ORIGINAL_CHATGLM2_PATH not found. Downloading from FTP server..."
-    wget -r -nH --no-verbose --cut-dirs=2 $LLM_FTP_URL/updated_for_4.36/${ORIGINAL_CHATGLM2_PATH:2} -P $LLM_DIR
+    wget -r -nH --no-verbose --cut-dirs=2 $LLM_FTP_URL/llm/updated_for_4.36/${ORIGINAL_CHATGLM2_PATH:5} -P $LLM_DIR
 fi
 
 echo ">>> Testing ChatGLM2 transformers API"

From cc0ed3006786b0d33e39d1bbd97400313769798f Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Fri, 26 Apr 2024 15:53:51 -0700
Subject: [PATCH 18/57] update

---
 python/llm/dev/test/run-example-tests.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/llm/dev/test/run-example-tests.sh b/python/llm/dev/test/run-example-tests.sh
index e5b3d125add..3876fa77a68 100644
--- a/python/llm/dev/test/run-example-tests.sh
+++ b/python/llm/dev/test/run-example-tests.sh
@@ -68,7 +68,7 @@ fi
 export ORIGINAL_CHATGLM2_PATH=./llm/chatglm2-6b/
 if [ ! -d $ORIGINAL_CHATGLM2_PATH ]; then
     echo "Directory $ORIGINAL_CHATGLM2_PATH not found. Downloading from FTP server..."
-    wget -r -nH --no-verbose --cut-dirs=2 $LLM_FTP_URL/llm/updated_for_4.36/${ORIGINAL_CHATGLM2_PATH:5} -P $LLM_DIR
+    wget -r -nH --no-verbose --cut-dirs=2 $LLM_FTP_URL/llm/updated_for_4.36/${ORIGINAL_CHATGLM2_PATH:6} -P $LLM_DIR
 fi
 
 echo ">>> Testing ChatGLM2 transformers API"

From 04333ae142a328063af81cf3f95915c8a8109830 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Fri, 26 Apr 2024 16:05:31 -0700
Subject: [PATCH 19/57] update

---
 python/llm/test/inference/test_transformers_api.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/llm/test/inference/test_transformers_api.py b/python/llm/test/inference/test_transformers_api.py
index 3fadd390aa3..ad1d1b53f3a 100644
--- a/python/llm/test/inference/test_transformers_api.py
+++ b/python/llm/test/inference/test_transformers_api.py
@@ -35,7 +35,7 @@ def setUp(self):
             self.n_threads = 2
 
     def test_transformers_auto_model_int4(self):
-        model_path = os.environ.get('ORIGINAL_CODESHELL_7B_PATH')
+        model_path = os.environ.get('ORIGINAL_CHATGLM2_6B_PATH')
         model = AutoModel.from_pretrained(model_path, trust_remote_code=True, load_in_4bit=True)
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
         input_str = "Tell me the capital of France.\n\n"

From 8ecdeac9e5891c04d282046c27f4fb0aa2e8aa16 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Fri, 26 Apr 2024 17:29:38 -0700
Subject: [PATCH 20/57] set safe_serialization false

---
 python/llm/src/ipex_llm/optimize.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/llm/src/ipex_llm/optimize.py b/python/llm/src/ipex_llm/optimize.py
index d69895ec2c2..86db591ca9b 100644
--- a/python/llm/src/ipex_llm/optimize.py
+++ b/python/llm/src/ipex_llm/optimize.py
@@ -47,7 +47,8 @@ def _save_low_bit(self, save_dir, *args, **kwargs):
     if isinstance(self, PreTrainedModel):
         # We borrowed this method to adapt to Transformer model cases
         # as much as possible, and later we may merge these two situations
-        self.save_pretrained(save_dir)
+        kwargs['safe_serialization'] = False
+        self.save_pretrained(save_dir, *args, **kwargs)
     else:
         # TODO: For the lowbit model still larger than 8GB,
         #       save it into shards.

From 49a6933d906e5973253bcdfb1b38c00b231798b7 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Tue, 30 Apr 2024 15:53:37 -0700
Subject: [PATCH 21/57] perf test

---
 .github/workflows/llm_performance_tests.yml   | 466 +++++++++---------
 .../llm/test/benchmark/arc-perf-test-436.yaml |  20 +
 python/llm/test/benchmark/arc-perf-test.yaml  |  10 +-
 .../llm/test/benchmark/cpu-perf-test-436.yaml |  26 +
 python/llm/test/benchmark/cpu-perf-test.yaml  |   8 +-
 .../test/benchmark/igpu-perf/1024-128.yaml    |   1 +
 .../igpu-perf/1024-128_int4_fp16.yaml         |   1 +
 .../igpu-perf/1024-128_loadlowbit.yaml        |   1 +
 .../test/benchmark/igpu-perf/2048-256.yaml    |   1 +
 .../llm/test/benchmark/igpu-perf/32-32.yaml   |   1 +
 10 files changed, 303 insertions(+), 232 deletions(-)
 create mode 100644 python/llm/test/benchmark/arc-perf-test-436.yaml
 create mode 100644 python/llm/test/benchmark/cpu-perf-test-436.yaml

diff --git a/.github/workflows/llm_performance_tests.yml b/.github/workflows/llm_performance_tests.yml
index 48cc7dc763d..97f4f522609 100644
--- a/.github/workflows/llm_performance_tests.yml
+++ b/.github/workflows/llm_performance_tests.yml
@@ -11,25 +11,25 @@ permissions:
 # Controls when the action will run.
 on:
   schedule:
-    - cron: "30 16 * * *" # GMT time, 16:30 GMT == 00:30 China
+   # - cron: "30 16 * * *" # GMT time, 16:30 GMT == 00:30 China
   # please uncomment it for PR tests
-  # pull_request:
-  #   branches: [main]
-  #   paths:
-  #     - ".github/workflows/llm_performance_tests.yml"
-  #     - "python/llm/test/benchmark/**"
-  #     - "python/llm/dev/benchmark/all-in-one/**"
+   pull_request:
+     branches: [main]
+     paths:
+       - ".github/workflows/llm_performance_tests.yml"
+       - "python/llm/test/benchmark/**"
+       - "python/llm/dev/benchmark/all-in-one/**"
   workflow_dispatch:
   workflow_call:
 
 # A workflow run is made up of one or more jobs that can run sequentially or in parallel
 jobs:
-  # llm-cpp-build: # please uncomment it for PR tests
-  #   uses: ./.github/workflows/llm-binary-build.yml
+  llm-cpp-build: # please uncomment it for PR tests
+    uses: ./.github/workflows/llm-binary-build.yml
 
   llm-performance-test-on-arc:
-    if: ${{ github.event.schedule || github.event.inputs.artifact == 'llm-performance-test-on-arc' || github.event.inputs.artifact == 'all' }} # please comment it for PR tests
-    # needs: llm-cpp-build # please uncomment it for PR tests
+    # if: ${{ github.event.schedule || github.event.inputs.artifact == 'llm-performance-test-on-arc' || github.event.inputs.artifact == 'all' }} # please comment it for PR tests
+    needs: llm-cpp-build # please uncomment it for PR tests
     strategy:
       fail-fast: false
       matrix:
@@ -63,23 +63,23 @@ jobs:
           python -m pip install --upgrade tiktoken
 
       # please uncomment it and comment the "Install IPEX-LLM from Pypi" part for PR tests
-      # - name: Download llm binary
-      #   uses: ./.github/actions/llm/download-llm-binary
-
-      # - name: Run LLM install (all) test
-      #   uses: ./.github/actions/llm/setup-llm-env
-      #   with:
-      #     extra-dependency: "xpu_2.1"
+      - name: Download llm binary
+        uses: ./.github/actions/llm/download-llm-binary
 
-      - name: Install IPEX-LLM from Pypi
-        shell: bash
-        run: |
-          pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
-          test_version_date=`date -d 'yesterday' '+%Y%m%d'`
-          if ! pip show ipex-llm | grep $test_version_date; then
-            echo "Did not install ipex-llm with excepted version $test_version_date"
-            exit 1
-          fi
+      - name: Run LLM install (all) test
+        uses: ./.github/actions/llm/setup-llm-env
+        with:
+          extra-dependency: "xpu_2.1"
+
+      #- name: Install IPEX-LLM from Pypi
+      #  shell: bash
+      #  run: |
+      #    pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
+      #    test_version_date=`date -d 'yesterday' '+%Y%m%d'`
+      #    if ! pip show ipex-llm | grep $test_version_date; then
+      #      echo "Did not install ipex-llm with excepted version $test_version_date"
+      #      exit 1
+      #    fi
 
       - name: Test installed xpu version
         shell: bash
@@ -87,12 +87,11 @@ jobs:
           source /opt/intel/oneapi/setvars.sh
           bash python/llm/test/run-llm-install-tests.sh
 
-      - name: Test on xpu(transformers==4.31.0)
+      - name: Test on xpu(transformers==4.36.2)
         shell: bash
         run: |
           date_for_test_version=$(date -d yesterday +%Y-%m-%d)
           sed -i "s/date.today()/\"$date_for_test_version\"/g" python/llm/dev/benchmark/all-in-one/run.py
-
           source /opt/intel/oneapi/setvars.sh
           export USE_XETLA=OFF
           export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
@@ -103,21 +102,28 @@ jobs:
           # change csv name
           sed -i 's/{today}/{today}_test1/g' run.py
           python run.py
-
-      - name: Test on xpu(transformers==4.34.0)
-        shell: bash
-        run: |
-          source /opt/intel/oneapi/setvars.sh
-          export USE_XETLA=OFF
-          export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
-          # upgrade transformers for model Mistral-7B-v0.1
-          python -m pip install transformers==4.34.0
-          cp python/llm/test/benchmark/arc-perf-transformers-434.yaml python/llm/dev/benchmark/all-in-one/config.yaml
+          # run updated models for 4.36
+          cd -
+          cp python/llm/test/benchmark/arc-perf-test-436.yaml python/llm/dev/benchmark/all-in-one/config.yaml
           cd python/llm/dev/benchmark/all-in-one
           # change csv name
           sed -i 's/test1/test2/g' run.py
           python run.py
 
+      #- name: Test on xpu(transformers==4.34.0)
+      #  shell: bash
+       # run: |
+      #   source /opt/intel/oneapi/setvars.sh
+      #    export USE_XETLA=OFF
+      #    export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+          # upgrade transformers for model Mistral-7B-v0.1
+      #    python -m pip install transformers==4.34.0
+      #    cp python/llm/test/benchmark/arc-perf-transformers-434.yaml python/llm/dev/benchmark/all-in-one/config.yaml
+      #    cd python/llm/dev/benchmark/all-in-one
+          # change csv name
+      #    sed -i 's/test1/test2/g' run.py
+      #    python run.py
+
       - name: Test on xpu(transformers==4.37.0)
         shell: bash
         run: |
@@ -151,15 +157,17 @@ jobs:
         run: |
           cd python/llm/dev/benchmark/all-in-one
           python ../../../test/benchmark/check_results.py -c test1 -y ../../../test/benchmark/arc-perf-test.yaml
-          python ../../../test/benchmark/check_results.py -c test2 -y ../../../test/benchmark/arc-perf-transformers-434.yaml
+          python ../../../test/benchmark/check_results.py -c test2 -y ../../../test/benchmark/arc-perf-test-436.yaml
+          python ../../../test/benchmark/check_results.py -c test3 -y ../../../test/benchmark/arc-perf-test-transformers-437.yaml
+          # python ../../../test/benchmark/check_results.py -c test2 -y ../../../test/benchmark/arc-perf-transformers-434.yaml
           find . -name "*test*.csv" -delete
           if [ ${{ github.event.schedule}} ]; then
             curl -T ./*.csv ${LLM_FTP_URL}/llm/nightly_perf/gpu/
           fi
           
   llm-performance-test-on-spr:
-    if: ${{ github.event.schedule || github.event.inputs.artifact == 'llm-performance-test-on-spr' || github.event.inputs.artifact == 'all' }} # please comment it for PR tests
-    # needs: llm-cpp-build # please uncomment it for PR tests
+    # if: ${{ github.event.schedule || github.event.inputs.artifact == 'llm-performance-test-on-spr' || github.event.inputs.artifact == 'all' }} # please comment it for PR tests
+    needs: llm-cpp-build # please uncomment it for PR tests
     strategy:
       fail-fast: false
       matrix:
@@ -189,21 +197,21 @@ jobs:
           python -m pip install --upgrade transformers_stream_generator
 
       # please uncomment it and comment the "Install IPEX-LLM from Pypi" part for PR tests
-      # - name: Download llm binary
-      #   uses: ./.github/actions/llm/download-llm-binary
-
-      # - name: Run LLM install (all) test
-      #   uses: ./.github/actions/llm/setup-llm-env
-
-      - name: Install IPEX-LLM from Pypi
-        shell: bash
-        run: |
-          pip install --pre --upgrade ipex-llm[all] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
-          test_version_date=`date -d 'yesterday' '+%Y%m%d'`
-          if ! pip show ipex-llm | grep $test_version_date; then
-            echo "Did not install ipex-llm with excepted version $test_version_date"
-            exit 1
-          fi
+      - name: Download llm binary
+        uses: ./.github/actions/llm/download-llm-binary
+
+      - name: Run LLM install (all) test
+        uses: ./.github/actions/llm/setup-llm-env
+
+      # - name: Install IPEX-LLM from Pypi
+      #  shell: bash
+      #  run: |
+      #    pip install --pre --upgrade ipex-llm[all] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
+      #    test_version_date=`date -d 'yesterday' '+%Y%m%d'`
+      #    if ! pip show ipex-llm | grep $test_version_date; then
+      #      echo "Did not install ipex-llm with excepted version $test_version_date"
+      #      exit 1
+      #    fi
 
       - name: Test on cpu
         shell: bash
@@ -219,7 +227,18 @@ jobs:
           export OMP_NUM_THREADS=48
           # hide time info
           sed -i 's/str(end - st)/"xxxxxx"/g' run.py
+          # change csv name
+          sed -i 's/{today}/{today}_test1/g' run.py
+          python run.py
+          # run updated models for 4.36
+          cd -
+          cp python/llm/test/benchmark/cpu-perf-test-436.yaml python/llm/dev/benchmark/all-in-one/config.yaml
+          cd python/llm/dev/benchmark/all-in-one
+          # change csv name
+          sed -i 's/test1/test2/g' run.py
           python run.py
+          python ../../../test/benchmark/concat_csv.py
+          find . -name "*test*.csv" -delete
           cp ./*.csv /mnt/disk1/models/nightly_perf_cpu
           cd ../../../test/benchmark
           python -m pip install pandas==1.5.3
@@ -230,8 +249,8 @@ jobs:
           done
 
   llm-performance-test-on-core:
-    if: ${{ github.event.schedule || github.event.inputs.artifact == 'llm-performance-test-on-core' || github.event.inputs.artifact == 'all' }} # please comment it for PR tests
-    # needs: llm-cpp-build # please uncomment it for PR tests
+    # if: ${{ github.event.schedule || github.event.inputs.artifact == 'llm-performance-test-on-core' || github.event.inputs.artifact == 'all' }} # please comment it for PR tests
+    needs: llm-cpp-build # please uncomment it for PR tests
     strategy:
       fail-fast: false
       matrix:
@@ -263,21 +282,21 @@ jobs:
           python -m pip install --upgrade tiktoken einops transformers_stream_generator
     
       # please uncomment it and comment the "Install IPEX-LLM from Pypi" part for PR tests
-      # - name: Download llm binary
-      #   uses: ./.github/actions/llm/download-llm-binary
-
-      # - name: Run LLM install (all) test
-      #   uses: ./.github/actions/llm/setup-llm-env
-
-      - name: Install IPEX-LLM from Pypi
-        shell: bash
-        run: |
-          pip install --pre --upgrade ipex-llm[all] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
-          test_version_date=`date -d 'yesterday' '+%Y%m%d'`
-          if ! pip show ipex-llm | grep $test_version_date; then
-            echo "Did not install ipex-llm with excepted version $test_version_date"
-            exit 1
-          fi
+      - name: Download llm binary
+        uses: ./.github/actions/llm/download-llm-binary
+
+      - name: Run LLM install (all) test
+        uses: ./.github/actions/llm/setup-llm-env
+
+      # - name: Install IPEX-LLM from Pypi
+      #  shell: bash
+      #  run: |
+      #    pip install --pre --upgrade ipex-llm[all] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
+      #    test_version_date=`date -d 'yesterday' '+%Y%m%d'`
+      #    if ! pip show ipex-llm | grep $test_version_date; then
+      #      echo "Did not install ipex-llm with excepted version $test_version_date"
+      #      exit 1
+      #    fi
 
       - name: Test on core ${{ matrix.platform }}
         shell: bash
@@ -302,7 +321,7 @@ jobs:
           fi
 
   llm-performance-test-on-igpu:
-    if: ${{ github.event.schedule || github.event.inputs.artifact == 'llm-performance-test-on-igpu' || github.event.inputs.artifact == 'all' }} # please comment it for PR tests
+    # if: ${{ github.event.schedule || github.event.inputs.artifact == 'llm-performance-test-on-igpu' || github.event.inputs.artifact == 'all' }} # please comment it for PR tests
     # needs: llm-cpp-build # please uncomment it for PR tests
     strategy:
       fail-fast: false
@@ -319,44 +338,16 @@ jobs:
       # TODO: Put the ipex-llm related install process for win gpu into a action function
 
       # Please uncomment it and commment the install from pypi for PR tests
-      # - name: Download llm binary
-      #   uses: ./.github/actions/llm/download-llm-binary
-
-      # - name: Prepare for install ipex-llm from source
-      #   shell: bash
-      #   run: |
-      #     sed -i 's/"bigdl-core-xe-21==" + VERSION + "/"bigdl-core-xe-21/g' python/llm/setup.py
-      #     sed -i 's/"bigdl-core-xe-21==" + VERSION/"bigdl-core-xe-21"/g' python/llm/setup.py
-
-      # - name: Install ipex-llm and other related packages (install from source)
-      #   shell: cmd
-      #   run: |
-      #     call conda create -n igpu-perf python=${{ matrix.python-version }} libuv -y
-      #     call conda activate igpu-perf
-
-      #     pip install --upgrade pip
-      #     pip install --upgrade wheel
-      #     pip install --upgrade omegaconf pandas
-      #     pip install --upgrade tiktoken einops transformers_stream_generator
-
-      #     cd python\llm
-      #     python setup.py clean --all bdist_wheel --win
-      #     if not exist dist\ipex_llm*.whl (exit /b 1)
-      #     for %%i in (dist\ipex_llm*.whl) do set whl_name=%%i
+      - name: Download llm binary
+        uses: ./.github/actions/llm/download-llm-binary
 
-      #     pip install --pre --upgrade %whl_name%[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
-      #     if %ERRORLEVEL% neq 0 (exit /b 1)
-      #     pip list
-
-      #     call conda deactivate
-
-      - name: Determine desired ipex-llm version
+      - name: Prepare for install ipex-llm from source
         shell: bash
         run: |
-          test_version_date=`date -d 'yesterday' '+%Y%m%d'`
-          echo "TEST_VERSION_DATE=${test_version_date}" >> "$GITHUB_ENV"
+          sed -i 's/"bigdl-core-xe-21==" + VERSION + "/"bigdl-core-xe-21/g' python/llm/setup.py
+          sed -i 's/"bigdl-core-xe-21==" + VERSION/"bigdl-core-xe-21"/g' python/llm/setup.py
 
-      - name: Install ipex-llm and other related packages (install from pypi)
+      - name: Install ipex-llm and other related packages (install from source)
         shell: cmd
         run: |
           call conda create -n igpu-perf python=${{ matrix.python-version }} libuv -y
@@ -367,16 +358,45 @@ jobs:
           pip install --upgrade omegaconf pandas
           pip install --upgrade tiktoken einops transformers_stream_generator
 
-          pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
-          pip show ipex-llm | findstr %TEST_VERSION_DATE%
-          if %ERRORLEVEL% neq 0 (
-            echo "Did not install ipex-llm with excepted version %TEST_VERSION_DATE%"
-            exit /b 1
-          )
+          cd python\llm
+          python setup.py clean --all bdist_wheel --win
+          if not exist dist\ipex_llm*.whl (exit /b 1)
+          for %%i in (dist\ipex_llm*.whl) do set whl_name=%%i
+
+          pip install --pre --upgrade %whl_name%[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
+          if %ERRORLEVEL% neq 0 (exit /b 1)
+          pip install transformers==4.36.2
           pip list
 
           call conda deactivate
 
+      #- name: Determine desired ipex-llm version
+      #  shell: bash
+      #  run: |
+      #    test_version_date=`date -d 'yesterday' '+%Y%m%d'`
+      #    echo "TEST_VERSION_DATE=${test_version_date}" >> "$GITHUB_ENV"
+
+      #- name: Install ipex-llm and other related packages (install from pypi)
+      #  shell: cmd
+      #  run: |
+      #    call conda create -n igpu-perf python=${{ matrix.python-version }} libuv -y
+      #    call conda activate igpu-perf
+
+      #    pip install --upgrade pip
+      #    pip install --upgrade wheel
+      #    pip install --upgrade omegaconf pandas
+      #    pip install --upgrade tiktoken einops transformers_stream_generator
+
+      #    pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
+      #    pip show ipex-llm | findstr %TEST_VERSION_DATE%
+      #    if %ERRORLEVEL% neq 0 (
+      #      echo "Did not install ipex-llm with excepted version %TEST_VERSION_DATE%"
+      #      exit /b 1
+      #    )
+      #    pip list
+
+      #    call conda deactivate
+
       - name: Create env for html generation
         shell: cmd
         run: |
@@ -427,34 +447,34 @@ jobs:
 
           call conda deactivate
 
-      - name: Prepare igpu perf test for Mistral (32-32)
-        shell: bash
-        run: |
-          sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/32-32_434.yaml
+      #- name: Prepare igpu perf test for Mistral (32-32)
+      #  shell: bash
+      #  run: |
+      #    sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
+      #    sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/32-32_434.yaml
 
-      - name: Test on igpu for Mistral (32-32)
-        shell: cmd
-        run: |
-          call conda activate igpu-perf
-          pip install transformers==4.34.0
+      #- name: Test on igpu for Mistral (32-32)
+      #  shell: cmd
+      #  run: |
+      #    call conda activate igpu-perf
+      #    pip install transformers==4.34.0
 
-          call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat"
-          set SYCL_CACHE_PERSISTENT=1
-          set BIGDL_LLM_XMX_DISABLED=1
+      #    call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat"
+      #    set SYCL_CACHE_PERSISTENT=1
+      #    set BIGDL_LLM_XMX_DISABLED=1
 
-          cd python\llm\dev\benchmark\all-in-one
-          move ..\..\..\test\benchmark\igpu-perf\32-32_434.yaml config.yaml
-          set PYTHONIOENCODING=utf-8
-          python run.py >> %CSV_SAVE_PATH%\32-32\log\%LOG_FILE% 2>&1
-          if %ERRORLEVEL% neq 0 (exit /b 1)
+      #    cd python\llm\dev\benchmark\all-in-one
+      #    move ..\..\..\test\benchmark\igpu-perf\32-32_434.yaml config.yaml
+      #    set PYTHONIOENCODING=utf-8
+      #    python run.py >> %CSV_SAVE_PATH%\32-32\log\%LOG_FILE% 2>&1
+      #    if %ERRORLEVEL% neq 0 (exit /b 1)
 
-          call conda deactivate
+      #    call conda deactivate
 
       - name: Prepare igpu perf test for Qwen1.5 (32-32)
         shell: bash
         run: |
-          sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
           sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/32-32_437.yaml
 
       - name: Test on igpu for Qwen1.5 (32-32)
@@ -498,14 +518,14 @@ jobs:
         shell: bash
         run: |
           sed -i 's/32-32/1024-128/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i 's/{today}_test3/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i 's/{today}_test2/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
           sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128.yaml
 
       - name: Test on igpu (1024-128)
         shell: cmd
         run: |
           call conda activate igpu-perf
-          pip install transformers==4.31.0
+          pip install transformers==4.36.2
 
           call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat"
           set SYCL_CACHE_PERSISTENT=1
@@ -521,34 +541,34 @@ jobs:
 
           call conda deactivate
 
-      - name: Prepare igpu perf test for Mistral (1024-128)
-        shell: bash
-        run: |
-          sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_434.yaml
+      #- name: Prepare igpu perf test for Mistral (1024-128)
+      #  shell: bash
+      #  run: |
+      #    sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
+      #    sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_434.yaml
 
-      - name: Test on igpu for Mistral (1024-128)
-        shell: cmd
-        run: |
-          call conda activate igpu-perf
-          pip install transformers==4.34.0
+      #- name: Test on igpu for Mistral (1024-128)
+      #  shell: cmd
+      #  run: |
+      #    call conda activate igpu-perf
+      #    pip install transformers==4.34.0
 
-          call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat"
-          set SYCL_CACHE_PERSISTENT=1
-          set BIGDL_LLM_XMX_DISABLED=1
+      #    call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat"
+      #    set SYCL_CACHE_PERSISTENT=1
+      #    set BIGDL_LLM_XMX_DISABLED=1
 
-          cd python\llm\dev\benchmark\all-in-one
-          move ..\..\..\test\benchmark\igpu-perf\1024-128_434.yaml config.yaml
-          set PYTHONIOENCODING=utf-8
-          python run.py >> %CSV_SAVE_PATH%\1024-128\log\%LOG_FILE% 2>&1
-          if %ERRORLEVEL% neq 0 (exit /b 1)
+      #    cd python\llm\dev\benchmark\all-in-one
+      #    move ..\..\..\test\benchmark\igpu-perf\1024-128_434.yaml config.yaml
+      #    set PYTHONIOENCODING=utf-8
+      #    python run.py >> %CSV_SAVE_PATH%\1024-128\log\%LOG_FILE% 2>&1
+      #    if %ERRORLEVEL% neq 0 (exit /b 1)
 
-          call conda deactivate
+      #    call conda deactivate
 
       - name: Prepare igpu perf test for Qwen 1.5 (1024-128)
         shell: bash
         run: |
-          sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
           sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_437.yaml
 
       - name: Test on igpu for Qwen 1.5 (1024-128)
@@ -591,14 +611,14 @@ jobs:
         shell: bash
         run: |
           sed -i 's/1024-128/2048-256/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i 's/{today}_test3/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i 's/{today}_test2/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
           sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256.yaml
 
       - name: Test on igpu (2048-256)
         shell: cmd
         run: |
           call conda activate igpu-perf
-          pip install transformers==4.31.0
+          pip install transformers==4.36.2
 
           call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat"
           set SYCL_CACHE_PERSISTENT=1
@@ -614,34 +634,34 @@ jobs:
 
           call conda deactivate
 
-      - name: Prepare igpu perf test for Mistral (2048-256)
-        shell: bash
-        run: |
-          sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_434.yaml
+      #- name: Prepare igpu perf test for Mistral (2048-256)
+      #  shell: bash
+      #  run: |
+      #    sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
+      #    sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_434.yaml
 
-      - name: Test on igpu for Mistral (2048-256)
-        shell: cmd
-        run: |
-          call conda activate igpu-perf
-          pip install transformers==4.34.0
+      #- name: Test on igpu for Mistral (2048-256)
+      #  shell: cmd
+      #  run: |
+      #    call conda activate igpu-perf
+      #    pip install transformers==4.34.0
 
-          call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat"
-          set SYCL_CACHE_PERSISTENT=1
-          set BIGDL_LLM_XMX_DISABLED=1
+      #    call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat"
+      #    set SYCL_CACHE_PERSISTENT=1
+      #    set BIGDL_LLM_XMX_DISABLED=1
 
-          cd python\llm\dev\benchmark\all-in-one
-          move ..\..\..\test\benchmark\igpu-perf\2048-256_434.yaml config.yaml
-          set PYTHONIOENCODING=utf-8
-          python run.py >> %CSV_SAVE_PATH%\2048-256\log\%LOG_FILE% 2>&1
-          if %ERRORLEVEL% neq 0 (exit /b 1)
+      #    cd python\llm\dev\benchmark\all-in-one
+      #    move ..\..\..\test\benchmark\igpu-perf\2048-256_434.yaml config.yaml
+      #    set PYTHONIOENCODING=utf-8
+      #    python run.py >> %CSV_SAVE_PATH%\2048-256\log\%LOG_FILE% 2>&1
+      #    if %ERRORLEVEL% neq 0 (exit /b 1)
 
-          call conda deactivate
+      #    call conda deactivate
 
       - name: Prepare igpu perf test for Qwen 1.5 (2048-256)
         shell: bash
         run: |
-          sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
           sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_437.yaml
 
       - name: Test on igpu for Qwen 1.5 (2048-256)
@@ -684,14 +704,14 @@ jobs:
         shell: bash
         run: |
           sed -i 's/2048-256/1024-128/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i 's/{today}_test3/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i 's/{today}_test2/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
           sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml
 
       - name: Test on igpu (load_low_bit 1024-128)
         shell: cmd
         run: |
           call conda activate igpu-perf
-          pip install transformers==4.31.0
+          pip install transformers==4.36.2
 
           call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat"
           set SYCL_CACHE_PERSISTENT=1
@@ -707,34 +727,34 @@ jobs:
 
           call conda deactivate
 
-      - name: Prepare igpu perf test for Mistral (load_low_bit 1024-128)
-        shell: bash
-        run: |
-          sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit_434.yaml
+      #- name: Prepare igpu perf test for Mistral (load_low_bit 1024-128)
+      #  shell: bash
+      #  run: |
+      #    sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
+      #    sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit_434.yaml
 
-      - name: Test on igpu for Mistral (load_low_bit 1024-128)
-        shell: cmd
-        run: |
-          call conda activate igpu-perf
-          pip install transformers==4.34.0
+      #- name: Test on igpu for Mistral (load_low_bit 1024-128)
+      #  shell: cmd
+      #  run: |
+      #    call conda activate igpu-perf
+      #    pip install transformers==4.34.0
 
-          call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat"
-          set SYCL_CACHE_PERSISTENT=1
-          set BIGDL_LLM_XMX_DISABLED=1
+      #    call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat"
+      #    set SYCL_CACHE_PERSISTENT=1
+      #    set BIGDL_LLM_XMX_DISABLED=1
 
-          cd python\llm\dev\benchmark\all-in-one
-          move ..\..\..\test\benchmark\igpu-perf\1024-128_loadlowbit_434.yaml config.yaml
-          set PYTHONIOENCODING=utf-8
-          python run.py >> %CSV_SAVE_PATH%\1024-128_loadlowbit\log\%LOG_FILE% 2>&1
-          if %ERRORLEVEL% neq 0 (exit /b 1)
+      #    cd python\llm\dev\benchmark\all-in-one
+      #    move ..\..\..\test\benchmark\igpu-perf\1024-128_loadlowbit_434.yaml config.yaml
+      #    set PYTHONIOENCODING=utf-8
+      #    python run.py >> %CSV_SAVE_PATH%\1024-128_loadlowbit\log\%LOG_FILE% 2>&1
+      #    if %ERRORLEVEL% neq 0 (exit /b 1)
 
-          call conda deactivate
+      #    call conda deactivate
 
       - name: Prepare igpu perf test for Qwen 1.5 (load_low_bit 1024-128)
         shell: bash
         run: |
-          sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
           sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit_437.yaml
 
       - name: Test on igpu for Qwen 1.5 (load_low_bit 1024-128)
@@ -775,14 +795,14 @@ jobs:
       - name: Prepare igpu perf test (int4+fp16 1024-128)
         shell: bash
         run: |
-          sed -i 's/{today}_test3/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i 's/{today}_test2/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
           sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml
 
       - name: Test on igpu (int4+fp16 1024-128)
         shell: cmd
         run: |
           call conda activate igpu-perf
-          pip install transformers==4.31.0
+          pip install transformers==4.36.2
 
           call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat"
           set SYCL_CACHE_PERSISTENT=1
@@ -798,34 +818,34 @@ jobs:
 
           call conda deactivate
 
-      - name: Prepare igpu perf test for Mistral (int4+fp16 1024-128)
-        shell: bash
-        run: |
-          sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_434.yaml
+      #- name: Prepare igpu perf test for Mistral (int4+fp16 1024-128)
+      #  shell: bash
+      #  run: |
+      #    sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
+      #    sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_434.yaml
 
-      - name: Test on igpu for Mistral (int4+fp16 1024-128)
-        shell: cmd
-        run: |
-          call conda activate igpu-perf
-          pip install transformers==4.34.0
+      #- name: Test on igpu for Mistral (int4+fp16 1024-128)
+      #  shell: cmd
+      #  run: |
+      #    call conda activate igpu-perf
+      #    pip install transformers==4.34.0
 
-          call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat"
-          set SYCL_CACHE_PERSISTENT=1
-          set BIGDL_LLM_XMX_DISABLED=1
+      #    call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat"
+      #    set SYCL_CACHE_PERSISTENT=1
+      #    set BIGDL_LLM_XMX_DISABLED=1
 
-          cd python\llm\dev\benchmark\all-in-one
-          move ..\..\..\test\benchmark\igpu-perf\1024-128_int4_fp16_434.yaml config.yaml
-          set PYTHONIOENCODING=utf-8
-          python run.py >> %CSV_SAVE_PATH%\1024-128_int4_fp16\log\%LOG_FILE% 2>&1
-          if %ERRORLEVEL% neq 0 (exit /b 1)
+      #    cd python\llm\dev\benchmark\all-in-one
+      #    move ..\..\..\test\benchmark\igpu-perf\1024-128_int4_fp16_434.yaml config.yaml
+      #    set PYTHONIOENCODING=utf-8
+      #    python run.py >> %CSV_SAVE_PATH%\1024-128_int4_fp16\log\%LOG_FILE% 2>&1
+      #    if %ERRORLEVEL% neq 0 (exit /b 1)
 
-          call conda deactivate
+      #    call conda deactivate
 
       - name: Prepare igpu perf test for Qwen 1.5 (int4+fp16 1024-128)
         shell: bash
         run: |
-          sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
           sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_437.yaml
 
       - name: Test on igpu for Qwen 1.5 (int4+fp16 1024-128)
diff --git a/python/llm/test/benchmark/arc-perf-test-436.yaml b/python/llm/test/benchmark/arc-perf-test-436.yaml
new file mode 100644
index 00000000000..409834daa4f
--- /dev/null
+++ b/python/llm/test/benchmark/arc-perf-test-436.yaml
@@ -0,0 +1,20 @@
+repo_id:
+  - 'THUDM/chatglm2-6b_for_4.36'
+  - 'mosaicml/mpt-7b-chat_for_4.36'
+  - 'baichuan-inc/Baichuan2-7B-Chat'
+  - 'baichuan-inc/Baichuan2-13B-Chat-4bit'
+local_model_hub: '/mnt/disk0/models'
+warm_up: 1
+num_trials: 3
+num_beams: 1 # default to greedy search
+low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
+batch_size: 1 # default to 1
+in_out_pairs:
+  - '32-32'
+  - '1024-128'
+  - '2048-256'
+test_api:
+  - "transformer_int4_gpu"  # on Intel GPU
+cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
+exclude:
+  - 'baichuan-inc/Baichuan2-13B-Chat-4bit:2048'
diff --git a/python/llm/test/benchmark/arc-perf-test.yaml b/python/llm/test/benchmark/arc-perf-test.yaml
index 47f74b20e7e..07f0633fcd9 100644
--- a/python/llm/test/benchmark/arc-perf-test.yaml
+++ b/python/llm/test/benchmark/arc-perf-test.yaml
@@ -1,10 +1,10 @@
 repo_id:
   - 'meta-llama/Llama-2-7b-chat-hf'
   - 'meta-llama/Llama-2-13b-chat-hf'
-  - 'THUDM/chatglm2-6b'
+#  - 'THUDM/chatglm2-6b'
   - 'THUDM/chatglm3-6b-4bit'
   - 'tiiuae/falcon-7b-instruct-with-patch'
-  - 'mosaicml/mpt-7b-chat'
+#  - 'mosaicml/mpt-7b-chat'
   - 'redpajama/gptneox-7b-redpajama-bf16'
   - 'bigcode/starcoder-15.5b-4bit'
   - 'databricks/dolly-v1-6b'
@@ -13,10 +13,11 @@ repo_id:
   - 'internlm/internlm-chat-7b-8k'
   - 'Qwen/Qwen-7B-Chat'
   - 'BAAI/AquilaChat-7B'
-  - 'baichuan-inc/Baichuan2-7B-Chat'
-  - 'baichuan-inc/Baichuan2-13B-Chat-4bit'
+#  - 'baichuan-inc/Baichuan2-7B-Chat'
+#  - 'baichuan-inc/Baichuan2-13B-Chat-4bit'
   - 'bigscience/bloomz-7b1'
   - 'fnlp/moss-moon-003-sft-4bit'
+  - 'mistralai/Mistral-7B-v0.1'
 local_model_hub: '/mnt/disk1/models'
 warm_up: 1
 num_trials: 3
@@ -33,5 +34,4 @@ cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu w
 exclude:
   - 'fnlp/moss-moon-003-sft-4bit:1024'
   - 'fnlp/moss-moon-003-sft-4bit:2048'
-  - 'baichuan-inc/Baichuan2-13B-Chat-4bit:2048'
   - 'bigscience/bloomz-7b1:2048'
diff --git a/python/llm/test/benchmark/cpu-perf-test-436.yaml b/python/llm/test/benchmark/cpu-perf-test-436.yaml
new file mode 100644
index 00000000000..c7ba0c1ded8
--- /dev/null
+++ b/python/llm/test/benchmark/cpu-perf-test-436.yaml
@@ -0,0 +1,26 @@
+repo_id:
+  - 'THUDM/chatglm2-6b'
+  - 'THUDM/chatglm3-6b'
+  - 'baichuan-inc/Baichuan2-7B-Chat'
+  - 'baichuan-inc/Baichuan2-13B-Chat'
+local_model_hub: '/mnt/disk1/models/updated_for_4.36'
+warm_up: 1
+num_trials: 3
+num_beams: 1 # default to greedy search
+low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
+batch_size: 1 # default to 1
+in_out_pairs:
+  - '32-32'
+  - '1024-128'
+  - '2048-256'
+test_api:
+  - "transformer_int4"
+  # - "native_int4"
+  # - "optimize_model"
+  # - "pytorch_autocast_bf16"
+  # - "ipex_fp16_gpu" # on Intel GPU
+  # - "transformer_int4_gpu"  # on Intel GPU
+  # - "optimize_model_gpu"  # on Intel GPU
+  # - "deepspeed_transformer_int4_cpu" # on Intel SPR Server
+  # - "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
+cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
diff --git a/python/llm/test/benchmark/cpu-perf-test.yaml b/python/llm/test/benchmark/cpu-perf-test.yaml
index 92b12750dbb..c65e3f7618e 100644
--- a/python/llm/test/benchmark/cpu-perf-test.yaml
+++ b/python/llm/test/benchmark/cpu-perf-test.yaml
@@ -1,10 +1,10 @@
 repo_id:
   - 'meta-llama/Llama-2-7b-chat-hf'
   - 'meta-llama/Llama-2-13b-chat-hf'
-  - 'THUDM/chatglm2-6b'
-  - 'THUDM/chatglm3-6b'
-  - 'baichuan-inc/Baichuan2-7B-Chat'
-  - 'baichuan-inc/Baichuan2-13B-Chat'
+#  - 'THUDM/chatglm2-6b'
+#  - 'THUDM/chatglm3-6b_for_4.36'
+#  - 'baichuan-inc/Baichuan2-7B-Chat'
+#  - 'baichuan-inc/Baichuan2-13B-Chat'
   - 'Qwen/Qwen-14B-Chat'
 local_model_hub: '/mnt/disk1/models'
 warm_up: 1
diff --git a/python/llm/test/benchmark/igpu-perf/1024-128.yaml b/python/llm/test/benchmark/igpu-perf/1024-128.yaml
index 0bc604795b5..b7975c1d0d1 100644
--- a/python/llm/test/benchmark/igpu-perf/1024-128.yaml
+++ b/python/llm/test/benchmark/igpu-perf/1024-128.yaml
@@ -16,6 +16,7 @@ repo_id:
   - 'RWKV/rwkv-4-world-7b'
   - 'RWKV/rwkv-5-world-7b'
   - 'IEITYuan/Yuan2-2B-hf'
+  - 'mistralai/Mistral-7B-Instruct-v0.1'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3
diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml
index 0ac8951657c..73eced1e400 100644
--- a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml
+++ b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml
@@ -16,6 +16,7 @@ repo_id:
   # - 'RWKV/rwkv-4-world-7b'
   # - 'RWKV/rwkv-5-world-7b'
   - 'IEITYuan/Yuan2-2B-hf'
+  - 'mistralai/Mistral-7B-Instruct-v0.1'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3
diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml
index a8dd75eac73..f3e72dd3529 100644
--- a/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml
+++ b/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml
@@ -16,6 +16,7 @@ repo_id:
   - 'RWKV/rwkv-4-world-7b'
   - 'RWKV/rwkv-5-world-7b'
   - 'IEITYuan/Yuan2-2B-hf'
+  - 'mistralai/Mistral-7B-Instruct-v0.1'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3
diff --git a/python/llm/test/benchmark/igpu-perf/2048-256.yaml b/python/llm/test/benchmark/igpu-perf/2048-256.yaml
index 5aa790150e6..4edfcf581e4 100644
--- a/python/llm/test/benchmark/igpu-perf/2048-256.yaml
+++ b/python/llm/test/benchmark/igpu-perf/2048-256.yaml
@@ -16,6 +16,7 @@ repo_id:
   - 'RWKV/rwkv-4-world-7b'
   - 'RWKV/rwkv-5-world-7b'
   - 'IEITYuan/Yuan2-2B-hf'
+  - 'mistralai/Mistral-7B-Instruct-v0.1'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3
diff --git a/python/llm/test/benchmark/igpu-perf/32-32.yaml b/python/llm/test/benchmark/igpu-perf/32-32.yaml
index 6f4fd2f0b49..dc6ce9bc6bc 100644
--- a/python/llm/test/benchmark/igpu-perf/32-32.yaml
+++ b/python/llm/test/benchmark/igpu-perf/32-32.yaml
@@ -16,6 +16,7 @@ repo_id:
   - 'RWKV/rwkv-4-world-7b'
   - 'RWKV/rwkv-5-world-7b'
   - 'IEITYuan/Yuan2-2B-hf'
+  - 'mistralai/Mistral-7B-Instruct-v0.1'
 local_model_hub: 'path to your local model hub'
 warm_up: 3
 num_trials: 5

From 92176620f618357c9d8afcf09d8ad19d6f93046f Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Tue, 30 Apr 2024 16:10:52 -0700
Subject: [PATCH 22/57] update

---
 .github/workflows/llm_performance_tests.yml | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/llm_performance_tests.yml b/.github/workflows/llm_performance_tests.yml
index 248380249c8..1cd670b0383 100644
--- a/.github/workflows/llm_performance_tests.yml
+++ b/.github/workflows/llm_performance_tests.yml
@@ -11,14 +11,14 @@ permissions:
 # Controls when the action will run.
 on:
   schedule:
-   # - cron: "30 16 * * *" # GMT time, 16:30 GMT == 00:30 China
+    - cron: "30 16 * * *" # GMT time, 16:30 GMT == 00:30 China
   # please uncomment it for PR tests
-   pull_request:
-     branches: [main]
-     paths:
-       - ".github/workflows/llm_performance_tests.yml"
-       - "python/llm/test/benchmark/**"
-       - "python/llm/dev/benchmark/all-in-one/**"
+  pull_request:
+    branches: [main]
+    paths:
+      - ".github/workflows/llm_performance_tests.yml"
+      - "python/llm/test/benchmark/**"
+      - "python/llm/dev/benchmark/all-in-one/**"
   workflow_dispatch:
   workflow_call:
 

From 3ad25b76917c96d3fc88b89dba40ef4cd1c35ac3 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Tue, 30 Apr 2024 17:22:58 -0700
Subject: [PATCH 23/57] update

---
 python/llm/test/benchmark/arc-perf-test-436.yaml             | 1 +
 python/llm/test/benchmark/arc-perf-test.yaml                 | 2 +-
 python/llm/test/benchmark/core-perf-test.yaml                | 2 +-
 python/llm/test/benchmark/cpu-perf-test.yaml                 | 2 +-
 python/llm/test/benchmark/igpu-perf/1024-128.yaml            | 2 +-
 python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml  | 2 +-
 python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml | 2 +-
 python/llm/test/benchmark/igpu-perf/2048-256.yaml            | 2 +-
 python/llm/test/benchmark/igpu-perf/32-32.yaml               | 2 +-
 9 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/python/llm/test/benchmark/arc-perf-test-436.yaml b/python/llm/test/benchmark/arc-perf-test-436.yaml
index 409834daa4f..bebc7a181ac 100644
--- a/python/llm/test/benchmark/arc-perf-test-436.yaml
+++ b/python/llm/test/benchmark/arc-perf-test-436.yaml
@@ -3,6 +3,7 @@ repo_id:
   - 'mosaicml/mpt-7b-chat_for_4.36'
   - 'baichuan-inc/Baichuan2-7B-Chat'
   - 'baichuan-inc/Baichuan2-13B-Chat-4bit'
+  - 'internlm/internlm-chat-7b'
 local_model_hub: '/mnt/disk0/models'
 warm_up: 1
 num_trials: 3
diff --git a/python/llm/test/benchmark/arc-perf-test.yaml b/python/llm/test/benchmark/arc-perf-test.yaml
index 07f0633fcd9..c16a8c014f5 100644
--- a/python/llm/test/benchmark/arc-perf-test.yaml
+++ b/python/llm/test/benchmark/arc-perf-test.yaml
@@ -10,7 +10,7 @@ repo_id:
   - 'databricks/dolly-v1-6b'
   - 'databricks/dolly-v2-7b'
   - 'databricks/dolly-v2-12b'
-  - 'internlm/internlm-chat-7b-8k'
+#  - 'internlm/internlm-chat-7b'
   - 'Qwen/Qwen-7B-Chat'
   - 'BAAI/AquilaChat-7B'
 #  - 'baichuan-inc/Baichuan2-7B-Chat'
diff --git a/python/llm/test/benchmark/core-perf-test.yaml b/python/llm/test/benchmark/core-perf-test.yaml
index e922d21886e..049a807bd76 100644
--- a/python/llm/test/benchmark/core-perf-test.yaml
+++ b/python/llm/test/benchmark/core-perf-test.yaml
@@ -2,7 +2,7 @@ repo_id:
   - 'THUDM/chatglm2-6b'
   - 'THUDM/chatglm3-6b'
   - 'baichuan-inc/Baichuan2-7B-Chat'
-  - 'internlm/internlm-chat-7b-8k'
+#  - 'internlm/internlm-chat-7b'
   - 'Qwen/Qwen-7B-Chat'
   - 'BAAI/AquilaChat2-7B'
   - 'meta-llama/Llama-2-7b-chat-hf'
diff --git a/python/llm/test/benchmark/cpu-perf-test.yaml b/python/llm/test/benchmark/cpu-perf-test.yaml
index c65e3f7618e..6095e2ed805 100644
--- a/python/llm/test/benchmark/cpu-perf-test.yaml
+++ b/python/llm/test/benchmark/cpu-perf-test.yaml
@@ -2,7 +2,7 @@ repo_id:
   - 'meta-llama/Llama-2-7b-chat-hf'
   - 'meta-llama/Llama-2-13b-chat-hf'
 #  - 'THUDM/chatglm2-6b'
-#  - 'THUDM/chatglm3-6b_for_4.36'
+#  - 'THUDM/chatglm3-6b'
 #  - 'baichuan-inc/Baichuan2-7B-Chat'
 #  - 'baichuan-inc/Baichuan2-13B-Chat'
   - 'Qwen/Qwen-14B-Chat'
diff --git a/python/llm/test/benchmark/igpu-perf/1024-128.yaml b/python/llm/test/benchmark/igpu-perf/1024-128.yaml
index b7975c1d0d1..0b53b28a1f5 100644
--- a/python/llm/test/benchmark/igpu-perf/1024-128.yaml
+++ b/python/llm/test/benchmark/igpu-perf/1024-128.yaml
@@ -3,7 +3,7 @@ repo_id:
   - 'THUDM/chatglm3-6b'
   - 'baichuan-inc/Baichuan2-7B-Chat'
   - 'baichuan-inc/Baichuan2-13B-Chat'
-  - 'internlm/internlm-chat-7b-8k'
+#  - 'internlm/internlm-chat-7b'
   - 'Qwen/Qwen-7B-Chat'
   - 'BAAI/AquilaChat2-7B'
   # - '01-ai/Yi-6B'
diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml
index 73eced1e400..85a0c2fb038 100644
--- a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml
+++ b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml
@@ -3,7 +3,7 @@ repo_id:
   - 'THUDM/chatglm3-6b'
   - 'baichuan-inc/Baichuan2-7B-Chat'
   - 'baichuan-inc/Baichuan2-13B-Chat'
-  - 'internlm/internlm-chat-7b-8k'
+#  - 'internlm/internlm-chat-7b'
   - 'Qwen/Qwen-7B-Chat'
   - 'BAAI/AquilaChat2-7B'
   # - '01-ai/Yi-6B'
diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml
index f3e72dd3529..f941fcd8a5f 100644
--- a/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml
+++ b/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml
@@ -3,7 +3,7 @@ repo_id:
   - 'THUDM/chatglm3-6b'
   - 'baichuan-inc/Baichuan2-7B-Chat'
   - 'baichuan-inc/Baichuan2-13B-Chat'
-  - 'internlm/internlm-chat-7b-8k'
+#  - 'internlm/internlm-chat-7b'
   - 'Qwen/Qwen-7B-Chat'
   - 'BAAI/AquilaChat2-7B'
   # - '01-ai/Yi-6B'
diff --git a/python/llm/test/benchmark/igpu-perf/2048-256.yaml b/python/llm/test/benchmark/igpu-perf/2048-256.yaml
index 4edfcf581e4..3b03fd554cf 100644
--- a/python/llm/test/benchmark/igpu-perf/2048-256.yaml
+++ b/python/llm/test/benchmark/igpu-perf/2048-256.yaml
@@ -3,7 +3,7 @@ repo_id:
   - 'THUDM/chatglm3-6b'
   - 'baichuan-inc/Baichuan2-7B-Chat'
   - 'baichuan-inc/Baichuan2-13B-Chat'
-  - 'internlm/internlm-chat-7b-8k'
+#  - 'internlm/internlm-chat-7b'
   - 'Qwen/Qwen-7B-Chat'
   - 'BAAI/AquilaChat2-7B'
   # - '01-ai/Yi-6B'
diff --git a/python/llm/test/benchmark/igpu-perf/32-32.yaml b/python/llm/test/benchmark/igpu-perf/32-32.yaml
index dc6ce9bc6bc..84ad699369c 100644
--- a/python/llm/test/benchmark/igpu-perf/32-32.yaml
+++ b/python/llm/test/benchmark/igpu-perf/32-32.yaml
@@ -3,7 +3,7 @@ repo_id:
   - 'THUDM/chatglm3-6b'
   - 'baichuan-inc/Baichuan2-7B-Chat'
   - 'baichuan-inc/Baichuan2-13B-Chat'
-  - 'internlm/internlm-chat-7b-8k'
+#  - 'internlm/internlm-chat-7b'
   - 'Qwen/Qwen-7B-Chat'
   - 'BAAI/AquilaChat2-7B'
   # - '01-ai/Yi-6B'

From 8ee92d2e0319f181bb506009c4f68a2deda11351 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Tue, 30 Apr 2024 23:27:02 -0700
Subject: [PATCH 24/57] update

---
 .github/workflows/llm_performance_tests.yml  | 4 ++--
 python/llm/test/benchmark/arc-perf-test.yaml | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/llm_performance_tests.yml b/.github/workflows/llm_performance_tests.yml
index 1cd670b0383..bb5678931d9 100644
--- a/.github/workflows/llm_performance_tests.yml
+++ b/.github/workflows/llm_performance_tests.yml
@@ -344,8 +344,8 @@ jobs:
       - name: Prepare for install ipex-llm from source
         shell: bash
         run: |
-          sed -i 's/"bigdl-core-xe-21==" + VERSION + "/"bigdl-core-xe-21/g' python/llm/setup.py
-          sed -i 's/"bigdl-core-xe-21==" + VERSION/"bigdl-core-xe-21"/g' python/llm/setup.py
+          sed -i 's/"bigdl-core-xe-21==" + CORE_XE_VERSION/"bigdl-core-xe-21/g' python/llm/setup.py
+          sed -i 's/"bigdl-core-xe-esimd-21==" + CORE_XE_VERSION/"bigdl-core-xe-esimd-21"/g' python/llm/setup.py
 
       - name: Install ipex-llm and other related packages (install from source)
         shell: cmd
diff --git a/python/llm/test/benchmark/arc-perf-test.yaml b/python/llm/test/benchmark/arc-perf-test.yaml
index c16a8c014f5..7552b7f84f1 100644
--- a/python/llm/test/benchmark/arc-perf-test.yaml
+++ b/python/llm/test/benchmark/arc-perf-test.yaml
@@ -16,7 +16,7 @@ repo_id:
 #  - 'baichuan-inc/Baichuan2-7B-Chat'
 #  - 'baichuan-inc/Baichuan2-13B-Chat-4bit'
   - 'bigscience/bloomz-7b1'
-  - 'fnlp/moss-moon-003-sft-4bit'
+#  - 'fnlp/moss-moon-003-sft-4bit'
   - 'mistralai/Mistral-7B-v0.1'
 local_model_hub: '/mnt/disk1/models'
 warm_up: 1
@@ -32,6 +32,6 @@ test_api:
   - "transformer_int4_gpu"  # on Intel GPU
 cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
 exclude:
-  - 'fnlp/moss-moon-003-sft-4bit:1024'
-  - 'fnlp/moss-moon-003-sft-4bit:2048'
+#  - 'fnlp/moss-moon-003-sft-4bit:1024'
+#  - 'fnlp/moss-moon-003-sft-4bit:2048'
   - 'bigscience/bloomz-7b1:2048'

From 45d23834dcf5438de753390c85ecb16cb920941d Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Tue, 30 Apr 2024 23:52:56 -0700
Subject: [PATCH 25/57] update

---
 .github/workflows/llm_performance_tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/llm_performance_tests.yml b/.github/workflows/llm_performance_tests.yml
index bb5678931d9..d27f318786e 100644
--- a/.github/workflows/llm_performance_tests.yml
+++ b/.github/workflows/llm_performance_tests.yml
@@ -344,7 +344,7 @@ jobs:
       - name: Prepare for install ipex-llm from source
         shell: bash
         run: |
-          sed -i 's/"bigdl-core-xe-21==" + CORE_XE_VERSION/"bigdl-core-xe-21/g' python/llm/setup.py
+          sed -i 's/"bigdl-core-xe-21==" + CORE_XE_VERSION/"bigdl-core-xe-21"/g' python/llm/setup.py
           sed -i 's/"bigdl-core-xe-esimd-21==" + CORE_XE_VERSION/"bigdl-core-xe-esimd-21"/g' python/llm/setup.py
 
       - name: Install ipex-llm and other related packages (install from source)

From e968252c3c58d13450d5987a3ad6c5b4a1fd045a Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Wed, 1 May 2024 10:24:27 -0700
Subject: [PATCH 26/57] update

---
 python/llm/test/benchmark/arc-perf-test-436.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/llm/test/benchmark/arc-perf-test-436.yaml b/python/llm/test/benchmark/arc-perf-test-436.yaml
index bebc7a181ac..1b31cfa2aad 100644
--- a/python/llm/test/benchmark/arc-perf-test-436.yaml
+++ b/python/llm/test/benchmark/arc-perf-test-436.yaml
@@ -1,6 +1,6 @@
 repo_id:
-  - 'THUDM/chatglm2-6b_for_4.36'
-  - 'mosaicml/mpt-7b-chat_for_4.36'
+  - 'THUDM/chatglm2-6b'
+  - 'mosaicml/mpt-7b-chat'
   - 'baichuan-inc/Baichuan2-7B-Chat'
   - 'baichuan-inc/Baichuan2-13B-Chat-4bit'
   - 'internlm/internlm-chat-7b'

From d59f68c0480f45bc16872c91bf529a89498ab01d Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Wed, 1 May 2024 15:37:35 -0700
Subject: [PATCH 27/57] update

---
 python/llm/test/benchmark/arc-perf-test-436.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/llm/test/benchmark/arc-perf-test-436.yaml b/python/llm/test/benchmark/arc-perf-test-436.yaml
index 1b31cfa2aad..f619987bfc9 100644
--- a/python/llm/test/benchmark/arc-perf-test-436.yaml
+++ b/python/llm/test/benchmark/arc-perf-test-436.yaml
@@ -19,3 +19,4 @@ test_api:
 cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
 exclude:
   - 'baichuan-inc/Baichuan2-13B-Chat-4bit:2048'
+  - 'mosaicml/mpt-7b-chat:2048'

From f44e9a4aa1de8ceb92a344e0d46cdf1a53bc0a84 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Wed, 1 May 2024 16:33:02 -0700
Subject: [PATCH 28/57] update

---
 .github/workflows/llm_performance_tests.yml | 83 +++++++++++----------
 1 file changed, 42 insertions(+), 41 deletions(-)

diff --git a/.github/workflows/llm_performance_tests.yml b/.github/workflows/llm_performance_tests.yml
index d27f318786e..337f8361b28 100644
--- a/.github/workflows/llm_performance_tests.yml
+++ b/.github/workflows/llm_performance_tests.yml
@@ -158,7 +158,7 @@ jobs:
           cd python/llm/dev/benchmark/all-in-one
           python ../../../test/benchmark/check_results.py -c test1 -y ../../../test/benchmark/arc-perf-test.yaml
           python ../../../test/benchmark/check_results.py -c test2 -y ../../../test/benchmark/arc-perf-test-436.yaml
-          python ../../../test/benchmark/check_results.py -c test3 -y ../../../test/benchmark/arc-perf-test-transformers-437.yaml
+          python ../../../test/benchmark/check_results.py -c test3 -y ../../../test/benchmark/arc-perf-transformers-437.yaml
           # python ../../../test/benchmark/check_results.py -c test2 -y ../../../test/benchmark/arc-perf-transformers-434.yaml
           find . -name "*test*.csv" -delete
           if [ ${{ github.event_name }} == "schedule" ] || [ ${{ github.event_name }} == "workflow_dispatch" ]; then
@@ -338,16 +338,45 @@ jobs:
       # TODO: Put the ipex-llm related install process for win gpu into a action function
 
       # Please uncomment it and commment the install from pypi for PR tests
-      - name: Download llm binary
-        uses: ./.github/actions/llm/download-llm-binary
+      #- name: Download llm binary
+      #  uses: ./.github/actions/llm/download-llm-binary
+
+      #- name: Prepare for install ipex-llm from source
+      #  shell: bash
+      #  run: |
+      #    sed -i 's/"bigdl-core-xe-21==" + CORE_XE_VERSION/"bigdl-core-xe-21"/g' python/llm/setup.py
+      #    sed -i 's/"bigdl-core-xe-esimd-21==" + CORE_XE_VERSION/"bigdl-core-xe-esimd-21"/g' python/llm/setup.py
+
+      #- name: Install ipex-llm and other related packages (install from source)
+      #  shell: cmd
+      #  run: |
+      #    call conda create -n igpu-perf python=${{ matrix.python-version }} libuv -y
+      #    call conda activate igpu-perf
+
+      #    pip install --upgrade pip
+      #    pip install --upgrade wheel
+      #    pip install --upgrade omegaconf pandas
+      #    pip install --upgrade tiktoken einops transformers_stream_generator
+
+      #    cd python\llm
+      #    python setup.py clean --all bdist_wheel --win
+      #    if not exist dist\ipex_llm*.whl (exit /b 1)
+      #    for %%i in (dist\ipex_llm*.whl) do set whl_name=%%i
 
-      - name: Prepare for install ipex-llm from source
+      #    pip install --pre --upgrade %whl_name%[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
+      #    if %ERRORLEVEL% neq 0 (exit /b 1)
+      #    pip install transformers==4.36.2
+      #    pip list
+
+      #    call conda deactivate
+
+      - name: Determine desired ipex-llm version
         shell: bash
         run: |
-          sed -i 's/"bigdl-core-xe-21==" + CORE_XE_VERSION/"bigdl-core-xe-21"/g' python/llm/setup.py
-          sed -i 's/"bigdl-core-xe-esimd-21==" + CORE_XE_VERSION/"bigdl-core-xe-esimd-21"/g' python/llm/setup.py
+          test_version_date=`date -d 'yesterday' '+%Y%m%d'`
+          echo "TEST_VERSION_DATE=${test_version_date}" >> "$GITHUB_ENV"
 
-      - name: Install ipex-llm and other related packages (install from source)
+      - name: Install ipex-llm and other related packages (install from pypi)
         shell: cmd
         run: |
           call conda create -n igpu-perf python=${{ matrix.python-version }} libuv -y
@@ -358,45 +387,17 @@ jobs:
           pip install --upgrade omegaconf pandas
           pip install --upgrade tiktoken einops transformers_stream_generator
 
-          cd python\llm
-          python setup.py clean --all bdist_wheel --win
-          if not exist dist\ipex_llm*.whl (exit /b 1)
-          for %%i in (dist\ipex_llm*.whl) do set whl_name=%%i
-
-          pip install --pre --upgrade %whl_name%[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
-          if %ERRORLEVEL% neq 0 (exit /b 1)
+          pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
+          pip show ipex-llm | findstr %TEST_VERSION_DATE%
+          if %ERRORLEVEL% neq 0 (
+            echo "Did not install ipex-llm with excepted version %TEST_VERSION_DATE%"
+            exit /b 1
+          )
           pip install transformers==4.36.2
           pip list
 
           call conda deactivate
 
-      #- name: Determine desired ipex-llm version
-      #  shell: bash
-      #  run: |
-      #    test_version_date=`date -d 'yesterday' '+%Y%m%d'`
-      #    echo "TEST_VERSION_DATE=${test_version_date}" >> "$GITHUB_ENV"
-
-      #- name: Install ipex-llm and other related packages (install from pypi)
-      #  shell: cmd
-      #  run: |
-      #    call conda create -n igpu-perf python=${{ matrix.python-version }} libuv -y
-      #    call conda activate igpu-perf
-
-      #    pip install --upgrade pip
-      #    pip install --upgrade wheel
-      #    pip install --upgrade omegaconf pandas
-      #    pip install --upgrade tiktoken einops transformers_stream_generator
-
-      #    pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
-      #    pip show ipex-llm | findstr %TEST_VERSION_DATE%
-      #    if %ERRORLEVEL% neq 0 (
-      #      echo "Did not install ipex-llm with excepted version %TEST_VERSION_DATE%"
-      #      exit /b 1
-      #    )
-      #    pip list
-
-      #    call conda deactivate
-
       - name: Create env for html generation
         shell: cmd
         run: |

From f9ece00d1cef91cc29f05e9454fd8e412e1a63c3 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Wed, 1 May 2024 16:35:59 -0700
Subject: [PATCH 29/57] update

---
 .github/workflows/llm_performance_tests.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/llm_performance_tests.yml b/.github/workflows/llm_performance_tests.yml
index 337f8361b28..b687c3a2ffc 100644
--- a/.github/workflows/llm_performance_tests.yml
+++ b/.github/workflows/llm_performance_tests.yml
@@ -443,6 +443,7 @@ jobs:
           cd python\llm\dev\benchmark\all-in-one
           move ..\..\..\test\benchmark\igpu-perf\32-32.yaml config.yaml
           set PYTHONIOENCODING=utf-8
+          python run.py
           python run.py >> %CSV_SAVE_PATH%\32-32\log\%LOG_FILE% 2>&1
           if %ERRORLEVEL% neq 0 (exit /b 1)
 

From bf8aecea62a26164e198f3752ab4b34236bf1a55 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Wed, 1 May 2024 17:42:26 -0700
Subject: [PATCH 30/57] update

---
 python/llm/dev/benchmark/all-in-one/run.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py
index 31c3ecee8b2..5eab763187c 100644
--- a/python/llm/dev/benchmark/all-in-one/run.py
+++ b/python/llm/dev/benchmark/all-in-one/run.py
@@ -844,7 +844,7 @@ def run_transformer_int4_gpu_win(repo_id,
     elif repo_id in LLAVA_IDS:
         llava_repo_dir = os.environ.get('LLAVA_REPO_DIR')
         sys.path.append(rf"{llava_repo_dir}")
-        from llava.model.language_model.llava_llama import LlavaLlamaForCausalLM
+        # from llava.model.language_model.llava_llama import LlavaLlamaForCausalLM
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=True,
                                           trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding).eval()
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

From 98789dbbed5a31e62b5a144193b3562d383cced8 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Thu, 2 May 2024 12:57:26 -0700
Subject: [PATCH 31/57] update

---
 python/llm/dev/benchmark/all-in-one/run.py                   | 2 +-
 python/llm/test/benchmark/igpu-perf/1024-128.yaml            | 2 +-
 python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml  | 2 +-
 python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml | 2 +-
 python/llm/test/benchmark/igpu-perf/2048-256.yaml            | 2 +-
 python/llm/test/benchmark/igpu-perf/32-32.yaml               | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py
index 5eab763187c..31c3ecee8b2 100644
--- a/python/llm/dev/benchmark/all-in-one/run.py
+++ b/python/llm/dev/benchmark/all-in-one/run.py
@@ -844,7 +844,7 @@ def run_transformer_int4_gpu_win(repo_id,
     elif repo_id in LLAVA_IDS:
         llava_repo_dir = os.environ.get('LLAVA_REPO_DIR')
         sys.path.append(rf"{llava_repo_dir}")
-        # from llava.model.language_model.llava_llama import LlavaLlamaForCausalLM
+        from llava.model.language_model.llava_llama import LlavaLlamaForCausalLM
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=True,
                                           trust_remote_code=True, use_cache=True, cpu_embedding=cpu_embedding).eval()
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
diff --git a/python/llm/test/benchmark/igpu-perf/1024-128.yaml b/python/llm/test/benchmark/igpu-perf/1024-128.yaml
index 0b53b28a1f5..582d55e26fd 100644
--- a/python/llm/test/benchmark/igpu-perf/1024-128.yaml
+++ b/python/llm/test/benchmark/igpu-perf/1024-128.yaml
@@ -12,7 +12,7 @@ repo_id:
   - 'WisdomShell/CodeShell-7B-Chat'
   - 'tiiuae/falcon-7b-instruct-with-patch'
   - 'mosaicml/mpt-7b-chat'
-  - 'liuhaotian/llava-v1.5-7b'
+#  - 'liuhaotian/llava-v1.5-7b' # Cannot load using AutoModelForCausalLM in 4.36+
   - 'RWKV/rwkv-4-world-7b'
   - 'RWKV/rwkv-5-world-7b'
   - 'IEITYuan/Yuan2-2B-hf'
diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml
index 85a0c2fb038..1208b1b6e63 100644
--- a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml
+++ b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml
@@ -12,7 +12,7 @@ repo_id:
   - 'WisdomShell/CodeShell-7B-Chat'
   - 'tiiuae/falcon-7b-instruct-with-patch'
   - 'mosaicml/mpt-7b-chat'
-  - 'liuhaotian/llava-v1.5-7b'
+#  - 'liuhaotian/llava-v1.5-7b' # Cannot load using AutoModelForCausalLM in 4.36+
   # - 'RWKV/rwkv-4-world-7b'
   # - 'RWKV/rwkv-5-world-7b'
   - 'IEITYuan/Yuan2-2B-hf'
diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml
index f941fcd8a5f..5157b56dadc 100644
--- a/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml
+++ b/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml
@@ -12,7 +12,7 @@ repo_id:
   - 'WisdomShell/CodeShell-7B-Chat'
   - 'tiiuae/falcon-7b-instruct-with-patch'
   - 'mosaicml/mpt-7b-chat'
-  - 'liuhaotian/llava-v1.5-7b'
+#  - 'liuhaotian/llava-v1.5-7b' # Cannot load using AutoModelForCausalLM in 4.36+
   - 'RWKV/rwkv-4-world-7b'
   - 'RWKV/rwkv-5-world-7b'
   - 'IEITYuan/Yuan2-2B-hf'
diff --git a/python/llm/test/benchmark/igpu-perf/2048-256.yaml b/python/llm/test/benchmark/igpu-perf/2048-256.yaml
index 3b03fd554cf..d4872ad19cc 100644
--- a/python/llm/test/benchmark/igpu-perf/2048-256.yaml
+++ b/python/llm/test/benchmark/igpu-perf/2048-256.yaml
@@ -12,7 +12,7 @@ repo_id:
   - 'WisdomShell/CodeShell-7B-Chat'
   - 'tiiuae/falcon-7b-instruct-with-patch'
   - 'mosaicml/mpt-7b-chat'
-  - 'liuhaotian/llava-v1.5-7b'
+#  - 'liuhaotian/llava-v1.5-7b' # Cannot load using AutoModelForCausalLM in 4.36+
   - 'RWKV/rwkv-4-world-7b'
   - 'RWKV/rwkv-5-world-7b'
   - 'IEITYuan/Yuan2-2B-hf'
diff --git a/python/llm/test/benchmark/igpu-perf/32-32.yaml b/python/llm/test/benchmark/igpu-perf/32-32.yaml
index 84ad699369c..53f09c910d5 100644
--- a/python/llm/test/benchmark/igpu-perf/32-32.yaml
+++ b/python/llm/test/benchmark/igpu-perf/32-32.yaml
@@ -12,7 +12,7 @@ repo_id:
   - 'WisdomShell/CodeShell-7B-Chat'
   - 'tiiuae/falcon-7b-instruct-with-patch'
   - 'mosaicml/mpt-7b-chat'
-  - 'liuhaotian/llava-v1.5-7b'
+#  - 'liuhaotian/llava-v1.5-7b' # Cannot load using AutoModelForCausalLM in 4.36+
   - 'RWKV/rwkv-4-world-7b'
   - 'RWKV/rwkv-5-world-7b'
   - 'IEITYuan/Yuan2-2B-hf'

From d459a8297b479973db68fb2602f77733b9d1db02 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Thu, 2 May 2024 16:09:16 -0700
Subject: [PATCH 32/57] update

---
 .github/workflows/llm_performance_tests.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/llm_performance_tests.yml b/.github/workflows/llm_performance_tests.yml
index b687c3a2ffc..337f8361b28 100644
--- a/.github/workflows/llm_performance_tests.yml
+++ b/.github/workflows/llm_performance_tests.yml
@@ -443,7 +443,6 @@ jobs:
           cd python\llm\dev\benchmark\all-in-one
           move ..\..\..\test\benchmark\igpu-perf\32-32.yaml config.yaml
           set PYTHONIOENCODING=utf-8
-          python run.py
           python run.py >> %CSV_SAVE_PATH%\32-32\log\%LOG_FILE% 2>&1
           if %ERRORLEVEL% neq 0 (exit /b 1)
 

From 51134d4511287c32d9a0349f2c3e3d18a0edaa02 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Fri, 3 May 2024 11:35:01 -0700
Subject: [PATCH 33/57] update

---
 .github/workflows/llm-harness-evaluation.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/llm-harness-evaluation.yml b/.github/workflows/llm-harness-evaluation.yml
index 544170195ff..29146ca116e 100644
--- a/.github/workflows/llm-harness-evaluation.yml
+++ b/.github/workflows/llm-harness-evaluation.yml
@@ -164,11 +164,11 @@ jobs:
         shell: bash
         run: |
           pip install --upgrade datasets==2.14.6 
-          if [ "${{ matrix.model_name }}" = "Mistral-7B-v0.1" ]; then
-            pip install --upgrade transformers==4.36
-          else
-            pip install --upgrade transformers==4.31
-          fi
+          #if [ "${{ matrix.model_name }}" = "Mistral-7B-v0.1" ]; then
+          #  pip install --upgrade transformers==4.36
+          #else
+          #  pip install --upgrade transformers==4.31
+          #fi
       
 
       - name: Run harness

From 39c104b6cca29cc28a01c0fe4df7e50ca865d37d Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Fri, 3 May 2024 14:29:54 -0700
Subject: [PATCH 34/57] update

---
 .github/actions/llm/setup-llm-env/action.yml           |  2 +-
 .github/workflows/llm-ppl-evaluation.yml               | 10 +++++-----
 python/llm/test/benchmark/arc-perf-test.yaml           |  2 +-
 python/llm/test/benchmark/igpu-perf/1024-128.yaml      |  2 +-
 .../test/benchmark/igpu-perf/1024-128_int4_fp16.yaml   |  2 +-
 .../test/benchmark/igpu-perf/1024-128_loadlowbit.yaml  |  2 +-
 python/llm/test/benchmark/igpu-perf/2048-256.yaml      |  2 +-
 python/llm/test/benchmark/igpu-perf/32-32.yaml         |  2 +-
 python/llm/test/inference/test_transformers_api.py     |  4 ++--
 .../inference_gpu/test_transformers_api_RMSNorm.py     |  2 +-
 .../inference_gpu/test_transformers_api_attention.py   |  2 +-
 .../test/inference_gpu/test_transformers_api_mlp.py    |  2 +-
 12 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/.github/actions/llm/setup-llm-env/action.yml b/.github/actions/llm/setup-llm-env/action.yml
index 4d0b7550f74..01c2660ea8d 100644
--- a/.github/actions/llm/setup-llm-env/action.yml
+++ b/.github/actions/llm/setup-llm-env/action.yml
@@ -42,4 +42,4 @@ runs:
           pip install pytest
           bash python/llm/test/run-llm-install-tests.sh
         fi
-        pip install transformers==4.36.2
+        # pip install transformers==4.36.2
diff --git a/.github/workflows/llm-ppl-evaluation.yml b/.github/workflows/llm-ppl-evaluation.yml
index 7ad621f91e3..bfab5277f80 100644
--- a/.github/workflows/llm-ppl-evaluation.yml
+++ b/.github/workflows/llm-ppl-evaluation.yml
@@ -149,11 +149,11 @@ jobs:
         shell: bash
         run: |
           pip install --upgrade datasets==2.14.6 
-          if [ "${{ matrix.model_name }}" = "Mistral-7B-v0.1" ]; then
-            pip install --upgrade transformers==4.36
-          else
-            pip install --upgrade transformers==4.31
-          fi
+          #if [ "${{ matrix.model_name }}" = "Mistral-7B-v0.1" ]; then
+          #  pip install --upgrade transformers==4.36
+          #else
+          #  pip install --upgrade transformers==4.31
+          #fi
 
       - name: Run perplexity
         shell: bash
diff --git a/python/llm/test/benchmark/arc-perf-test.yaml b/python/llm/test/benchmark/arc-perf-test.yaml
index 7552b7f84f1..b08cc5290f6 100644
--- a/python/llm/test/benchmark/arc-perf-test.yaml
+++ b/python/llm/test/benchmark/arc-perf-test.yaml
@@ -17,7 +17,7 @@ repo_id:
 #  - 'baichuan-inc/Baichuan2-13B-Chat-4bit'
   - 'bigscience/bloomz-7b1'
 #  - 'fnlp/moss-moon-003-sft-4bit'
-  - 'mistralai/Mistral-7B-v0.1'
+#  - 'mistralai/Mistral-7B-v0.1'
 local_model_hub: '/mnt/disk1/models'
 warm_up: 1
 num_trials: 3
diff --git a/python/llm/test/benchmark/igpu-perf/1024-128.yaml b/python/llm/test/benchmark/igpu-perf/1024-128.yaml
index 582d55e26fd..db7ae48114c 100644
--- a/python/llm/test/benchmark/igpu-perf/1024-128.yaml
+++ b/python/llm/test/benchmark/igpu-perf/1024-128.yaml
@@ -16,7 +16,7 @@ repo_id:
   - 'RWKV/rwkv-4-world-7b'
   - 'RWKV/rwkv-5-world-7b'
   - 'IEITYuan/Yuan2-2B-hf'
-  - 'mistralai/Mistral-7B-Instruct-v0.1'
+#  - 'mistralai/Mistral-7B-Instruct-v0.1'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3
diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml
index 1208b1b6e63..6554c68bd80 100644
--- a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml
+++ b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml
@@ -16,7 +16,7 @@ repo_id:
   # - 'RWKV/rwkv-4-world-7b'
   # - 'RWKV/rwkv-5-world-7b'
   - 'IEITYuan/Yuan2-2B-hf'
-  - 'mistralai/Mistral-7B-Instruct-v0.1'
+#  - 'mistralai/Mistral-7B-Instruct-v0.1'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3
diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml
index 5157b56dadc..231d4d780d5 100644
--- a/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml
+++ b/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml
@@ -16,7 +16,7 @@ repo_id:
   - 'RWKV/rwkv-4-world-7b'
   - 'RWKV/rwkv-5-world-7b'
   - 'IEITYuan/Yuan2-2B-hf'
-  - 'mistralai/Mistral-7B-Instruct-v0.1'
+#  - 'mistralai/Mistral-7B-Instruct-v0.1'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3
diff --git a/python/llm/test/benchmark/igpu-perf/2048-256.yaml b/python/llm/test/benchmark/igpu-perf/2048-256.yaml
index d4872ad19cc..c8ff077f919 100644
--- a/python/llm/test/benchmark/igpu-perf/2048-256.yaml
+++ b/python/llm/test/benchmark/igpu-perf/2048-256.yaml
@@ -16,7 +16,7 @@ repo_id:
   - 'RWKV/rwkv-4-world-7b'
   - 'RWKV/rwkv-5-world-7b'
   - 'IEITYuan/Yuan2-2B-hf'
-  - 'mistralai/Mistral-7B-Instruct-v0.1'
+#  - 'mistralai/Mistral-7B-Instruct-v0.1'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3
diff --git a/python/llm/test/benchmark/igpu-perf/32-32.yaml b/python/llm/test/benchmark/igpu-perf/32-32.yaml
index 53f09c910d5..f24141206c3 100644
--- a/python/llm/test/benchmark/igpu-perf/32-32.yaml
+++ b/python/llm/test/benchmark/igpu-perf/32-32.yaml
@@ -16,7 +16,7 @@ repo_id:
   - 'RWKV/rwkv-4-world-7b'
   - 'RWKV/rwkv-5-world-7b'
   - 'IEITYuan/Yuan2-2B-hf'
-  - 'mistralai/Mistral-7B-Instruct-v0.1'
+#  - 'mistralai/Mistral-7B-Instruct-v0.1'
 local_model_hub: 'path to your local model hub'
 warm_up: 3
 num_trials: 5
diff --git a/python/llm/test/inference/test_transformers_api.py b/python/llm/test/inference/test_transformers_api.py
index ad1d1b53f3a..db7871fc801 100644
--- a/python/llm/test/inference/test_transformers_api.py
+++ b/python/llm/test/inference/test_transformers_api.py
@@ -116,7 +116,7 @@ def test_transformers_chatglm_for_causallm(self):
     ])
 @pytest.mark.parametrize('Model, Tokenizer, model_path',[
     (AutoModel, AutoTokenizer, os.environ.get('ORIGINAL_CHATGLM2_6B_PATH')),
-    (AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_ORIGIN_PATH')),
+    #(AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_ORIGIN_PATH')),
     ])
 def test_load_low_bit_completion(Model, Tokenizer, model_path, prompt, answer):
     tokenizer = Tokenizer.from_pretrained(model_path, trust_remote_code=True)
@@ -145,7 +145,7 @@ def test_load_low_bit_completion(Model, Tokenizer, model_path, prompt, answer):
     (AutoModelForCausalLM, AutoTokenizer, os.environ.get('BLOOM_ORIGIN_PATH'), prompt),
     (AutoModel, AutoTokenizer, os.environ.get('ORIGINAL_CHATGLM2_6B_PATH'), prompt),
     (AutoModelForCausalLM, AutoTokenizer, os.environ.get('ORIGINAL_CODESHELL_7B_PATH'), prompt),
-    (AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_ORIGIN_PATH'), prompt)
+    # (AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_ORIGIN_PATH'), prompt)
 ])
 
 def test_optimize_model(Model, Tokenizer, model_path, prompt):
diff --git a/python/llm/test/inference_gpu/test_transformers_api_RMSNorm.py b/python/llm/test/inference_gpu/test_transformers_api_RMSNorm.py
index f45f017ef0b..5cec634be44 100644
--- a/python/llm/test/inference_gpu/test_transformers_api_RMSNorm.py
+++ b/python/llm/test/inference_gpu/test_transformers_api_RMSNorm.py
@@ -30,7 +30,7 @@
 TEST_MODEL_LIST = [
     ("Llama2-7B", AutoModelForCausalLM, LlamaTokenizer, os.environ.get('LLAMA2_7B_ORIGIN_PATH')),
     ("ChatGLM2-6B", AutoModel, AutoTokenizer, os.environ.get('CHATGLM2_6B_ORIGIN_PATH')),
-    ("Mistral-7B-Instruct-v0.1", AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_7B_INSTRUCT_V0_1_ORIGIN_PATH')),
+    # ("Mistral-7B-Instruct-v0.1", AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_7B_INSTRUCT_V0_1_ORIGIN_PATH')),
     ("Baichuan2-7B-Chat", AutoModelForCausalLM, AutoTokenizer, os.environ.get('BAICHUAN2_7B_ORIGIN_PATH')),
     ("Qwen-7B-Chat", AutoModelForCausalLM, AutoTokenizer, os.environ.get('QWEN_7B_ORIGIN_PATH')),
 ]
diff --git a/python/llm/test/inference_gpu/test_transformers_api_attention.py b/python/llm/test/inference_gpu/test_transformers_api_attention.py
index 83f7aaebfc8..e5c39897cec 100644
--- a/python/llm/test/inference_gpu/test_transformers_api_attention.py
+++ b/python/llm/test/inference_gpu/test_transformers_api_attention.py
@@ -32,7 +32,7 @@
     ("Llama2-7B", AutoModelForCausalLM, LlamaTokenizer, os.environ.get('LLAMA2_7B_ORIGIN_PATH')),
     ("Falcon-7B", AutoModelForCausalLM, AutoTokenizer, os.environ.get('FALCON_7B_ORIGIN_PATH')),
     ("ChatGLM2-6B", AutoModel, AutoTokenizer, os.environ.get('CHATGLM2_6B_ORIGIN_PATH')),
-    ("Mistral-7B-Instruct-v0.1", AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_7B_INSTRUCT_V0_1_ORIGIN_PATH')),
+    # ("Mistral-7B-Instruct-v0.1", AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_7B_INSTRUCT_V0_1_ORIGIN_PATH')),
     ("Baichuan2-7B-Chat", AutoModelForCausalLM, AutoTokenizer, os.environ.get('BAICHUAN2_7B_ORIGIN_PATH')),
     ("Qwen-7B-Chat", AutoModelForCausalLM, AutoTokenizer, os.environ.get('QWEN_7B_ORIGIN_PATH')),
 ]
diff --git a/python/llm/test/inference_gpu/test_transformers_api_mlp.py b/python/llm/test/inference_gpu/test_transformers_api_mlp.py
index 70ba2e7b9f6..f7aba58fb72 100644
--- a/python/llm/test/inference_gpu/test_transformers_api_mlp.py
+++ b/python/llm/test/inference_gpu/test_transformers_api_mlp.py
@@ -28,7 +28,7 @@
 PROMPT = "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun"
 TEST_MODEL_LIST = [
     ("Qwen-7B-Chat", AutoModelForCausalLM, AutoTokenizer, os.environ.get('QWEN_7B_ORIGIN_PATH')),
-    ("Mistral-7B-Instruct-v0.1", AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_7B_INSTRUCT_V0_1_ORIGIN_PATH')),
+    # ("Mistral-7B-Instruct-v0.1", AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_7B_INSTRUCT_V0_1_ORIGIN_PATH')),
     ("Llama2-7B", AutoModelForCausalLM, LlamaTokenizer, os.environ.get('LLAMA2_7B_ORIGIN_PATH'))
 ]
 

From 5d32b59fa35805be103a76b88740cf5bf423c582 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Fri, 3 May 2024 15:35:55 -0700
Subject: [PATCH 35/57] update

---
 .github/workflows/llm_unit_tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/llm_unit_tests.yml b/.github/workflows/llm_unit_tests.yml
index c77987ec783..ffcfa925359 100644
--- a/.github/workflows/llm_unit_tests.yml
+++ b/.github/workflows/llm_unit_tests.yml
@@ -389,7 +389,7 @@ jobs:
         shell: bash
         run: |
           python -m pip uninstall datasets -y
-          python -m pip install datasets peft==0.5.0 accelerate==0.23.0
+          python -m pip install transformers==4.34.0 datasets peft==0.5.0 accelerate==0.23.0
           python -m pip install bitsandbytes scipy
           # Specific oneapi position on arc ut test machines
           if [[ '${{ matrix.pytorch-version }}' == '2.1' ]]; then

From da72111f5dda6b4f39bc75de91e4ce66215cc998 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Fri, 3 May 2024 23:11:18 -0700
Subject: [PATCH 36/57] update

---
 .github/workflows/llm_unit_tests.yml     | 32 +++++++-----------------
 python/llm/dev/test/run-example-tests.sh |  2 +-
 2 files changed, 10 insertions(+), 24 deletions(-)

diff --git a/.github/workflows/llm_unit_tests.yml b/.github/workflows/llm_unit_tests.yml
index ffcfa925359..abf86908709 100644
--- a/.github/workflows/llm_unit_tests.yml
+++ b/.github/workflows/llm_unit_tests.yml
@@ -77,7 +77,6 @@ jobs:
         run: |
           echo "DATASET_DIR=${{ github.workspace }}/../llm/datasets" >> "$GITHUB_ENV"
           echo "ORIGIN_DIR=${{ github.workspace }}/../llm/origin-models" >> "$GITHUB_ENV"
-          echo "ORIGIN_DIR_436=${{ github.workspace }}/../llm/origin-models-4.36" >> "$GITHUB_ENV"
           echo "INT4_CKPT_DIR=${{ github.workspace }}/../llm/converted-models" >> "$GITHUB_ENV"
       - name: Create model directories
         shell: bash
@@ -88,9 +87,6 @@ jobs:
           if [ ! -d $ORIGIN_DIR ]; then
             mkdir -p $ORIGIN_DIR
           fi
-          if [ ! -d ORIGIN_DIR_436 ]; then
-            mkdir -p ORIGIN_DIR_436
-          fi
           if [ ! -d $INT4_CKPT_DIR ]; then
             mkdir -p $INT4_CKPT_DIR
           fi
@@ -102,7 +98,7 @@ jobs:
 
           echo "LLAMA_ORIGIN_PATH=${ORIGIN_DIR}/llama-7b-hf" >> "$GITHUB_ENV"
           echo "BLOOM_ORIGIN_PATH=${ORIGIN_DIR}/bloom-7b1" >> "$GITHUB_ENV"
-          echo "ORIGINAL_CHATGLM2_6B_PATH=${ORIGIN_DIR_436}/chatglm2-6b" >> "$GITHUB_ENV"
+          echo "ORIGINAL_CHATGLM2_6B_PATH=${ORIGIN_DIR}/chatglm2-6b" >> "$GITHUB_ENV"
           echo "ORIGINAL_CODESHELL_7B_PATH=${ORIGIN_DIR}/CodeShell-7B-Chat" >> "$GITHUB_ENV"
           echo "ORIGINAL_WHISPER_TINY_PATH=${ORIGIN_DIR}/whisper-tiny" >> "$GITHUB_ENV"
           echo "MISTRAL_ORIGIN_PATH=${ORIGIN_DIR}/Mistral-7B-v0.1" >> "$GITHUB_ENV"
@@ -161,8 +157,8 @@ jobs:
           # fi
           if [ ! -d $ORIGINAL_CHATGLM2_6B_PATH ]; then
             echo "Directory $ORIGINAL_CHATGLM2_6B_PATH not found. Downloading from FTP server..."
-            echo "wget -r -nH --no-verbose --cut-dirs=2 $LLM_FTP_URL/llm/updated_for_4.36/chatglm2-6b -P $ORIGIN_DIR_436"
-            wget -r -nH --no-verbose --cut-dirs=2 $LLM_FTP_URL/llm/updated_for_4.36/chatglm2-6b -P $ORIGIN_DIR_436
+            echo "wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/chatglm2-6b -P $ORIGIN_DIR"
+            wget -r -nH --no-verbose --cut-dirs=2 $LLM_FTP_URL/llm/updated_for_4.36/chatglm2-6b -P $ORIGIN_DIR
           fi
           if [ ! -d $ORIGINAL_CODESHELL_7B_PATH ]; then
             echo "Directory $ORIGINAL_CODESHELL_7B_PATH not found. Downloading from FTP server..."
@@ -256,16 +252,6 @@ jobs:
       # THREAD_NUM: 16
       ANALYTICS_ZOO_ROOT: ${{ github.workspace }}
     steps:
-      - name: Set model directories for 4.36
-        shell: bash
-        run: |
-          echo "ORIGIN_DIR_436=${{ github.workspace }}/../llm/origin-models-4.36" >> "$GITHUB_ENV"
-      - name: Create model directories
-        shell: bash
-        run: |
-          if [ ! -d ORIGIN_DIR_436 ]; then
-            mkdir -p ORIGIN_DIR_436
-          fi
       - name: Set environment variables
         shell: bash
         run: |
@@ -274,12 +260,12 @@ jobs:
           echo "SPEECH_DATASET_PATH=${ORIGIN_DIR}/../datasets/librispeech_asr_dummy" >> "$GITHUB_ENV"
 
           echo "LLAMA2_7B_ORIGIN_PATH=${ORIGIN_DIR}/Llama-2-7b-chat-hf" >> "$GITHUB_ENV"
-          echo "CHATGLM2_6B_ORIGIN_PATH=${ORIGIN_DIR_436}/chatglm2-6b" >> "$GITHUB_ENV"
+          echo "CHATGLM2_6B_ORIGIN_PATH=${ORIGIN_DIR}/chatglm2-6b" >> "$GITHUB_ENV"
           echo "FALCON_7B_ORIGIN_PATH=${ORIGIN_DIR}/falcon-7b-instruct-with-patch" >> "$GITHUB_ENV"
-          echo "MPT_7B_ORIGIN_PATH=${ORIGIN_DIR_436}/mpt-7b-chat" >> "$GITHUB_ENV"
+          echo "MPT_7B_ORIGIN_PATH=${ORIGIN_DIR}/mpt-7b-chat" >> "$GITHUB_ENV"
           echo "WHISPER_TINY_ORIGIN_PATH=${ORIGIN_DIR}/whisper-tiny" >> "$GITHUB_ENV"
           echo "MISTRAL_7B_INSTRUCT_V0_1_ORIGIN_PATH=${ORIGIN_DIR}/Mistral-7B-Instruct-v0.1" >> "$GITHUB_ENV"
-          echo "BAICHUAN2_7B_ORIGIN_PATH=${ORIGIN_DIR_436}/Baichuan2-7B-Chat" >> "$GITHUB_ENV"
+          echo "BAICHUAN2_7B_ORIGIN_PATH=${ORIGIN_DIR}/Baichuan2-7B-Chat" >> "$GITHUB_ENV"
           echo "QWEN_7B_ORIGIN_PATH=${ORIGIN_DIR}/Qwen-7B-Chat" >> "$GITHUB_ENV"
           echo "VICUNA_7B_1_3_ORIGIN_PATH=${ORIGIN_DIR}/vicuna-7b-v1.3" >> "$GITHUB_ENV"
       - name: Checkout repo
@@ -329,7 +315,7 @@ jobs:
           fi
           if [ ! -d $CHATGLM2_6B_ORIGIN_PATH ]; then
             echo "Directory $CHATGLM2_6B_ORIGIN_PATH not found. Downloading from FTP server..."
-            wget -r -nH --no-verbose --cut-dirs=2 $LLM_FTP_URL/llm/updated_for_4.36/chatglm2-6b -P $ORIGIN_DIR_436
+            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/chatglm2-6b -P $ORIGIN_DIR
           fi
           if [ ! -d $FALCON_7B_ORIGIN_PATH ]; then
             echo "Directory $FALCON_7B_ORIGIN_PATH not found. Downloading from FTP server..."
@@ -337,7 +323,7 @@ jobs:
           fi
           if [ ! -d $MPT_7B_ORIGIN_PATH ]; then
             echo "Directory $MPT_7B_ORIGIN_PATH not found. Downloading from FTP server..."
-            wget -r -nH --no-verbose --cut-dirs=2 $LLM_FTP_URL/llm/updated_for_4.36/mpt-7b-chat -P $ORIGIN_DIR_436
+            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/mpt-7b-chat -P $ORIGIN_DIR
           fi
           if [ ! -d $WHISPER_TINY_ORIGIN_PATH ]; then
             echo "Directory $WHISPER_TINY_ORIGIN_PATH not found. Downloading from FTP server..."
@@ -364,7 +350,7 @@ jobs:
           fi
           if [ ! -d $BAICHUAN2_7B_ORIGIN_PATH ]; then
             echo "Directory $BAICHUAN2_7B_ORIGIN_PATH not found. Downloading from FTP server..."
-            wget -r -nH --no-verbose --cut-dirs=2 $LLM_FTP_URL/llm/updated_for_4.36/Baichuan2-7B-Chat -P $ORIGIN_DIR_436
+            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/Baichuan2-7B-Chat -P $ORIGIN_DIR
           fi
           if [ ! -d $VICUNA_7B_1_3_ORIGIN_PATH ]; then
             echo "Directory $VICUNA_7B_1_3_ORIGIN_PATH not found. Downloading from FTP server..."
diff --git a/python/llm/dev/test/run-example-tests.sh b/python/llm/dev/test/run-example-tests.sh
index 3876fa77a68..edf767c8719 100644
--- a/python/llm/dev/test/run-example-tests.sh
+++ b/python/llm/dev/test/run-example-tests.sh
@@ -68,7 +68,7 @@ fi
 export ORIGINAL_CHATGLM2_PATH=./llm/chatglm2-6b/
 if [ ! -d $ORIGINAL_CHATGLM2_PATH ]; then
     echo "Directory $ORIGINAL_CHATGLM2_PATH not found. Downloading from FTP server..."
-    wget -r -nH --no-verbose --cut-dirs=2 $LLM_FTP_URL/llm/updated_for_4.36/${ORIGINAL_CHATGLM2_PATH:6} -P $LLM_DIR
+    wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/${ORIGINAL_CHATGLM2_PATH:2} -P $LLM_DIR
 fi
 
 echo ">>> Testing ChatGLM2 transformers API"

From 687ba8b55499d99985ad9e54a3c9050164bfddb2 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Fri, 3 May 2024 23:24:16 -0700
Subject: [PATCH 37/57] update

---
 .github/actions/llm/setup-llm-env/action.yml                | 2 +-
 python/llm/test/benchmark/arc-perf-test.yaml                | 2 +-
 python/llm/test/benchmark/igpu-perf/1024-128.yaml           | 2 +-
 python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml | 2 +-
 .../llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml   | 2 +-
 python/llm/test/benchmark/igpu-perf/2048-256.yaml           | 2 +-
 python/llm/test/benchmark/igpu-perf/32-32.yaml              | 2 +-
 python/llm/test/inference/test_transformers_api.py          | 6 +++---
 .../llm/test/inference_gpu/test_transformers_api_RMSNorm.py | 2 +-
 .../test/inference_gpu/test_transformers_api_attention.py   | 2 +-
 python/llm/test/inference_gpu/test_transformers_api_mlp.py  | 2 +-
 11 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/.github/actions/llm/setup-llm-env/action.yml b/.github/actions/llm/setup-llm-env/action.yml
index 01c2660ea8d..4d0b7550f74 100644
--- a/.github/actions/llm/setup-llm-env/action.yml
+++ b/.github/actions/llm/setup-llm-env/action.yml
@@ -42,4 +42,4 @@ runs:
           pip install pytest
           bash python/llm/test/run-llm-install-tests.sh
         fi
-        # pip install transformers==4.36.2
+        pip install transformers==4.36.2
diff --git a/python/llm/test/benchmark/arc-perf-test.yaml b/python/llm/test/benchmark/arc-perf-test.yaml
index b08cc5290f6..7552b7f84f1 100644
--- a/python/llm/test/benchmark/arc-perf-test.yaml
+++ b/python/llm/test/benchmark/arc-perf-test.yaml
@@ -17,7 +17,7 @@ repo_id:
 #  - 'baichuan-inc/Baichuan2-13B-Chat-4bit'
   - 'bigscience/bloomz-7b1'
 #  - 'fnlp/moss-moon-003-sft-4bit'
-#  - 'mistralai/Mistral-7B-v0.1'
+  - 'mistralai/Mistral-7B-v0.1'
 local_model_hub: '/mnt/disk1/models'
 warm_up: 1
 num_trials: 3
diff --git a/python/llm/test/benchmark/igpu-perf/1024-128.yaml b/python/llm/test/benchmark/igpu-perf/1024-128.yaml
index db7ae48114c..582d55e26fd 100644
--- a/python/llm/test/benchmark/igpu-perf/1024-128.yaml
+++ b/python/llm/test/benchmark/igpu-perf/1024-128.yaml
@@ -16,7 +16,7 @@ repo_id:
   - 'RWKV/rwkv-4-world-7b'
   - 'RWKV/rwkv-5-world-7b'
   - 'IEITYuan/Yuan2-2B-hf'
-#  - 'mistralai/Mistral-7B-Instruct-v0.1'
+  - 'mistralai/Mistral-7B-Instruct-v0.1'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3
diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml
index 6554c68bd80..1208b1b6e63 100644
--- a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml
+++ b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml
@@ -16,7 +16,7 @@ repo_id:
   # - 'RWKV/rwkv-4-world-7b'
   # - 'RWKV/rwkv-5-world-7b'
   - 'IEITYuan/Yuan2-2B-hf'
-#  - 'mistralai/Mistral-7B-Instruct-v0.1'
+  - 'mistralai/Mistral-7B-Instruct-v0.1'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3
diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml
index 231d4d780d5..5157b56dadc 100644
--- a/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml
+++ b/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit.yaml
@@ -16,7 +16,7 @@ repo_id:
   - 'RWKV/rwkv-4-world-7b'
   - 'RWKV/rwkv-5-world-7b'
   - 'IEITYuan/Yuan2-2B-hf'
-#  - 'mistralai/Mistral-7B-Instruct-v0.1'
+  - 'mistralai/Mistral-7B-Instruct-v0.1'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3
diff --git a/python/llm/test/benchmark/igpu-perf/2048-256.yaml b/python/llm/test/benchmark/igpu-perf/2048-256.yaml
index c8ff077f919..d4872ad19cc 100644
--- a/python/llm/test/benchmark/igpu-perf/2048-256.yaml
+++ b/python/llm/test/benchmark/igpu-perf/2048-256.yaml
@@ -16,7 +16,7 @@ repo_id:
   - 'RWKV/rwkv-4-world-7b'
   - 'RWKV/rwkv-5-world-7b'
   - 'IEITYuan/Yuan2-2B-hf'
-#  - 'mistralai/Mistral-7B-Instruct-v0.1'
+  - 'mistralai/Mistral-7B-Instruct-v0.1'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
 num_trials: 3
diff --git a/python/llm/test/benchmark/igpu-perf/32-32.yaml b/python/llm/test/benchmark/igpu-perf/32-32.yaml
index f24141206c3..53f09c910d5 100644
--- a/python/llm/test/benchmark/igpu-perf/32-32.yaml
+++ b/python/llm/test/benchmark/igpu-perf/32-32.yaml
@@ -16,7 +16,7 @@ repo_id:
   - 'RWKV/rwkv-4-world-7b'
   - 'RWKV/rwkv-5-world-7b'
   - 'IEITYuan/Yuan2-2B-hf'
-#  - 'mistralai/Mistral-7B-Instruct-v0.1'
+  - 'mistralai/Mistral-7B-Instruct-v0.1'
 local_model_hub: 'path to your local model hub'
 warm_up: 3
 num_trials: 5
diff --git a/python/llm/test/inference/test_transformers_api.py b/python/llm/test/inference/test_transformers_api.py
index db7871fc801..f16773c62c3 100644
--- a/python/llm/test/inference/test_transformers_api.py
+++ b/python/llm/test/inference/test_transformers_api.py
@@ -116,7 +116,7 @@ def test_transformers_chatglm_for_causallm(self):
     ])
 @pytest.mark.parametrize('Model, Tokenizer, model_path',[
     (AutoModel, AutoTokenizer, os.environ.get('ORIGINAL_CHATGLM2_6B_PATH')),
-    #(AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_ORIGIN_PATH')),
+    (AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_ORIGIN_PATH')),
     ])
 def test_load_low_bit_completion(Model, Tokenizer, model_path, prompt, answer):
     tokenizer = Tokenizer.from_pretrained(model_path, trust_remote_code=True)
@@ -141,11 +141,11 @@ def test_load_low_bit_completion(Model, Tokenizer, model_path, prompt, answer):
 prompt = "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun"
 
 @pytest.mark.parametrize("Model, Tokenizer, model_path, prompt", [
-    # (AutoModelForCausalLM, LlamaTokenizer, os.environ.get('LLAMA_ORIGIN_PATH'), prompt),
+    (AutoModelForCausalLM, LlamaTokenizer, os.environ.get('LLAMA_ORIGIN_PATH'), prompt),
     (AutoModelForCausalLM, AutoTokenizer, os.environ.get('BLOOM_ORIGIN_PATH'), prompt),
     (AutoModel, AutoTokenizer, os.environ.get('ORIGINAL_CHATGLM2_6B_PATH'), prompt),
     (AutoModelForCausalLM, AutoTokenizer, os.environ.get('ORIGINAL_CODESHELL_7B_PATH'), prompt),
-    # (AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_ORIGIN_PATH'), prompt)
+    (AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_ORIGIN_PATH'), prompt)
 ])
 
 def test_optimize_model(Model, Tokenizer, model_path, prompt):
diff --git a/python/llm/test/inference_gpu/test_transformers_api_RMSNorm.py b/python/llm/test/inference_gpu/test_transformers_api_RMSNorm.py
index 5cec634be44..f45f017ef0b 100644
--- a/python/llm/test/inference_gpu/test_transformers_api_RMSNorm.py
+++ b/python/llm/test/inference_gpu/test_transformers_api_RMSNorm.py
@@ -30,7 +30,7 @@
 TEST_MODEL_LIST = [
     ("Llama2-7B", AutoModelForCausalLM, LlamaTokenizer, os.environ.get('LLAMA2_7B_ORIGIN_PATH')),
     ("ChatGLM2-6B", AutoModel, AutoTokenizer, os.environ.get('CHATGLM2_6B_ORIGIN_PATH')),
-    # ("Mistral-7B-Instruct-v0.1", AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_7B_INSTRUCT_V0_1_ORIGIN_PATH')),
+    ("Mistral-7B-Instruct-v0.1", AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_7B_INSTRUCT_V0_1_ORIGIN_PATH')),
     ("Baichuan2-7B-Chat", AutoModelForCausalLM, AutoTokenizer, os.environ.get('BAICHUAN2_7B_ORIGIN_PATH')),
     ("Qwen-7B-Chat", AutoModelForCausalLM, AutoTokenizer, os.environ.get('QWEN_7B_ORIGIN_PATH')),
 ]
diff --git a/python/llm/test/inference_gpu/test_transformers_api_attention.py b/python/llm/test/inference_gpu/test_transformers_api_attention.py
index e5c39897cec..83f7aaebfc8 100644
--- a/python/llm/test/inference_gpu/test_transformers_api_attention.py
+++ b/python/llm/test/inference_gpu/test_transformers_api_attention.py
@@ -32,7 +32,7 @@
     ("Llama2-7B", AutoModelForCausalLM, LlamaTokenizer, os.environ.get('LLAMA2_7B_ORIGIN_PATH')),
     ("Falcon-7B", AutoModelForCausalLM, AutoTokenizer, os.environ.get('FALCON_7B_ORIGIN_PATH')),
     ("ChatGLM2-6B", AutoModel, AutoTokenizer, os.environ.get('CHATGLM2_6B_ORIGIN_PATH')),
-    # ("Mistral-7B-Instruct-v0.1", AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_7B_INSTRUCT_V0_1_ORIGIN_PATH')),
+    ("Mistral-7B-Instruct-v0.1", AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_7B_INSTRUCT_V0_1_ORIGIN_PATH')),
     ("Baichuan2-7B-Chat", AutoModelForCausalLM, AutoTokenizer, os.environ.get('BAICHUAN2_7B_ORIGIN_PATH')),
     ("Qwen-7B-Chat", AutoModelForCausalLM, AutoTokenizer, os.environ.get('QWEN_7B_ORIGIN_PATH')),
 ]
diff --git a/python/llm/test/inference_gpu/test_transformers_api_mlp.py b/python/llm/test/inference_gpu/test_transformers_api_mlp.py
index f7aba58fb72..70ba2e7b9f6 100644
--- a/python/llm/test/inference_gpu/test_transformers_api_mlp.py
+++ b/python/llm/test/inference_gpu/test_transformers_api_mlp.py
@@ -28,7 +28,7 @@
 PROMPT = "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun"
 TEST_MODEL_LIST = [
     ("Qwen-7B-Chat", AutoModelForCausalLM, AutoTokenizer, os.environ.get('QWEN_7B_ORIGIN_PATH')),
-    # ("Mistral-7B-Instruct-v0.1", AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_7B_INSTRUCT_V0_1_ORIGIN_PATH')),
+    ("Mistral-7B-Instruct-v0.1", AutoModelForCausalLM, AutoTokenizer, os.environ.get('MISTRAL_7B_INSTRUCT_V0_1_ORIGIN_PATH')),
     ("Llama2-7B", AutoModelForCausalLM, LlamaTokenizer, os.environ.get('LLAMA2_7B_ORIGIN_PATH'))
 ]
 

From 8099a2cf97eed5a51d84d0602a09957d50b5fc58 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Sat, 4 May 2024 15:46:02 -0700
Subject: [PATCH 38/57] update

---
 .github/workflows/llm_unit_tests.yml | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/llm_unit_tests.yml b/.github/workflows/llm_unit_tests.yml
index abf86908709..09ebf4f4421 100644
--- a/.github/workflows/llm_unit_tests.yml
+++ b/.github/workflows/llm_unit_tests.yml
@@ -155,10 +155,13 @@ jobs:
           #   echo "Directory $CHATGLM_INT4_CKPT_PATH not found. Downloading from FTP server..."
           #   wget --no-verbose $LLM_FTP_URL/llm/ggml-actions/stable/chatglm2-6b-q4_0.bin -P $INT4_CKPT_DIR
           # fi
+          if [ -d $ORIGINAL_CHATGLM2_6B_PATH ]; then
+            rm -rf $ORIGINAL_CHATGLM2_6B_PATH
+          fi
           if [ ! -d $ORIGINAL_CHATGLM2_6B_PATH ]; then
             echo "Directory $ORIGINAL_CHATGLM2_6B_PATH not found. Downloading from FTP server..."
             echo "wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/chatglm2-6b -P $ORIGIN_DIR"
-            wget -r -nH --no-verbose --cut-dirs=2 $LLM_FTP_URL/llm/updated_for_4.36/chatglm2-6b -P $ORIGIN_DIR
+            wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/chatglm2-6b -P $ORIGIN_DIR
           fi
           if [ ! -d $ORIGINAL_CODESHELL_7B_PATH ]; then
             echo "Directory $ORIGINAL_CODESHELL_7B_PATH not found. Downloading from FTP server..."
@@ -313,6 +316,9 @@ jobs:
             echo "Directory $LLAMA2_7B_ORIGIN_PATH not found. Downloading from FTP server..."
             wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/Llama-2-7b-chat-hf -P $ORIGIN_DIR
           fi
+          if [ -d $CHATGLM2_6B_ORIGIN_PATH ]; then
+            rm -rf $CHATGLM2_6B_ORIGIN_PATH
+          fi
           if [ ! -d $CHATGLM2_6B_ORIGIN_PATH ]; then
             echo "Directory $CHATGLM2_6B_ORIGIN_PATH not found. Downloading from FTP server..."
             wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/chatglm2-6b -P $ORIGIN_DIR
@@ -321,6 +327,9 @@ jobs:
             echo "Directory $FALCON_7B_ORIGIN_PATH not found. Downloading from FTP server..."
             wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/falcon-7b-instruct-with-patch -P $ORIGIN_DIR
           fi
+          if [ -d $MPT_7B_ORIGIN_PATH ]; then
+            rm -rf $MPT_7B_ORIGIN_PATH
+          fi
           if [ ! -d $MPT_7B_ORIGIN_PATH ]; then
             echo "Directory $MPT_7B_ORIGIN_PATH not found. Downloading from FTP server..."
             wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/mpt-7b-chat -P $ORIGIN_DIR
@@ -348,6 +357,9 @@ jobs:
             echo "Directory $QWEN_7B_ORIGIN_PATH not found. Downloading from FTP server..."
             wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/Qwen-7B-Chat -P $ORIGIN_DIR
           fi
+          if [ -d $BAICHUAN2_7B_ORIGIN_PATH ]; then
+            rm -rf $BAICHUAN2_7B_ORIGIN_PATH
+          fi
           if [ ! -d $BAICHUAN2_7B_ORIGIN_PATH ]; then
             echo "Directory $BAICHUAN2_7B_ORIGIN_PATH not found. Downloading from FTP server..."
             wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/Baichuan2-7B-Chat -P $ORIGIN_DIR

From 270ecb8718c17bac72ef62767cb399cdf7be51b0 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Sat, 4 May 2024 23:34:17 -0700
Subject: [PATCH 39/57] update

---
 .github/workflows/llm_performance_tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/llm_performance_tests.yml b/.github/workflows/llm_performance_tests.yml
index 337f8361b28..a0411e002d1 100644
--- a/.github/workflows/llm_performance_tests.yml
+++ b/.github/workflows/llm_performance_tests.yml
@@ -87,7 +87,7 @@ jobs:
           source /opt/intel/oneapi/setvars.sh
           bash python/llm/test/run-llm-install-tests.sh
 
-      - name: Test on xpu(transformers==4.36.2)
+      - name: Test on xpu(transformers==4.36.0)
         shell: bash
         run: |
           date_for_test_version=$(date -d yesterday +%Y-%m-%d)

From bc847bf75797a8755aedc4b5bdfd6ccd04c873ab Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Sun, 5 May 2024 00:27:47 -0700
Subject: [PATCH 40/57] update

---
 .github/workflows/llm-ppl-evaluation.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/llm-ppl-evaluation.yml b/.github/workflows/llm-ppl-evaluation.yml
index bfab5277f80..584d6fcd977 100644
--- a/.github/workflows/llm-ppl-evaluation.yml
+++ b/.github/workflows/llm-ppl-evaluation.yml
@@ -143,7 +143,8 @@ jobs:
         run: |
           echo "MODEL_PATH=${ORIGIN_DIR}/${{ matrix.model_name }}/" >> "$GITHUB_ENV"
           MODEL_PATH=${ORIGIN_DIR}/${{ matrix.model_name }}/
-          wget -r -nH -nc --no-verbose --cut-dirs=1 ${LLM_FTP_URL}/llm/${{ matrix.model_name }} -P ${ORIGIN_DIR}
+          #wget -r -nH -nc --no-verbose --cut-dirs=1 ${LLM_FTP_URL}/llm/${{ matrix.model_name }} -P ${ORIGIN_DIR}
+          wget -r -nH --no-verbose --cut-dirs=1 ${LLM_FTP_URL}/llm/${{ matrix.model_name }} -P ${ORIGIN_DIR}
           
       - name: Upgrade packages
         shell: bash

From 0fcaa40170a51e6f34f0549a8a6b9dc045016f7b Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Sun, 5 May 2024 13:58:28 -0700
Subject: [PATCH 41/57] update

---
 .github/workflows/llm-ppl-evaluation.yml |  3 +--
 .github/workflows/llm_unit_tests.yml     | 12 ------------
 2 files changed, 1 insertion(+), 14 deletions(-)

diff --git a/.github/workflows/llm-ppl-evaluation.yml b/.github/workflows/llm-ppl-evaluation.yml
index 584d6fcd977..bfab5277f80 100644
--- a/.github/workflows/llm-ppl-evaluation.yml
+++ b/.github/workflows/llm-ppl-evaluation.yml
@@ -143,8 +143,7 @@ jobs:
         run: |
           echo "MODEL_PATH=${ORIGIN_DIR}/${{ matrix.model_name }}/" >> "$GITHUB_ENV"
           MODEL_PATH=${ORIGIN_DIR}/${{ matrix.model_name }}/
-          #wget -r -nH -nc --no-verbose --cut-dirs=1 ${LLM_FTP_URL}/llm/${{ matrix.model_name }} -P ${ORIGIN_DIR}
-          wget -r -nH --no-verbose --cut-dirs=1 ${LLM_FTP_URL}/llm/${{ matrix.model_name }} -P ${ORIGIN_DIR}
+          wget -r -nH -nc --no-verbose --cut-dirs=1 ${LLM_FTP_URL}/llm/${{ matrix.model_name }} -P ${ORIGIN_DIR}
           
       - name: Upgrade packages
         shell: bash
diff --git a/.github/workflows/llm_unit_tests.yml b/.github/workflows/llm_unit_tests.yml
index 09ebf4f4421..0590531a372 100644
--- a/.github/workflows/llm_unit_tests.yml
+++ b/.github/workflows/llm_unit_tests.yml
@@ -155,9 +155,6 @@ jobs:
           #   echo "Directory $CHATGLM_INT4_CKPT_PATH not found. Downloading from FTP server..."
           #   wget --no-verbose $LLM_FTP_URL/llm/ggml-actions/stable/chatglm2-6b-q4_0.bin -P $INT4_CKPT_DIR
           # fi
-          if [ -d $ORIGINAL_CHATGLM2_6B_PATH ]; then
-            rm -rf $ORIGINAL_CHATGLM2_6B_PATH
-          fi
           if [ ! -d $ORIGINAL_CHATGLM2_6B_PATH ]; then
             echo "Directory $ORIGINAL_CHATGLM2_6B_PATH not found. Downloading from FTP server..."
             echo "wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/chatglm2-6b -P $ORIGIN_DIR"
@@ -316,9 +313,6 @@ jobs:
             echo "Directory $LLAMA2_7B_ORIGIN_PATH not found. Downloading from FTP server..."
             wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/Llama-2-7b-chat-hf -P $ORIGIN_DIR
           fi
-          if [ -d $CHATGLM2_6B_ORIGIN_PATH ]; then
-            rm -rf $CHATGLM2_6B_ORIGIN_PATH
-          fi
           if [ ! -d $CHATGLM2_6B_ORIGIN_PATH ]; then
             echo "Directory $CHATGLM2_6B_ORIGIN_PATH not found. Downloading from FTP server..."
             wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/chatglm2-6b -P $ORIGIN_DIR
@@ -327,9 +321,6 @@ jobs:
             echo "Directory $FALCON_7B_ORIGIN_PATH not found. Downloading from FTP server..."
             wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/falcon-7b-instruct-with-patch -P $ORIGIN_DIR
           fi
-          if [ -d $MPT_7B_ORIGIN_PATH ]; then
-            rm -rf $MPT_7B_ORIGIN_PATH
-          fi
           if [ ! -d $MPT_7B_ORIGIN_PATH ]; then
             echo "Directory $MPT_7B_ORIGIN_PATH not found. Downloading from FTP server..."
             wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/mpt-7b-chat -P $ORIGIN_DIR
@@ -357,9 +348,6 @@ jobs:
             echo "Directory $QWEN_7B_ORIGIN_PATH not found. Downloading from FTP server..."
             wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/Qwen-7B-Chat -P $ORIGIN_DIR
           fi
-          if [ -d $BAICHUAN2_7B_ORIGIN_PATH ]; then
-            rm -rf $BAICHUAN2_7B_ORIGIN_PATH
-          fi
           if [ ! -d $BAICHUAN2_7B_ORIGIN_PATH ]; then
             echo "Directory $BAICHUAN2_7B_ORIGIN_PATH not found. Downloading from FTP server..."
             wget -r -nH --no-verbose --cut-dirs=1 $LLM_FTP_URL/llm/Baichuan2-7B-Chat -P $ORIGIN_DIR

From 26aa194da8400226d10cd723cfc0e8f96009a44d Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Mon, 6 May 2024 12:07:20 -0700
Subject: [PATCH 42/57] update

---
 .github/workflows/llm-c-evaluation.yml        |  2 +-
 .github/workflows/llm-whisper-evaluation.yml  |  2 +-
 .github/workflows/llm_performance_tests.yml   | 82 +++++++++----------
 .../llm_tests_for_stable_version_on_arc.yml   | 12 +--
 .../llm_tests_for_stable_version_on_spr.yml   | 12 +--
 5 files changed, 55 insertions(+), 55 deletions(-)

diff --git a/.github/workflows/llm-c-evaluation.yml b/.github/workflows/llm-c-evaluation.yml
index 9ca18276c75..464579fcc26 100644
--- a/.github/workflows/llm-c-evaluation.yml
+++ b/.github/workflows/llm-c-evaluation.yml
@@ -16,7 +16,7 @@ on:
     branches: [main]
     paths:
       - ".github/workflows/llm-c-evaluation.yml"
-  # Allows you to run this workflow manually from the Actions tab
+  ## Allows you to run this workflow manually from the Actions tab
   workflow_dispatch:
     inputs:
       model_name:
diff --git a/.github/workflows/llm-whisper-evaluation.yml b/.github/workflows/llm-whisper-evaluation.yml
index e60eadbf1df..c26d66e726f 100644
--- a/.github/workflows/llm-whisper-evaluation.yml
+++ b/.github/workflows/llm-whisper-evaluation.yml
@@ -75,7 +75,7 @@ jobs:
             echo "runner=$runner" >> $GITHUB_OUTPUT
 
   llm-whisper-evaluation:
-    # if: ${{ github.event.schedule || github.event.inputs.artifact == 'llm-whisper-evaluation' || github.event.inputs.artifact == 'all' }} # please comment it for PR tests
+    #if: ${{ github.event.schedule || github.event.inputs.artifact == 'llm-whisper-evaluation' || github.event.inputs.artifact == 'all' }} # please comment it for PR tests
     needs: [llm-cpp-build, set-matrix] # please uncomment it for PR tests
     # needs: [set-matrix] # please comment it for PR tests
     strategy:
diff --git a/.github/workflows/llm_performance_tests.yml b/.github/workflows/llm_performance_tests.yml
index a0411e002d1..2b5706f20a9 100644
--- a/.github/workflows/llm_performance_tests.yml
+++ b/.github/workflows/llm_performance_tests.yml
@@ -338,45 +338,16 @@ jobs:
       # TODO: Put the ipex-llm related install process for win gpu into a action function
 
       # Please uncomment it and commment the install from pypi for PR tests
-      #- name: Download llm binary
-      #  uses: ./.github/actions/llm/download-llm-binary
-
-      #- name: Prepare for install ipex-llm from source
-      #  shell: bash
-      #  run: |
-      #    sed -i 's/"bigdl-core-xe-21==" + CORE_XE_VERSION/"bigdl-core-xe-21"/g' python/llm/setup.py
-      #    sed -i 's/"bigdl-core-xe-esimd-21==" + CORE_XE_VERSION/"bigdl-core-xe-esimd-21"/g' python/llm/setup.py
-
-      #- name: Install ipex-llm and other related packages (install from source)
-      #  shell: cmd
-      #  run: |
-      #    call conda create -n igpu-perf python=${{ matrix.python-version }} libuv -y
-      #    call conda activate igpu-perf
-
-      #    pip install --upgrade pip
-      #    pip install --upgrade wheel
-      #    pip install --upgrade omegaconf pandas
-      #    pip install --upgrade tiktoken einops transformers_stream_generator
-
-      #    cd python\llm
-      #    python setup.py clean --all bdist_wheel --win
-      #    if not exist dist\ipex_llm*.whl (exit /b 1)
-      #    for %%i in (dist\ipex_llm*.whl) do set whl_name=%%i
-
-      #    pip install --pre --upgrade %whl_name%[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
-      #    if %ERRORLEVEL% neq 0 (exit /b 1)
-      #    pip install transformers==4.36.2
-      #    pip list
-
-      #    call conda deactivate
+      - name: Download llm binary
+        uses: ./.github/actions/llm/download-llm-binary
 
-      - name: Determine desired ipex-llm version
+      - name: Prepare for install ipex-llm from source
         shell: bash
         run: |
-          test_version_date=`date -d 'yesterday' '+%Y%m%d'`
-          echo "TEST_VERSION_DATE=${test_version_date}" >> "$GITHUB_ENV"
+          sed -i 's/"bigdl-core-xe-21==" + CORE_XE_VERSION/"bigdl-core-xe-21"/g' python/llm/setup.py
+          sed -i 's/"bigdl-core-xe-esimd-21==" + CORE_XE_VERSION/"bigdl-core-xe-esimd-21"/g' python/llm/setup.py
 
-      - name: Install ipex-llm and other related packages (install from pypi)
+      - name: Install ipex-llm and other related packages (install from source)
         shell: cmd
         run: |
           call conda create -n igpu-perf python=${{ matrix.python-version }} libuv -y
@@ -387,17 +358,46 @@ jobs:
           pip install --upgrade omegaconf pandas
           pip install --upgrade tiktoken einops transformers_stream_generator
 
-          pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
-          pip show ipex-llm | findstr %TEST_VERSION_DATE%
-          if %ERRORLEVEL% neq 0 (
-            echo "Did not install ipex-llm with excepted version %TEST_VERSION_DATE%"
-            exit /b 1
-          )
+          cd python\llm
+          python setup.py clean --all bdist_wheel --win
+          if not exist dist\ipex_llm*.whl (exit /b 1)
+          for %%i in (dist\ipex_llm*.whl) do set whl_name=%%i
+
+          pip install --pre --upgrade %whl_name%[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
+          if %ERRORLEVEL% neq 0 (exit /b 1)
           pip install transformers==4.36.2
           pip list
 
           call conda deactivate
 
+      #- name: Determine desired ipex-llm version
+      #  shell: bash
+      #  run: |
+      #    test_version_date=`date -d 'yesterday' '+%Y%m%d'`
+      #    echo "TEST_VERSION_DATE=${test_version_date}" >> "$GITHUB_ENV"
+
+      #- name: Install ipex-llm and other related packages (install from pypi)
+      #  shell: cmd
+      #  run: |
+      #    call conda create -n igpu-perf python=${{ matrix.python-version }} libuv -y
+      #    call conda activate igpu-perf
+
+      #    pip install --upgrade pip
+      #    pip install --upgrade wheel
+      #    pip install --upgrade omegaconf pandas
+      #    pip install --upgrade tiktoken einops transformers_stream_generator
+
+      #    pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
+      #    pip show ipex-llm | findstr %TEST_VERSION_DATE%
+      #    if %ERRORLEVEL% neq 0 (
+      #      echo "Did not install ipex-llm with excepted version %TEST_VERSION_DATE%"
+      #      exit /b 1
+      #    )
+      #    pip install transformers==4.36.2
+      #    pip list
+
+      #    call conda deactivate
+
       - name: Create env for html generation
         shell: cmd
         run: |
diff --git a/.github/workflows/llm_tests_for_stable_version_on_arc.yml b/.github/workflows/llm_tests_for_stable_version_on_arc.yml
index 1b8c48d972d..297236af77d 100644
--- a/.github/workflows/llm_tests_for_stable_version_on_arc.yml
+++ b/.github/workflows/llm_tests_for_stable_version_on_arc.yml
@@ -10,12 +10,12 @@ permissions:
 
 # Controls when the action will run.
 on:
-  # pull_request:
-  #   branches: [main]
-  #   paths:
-  #     - ".github/workflows/llm_performance_tests.yml"
-  #     - "python/llm/test/benchmark/**"
-  #     - "python/llm/dev/benchmark/all-in-one/**"
+  pull_request:
+    branches: [main]
+    paths:
+      - ".github/workflows/llm_performance_tests.yml"
+      - "python/llm/test/benchmark/**"
+      - "python/llm/dev/benchmark/all-in-one/**"
   workflow_dispatch:
   workflow_call:
 
diff --git a/.github/workflows/llm_tests_for_stable_version_on_spr.yml b/.github/workflows/llm_tests_for_stable_version_on_spr.yml
index d852499c57b..8a18984cf5a 100644
--- a/.github/workflows/llm_tests_for_stable_version_on_spr.yml
+++ b/.github/workflows/llm_tests_for_stable_version_on_spr.yml
@@ -10,12 +10,12 @@ permissions:
 
 # Controls when the action will run.
 on:
-  # pull_request:
-  #   branches: [main]
-  #   paths:
-  #     - ".github/workflows/llm_performance_tests.yml"
-  #     - "python/llm/test/benchmark/**"
-  #     - "python/llm/dev/benchmark/all-in-one/**"
+  pull_request:
+    branches: [main]
+    paths:
+      - ".github/workflows/llm_performance_tests.yml"
+      - "python/llm/test/benchmark/**"
+      - "python/llm/dev/benchmark/all-in-one/**"
   workflow_dispatch:
   workflow_call:
 

From 22d0bf64007ec003f7a22dd3c795709ce2b3c638 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Mon, 6 May 2024 15:24:26 -0700
Subject: [PATCH 43/57] update

---
 .../stable-version-arc-perf-test-fp8-436.yaml | 27 +++++++++++++++++++
 .../stable-version-arc-perf-test-fp8.yaml     | 22 +++++++--------
 ...le-version-arc-perf-test-sym_int4-436.yaml | 27 +++++++++++++++++++
 ...stable-version-arc-perf-test-sym_int4.yaml | 22 +++++++--------
 ...table-version-arc-stress-test-fp8-436.yaml | 18 +++++++++++++
 .../stable-version-arc-stress-test-fp8.yaml   |  8 +++---
 ...-version-arc-stress-test-sym_int4-436.yaml | 16 +++++++++++
 ...able-version-arc-stress-test-sym_int4.yaml |  6 ++---
 .../stable-version-cpu-perf-test-436.yaml     | 26 ++++++++++++++++++
 .../stable-version-cpu-perf-test.yaml         | 10 +++----
 .../stable-version-cpu-stress-test.yaml       | 10 +++----
 11 files changed, 153 insertions(+), 39 deletions(-)
 create mode 100644 python/llm/test/benchmark/stable-version-arc-perf-test-fp8-436.yaml
 create mode 100644 python/llm/test/benchmark/stable-version-arc-perf-test-sym_int4-436.yaml
 create mode 100644 python/llm/test/benchmark/stable-version-arc-stress-test-fp8-436.yaml
 create mode 100644 python/llm/test/benchmark/stable-version-arc-stress-test-sym_int4-436.yaml
 create mode 100644 python/llm/test/benchmark/stable-version-cpu-perf-test-436.yaml

diff --git a/python/llm/test/benchmark/stable-version-arc-perf-test-fp8-436.yaml b/python/llm/test/benchmark/stable-version-arc-perf-test-fp8-436.yaml
new file mode 100644
index 00000000000..0a0795da086
--- /dev/null
+++ b/python/llm/test/benchmark/stable-version-arc-perf-test-fp8-436.yaml
@@ -0,0 +1,27 @@
+repo_id:
+  - 'THUDM/chatglm2-6b'
+  - 'THUDM/chatglm3-6b'
+  - 'baichuan-inc/Baichuan2-7B-Chat'
+local_model_hub: '/mnt/disk0/models'
+warm_up: 1
+num_trials: 3
+num_beams: 1 # default to greedy search
+low_bit: 'fp8' # default to use 'sym_int4' (i.e. symmetric int4)
+batch_size: 1 # default to 1
+in_out_pairs:
+  - '32-32'
+  - '512-256'
+  - '1024-128'
+  - '2048-256'
+test_api:
+  - "transformer_int4_gpu"  # on Intel GPU
+cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
+exclude:
+  - 'THUDM/chatglm2-6b:2048:8'
+  - 'THUDM/chatglm3-6b:2048:8'
+  - 'baichuan-inc/Baichuan2-7B-Chat:2048:2'
+  - 'baichuan-inc/Baichuan2-7B-Chat:1024:4'
+  - 'baichuan-inc/Baichuan2-7B-Chat:2048:4'
+  - 'baichuan-inc/Baichuan2-7B-Chat:512:8'
+  - 'baichuan-inc/Baichuan2-7B-Chat:1024:8'
+  - 'baichuan-inc/Baichuan2-7B-Chat:2048:8'
diff --git a/python/llm/test/benchmark/stable-version-arc-perf-test-fp8.yaml b/python/llm/test/benchmark/stable-version-arc-perf-test-fp8.yaml
index 00884dbe21d..87153f83dcf 100644
--- a/python/llm/test/benchmark/stable-version-arc-perf-test-fp8.yaml
+++ b/python/llm/test/benchmark/stable-version-arc-perf-test-fp8.yaml
@@ -1,8 +1,8 @@
 repo_id:
   - 'meta-llama/Llama-2-7b-chat-hf'
-  - 'THUDM/chatglm2-6b'
-  - 'THUDM/chatglm3-6b'
-  - 'baichuan-inc/Baichuan2-7B-Chat'
+#  - 'THUDM/chatglm2-6b'
+#  - 'THUDM/chatglm3-6b'
+#  - 'baichuan-inc/Baichuan2-7B-Chat'
   - 'Qwen/Qwen-7B-Chat'
 local_model_hub: '/mnt/disk1/models'
 warm_up: 1
@@ -23,14 +23,14 @@ exclude:
   - 'meta-llama/Llama-2-7b-chat-hf:512:8'
   - 'meta-llama/Llama-2-7b-chat-hf:1024:8'
   - 'meta-llama/Llama-2-7b-chat-hf:2048:8'
-  - 'THUDM/chatglm2-6b:2048:8'
-  - 'THUDM/chatglm3-6b:2048:8'
-  - 'baichuan-inc/Baichuan2-7B-Chat:2048:2'
-  - 'baichuan-inc/Baichuan2-7B-Chat:1024:4'
-  - 'baichuan-inc/Baichuan2-7B-Chat:2048:4'
-  - 'baichuan-inc/Baichuan2-7B-Chat:512:8'
-  - 'baichuan-inc/Baichuan2-7B-Chat:1024:8'
-  - 'baichuan-inc/Baichuan2-7B-Chat:2048:8'
+#  - 'THUDM/chatglm2-6b:2048:8'
+#  - 'THUDM/chatglm3-6b:2048:8'
+#  - 'baichuan-inc/Baichuan2-7B-Chat:2048:2'
+#  - 'baichuan-inc/Baichuan2-7B-Chat:1024:4'
+#  - 'baichuan-inc/Baichuan2-7B-Chat:2048:4'
+#  - 'baichuan-inc/Baichuan2-7B-Chat:512:8'
+#  - 'baichuan-inc/Baichuan2-7B-Chat:1024:8'
+#  - 'baichuan-inc/Baichuan2-7B-Chat:2048:8'
   - 'Qwen/Qwen-7B-Chat:2048:1'
   - 'Qwen/Qwen-7B-Chat:1024:2'
   - 'Qwen/Qwen-7B-Chat:2048:2'
diff --git a/python/llm/test/benchmark/stable-version-arc-perf-test-sym_int4-436.yaml b/python/llm/test/benchmark/stable-version-arc-perf-test-sym_int4-436.yaml
new file mode 100644
index 00000000000..549b46625fc
--- /dev/null
+++ b/python/llm/test/benchmark/stable-version-arc-perf-test-sym_int4-436.yaml
@@ -0,0 +1,27 @@
+repo_id:
+  - 'THUDM/chatglm2-6b'
+  - 'THUDM/chatglm3-6b'
+  - 'baichuan-inc/Baichuan2-7B-Chat'
+local_model_hub: '/mnt/disk0/models'
+warm_up: 1
+num_trials: 3
+num_beams: 1 # default to greedy search
+low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
+batch_size: 1 # default to 1
+in_out_pairs:
+  - '32-32'
+  - '512-256'
+  - '1024-128'
+  - '2048-256'
+test_api:
+  - "transformer_int4_gpu"  # on Intel GPU
+cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
+exclude:
+  - 'THUDM/chatglm2-6b:2048:8'
+  - 'THUDM/chatglm3-6b:2048:8'
+  - 'baichuan-inc/Baichuan2-7B-Chat:2048:2'
+  - 'baichuan-inc/Baichuan2-7B-Chat:1024:4'
+  - 'baichuan-inc/Baichuan2-7B-Chat:2048:4'
+  - 'baichuan-inc/Baichuan2-7B-Chat:512:8'
+  - 'baichuan-inc/Baichuan2-7B-Chat:1024:8'
+  - 'baichuan-inc/Baichuan2-7B-Chat:2048:8'
diff --git a/python/llm/test/benchmark/stable-version-arc-perf-test-sym_int4.yaml b/python/llm/test/benchmark/stable-version-arc-perf-test-sym_int4.yaml
index cb9f7b30e9c..d31f892fe17 100644
--- a/python/llm/test/benchmark/stable-version-arc-perf-test-sym_int4.yaml
+++ b/python/llm/test/benchmark/stable-version-arc-perf-test-sym_int4.yaml
@@ -1,8 +1,8 @@
 repo_id:
   - 'meta-llama/Llama-2-7b-chat-hf'
-  - 'THUDM/chatglm2-6b'
-  - 'THUDM/chatglm3-6b'
-  - 'baichuan-inc/Baichuan2-7B-Chat'
+#  - 'THUDM/chatglm2-6b'
+#  - 'THUDM/chatglm3-6b'
+#  - 'baichuan-inc/Baichuan2-7B-Chat'
   - 'Qwen/Qwen-7B-Chat'
 local_model_hub: '/mnt/disk1/models'
 warm_up: 1
@@ -22,14 +22,14 @@ exclude:
   - 'meta-llama/Llama-2-7b-chat-hf:2048:4'
   - 'meta-llama/Llama-2-7b-chat-hf:1024:8'
   - 'meta-llama/Llama-2-7b-chat-hf:2048:8'
-  - 'THUDM/chatglm2-6b:2048:8'
-  - 'THUDM/chatglm3-6b:2048:8'
-  - 'baichuan-inc/Baichuan2-7B-Chat:2048:2'
-  - 'baichuan-inc/Baichuan2-7B-Chat:1024:4'
-  - 'baichuan-inc/Baichuan2-7B-Chat:2048:4'
-  - 'baichuan-inc/Baichuan2-7B-Chat:512:8'
-  - 'baichuan-inc/Baichuan2-7B-Chat:1024:8'
-  - 'baichuan-inc/Baichuan2-7B-Chat:2048:8'
+#  - 'THUDM/chatglm2-6b:2048:8'
+#  - 'THUDM/chatglm3-6b:2048:8'
+#  - 'baichuan-inc/Baichuan2-7B-Chat:2048:2'
+#  - 'baichuan-inc/Baichuan2-7B-Chat:1024:4'
+#  - 'baichuan-inc/Baichuan2-7B-Chat:2048:4'
+#  - 'baichuan-inc/Baichuan2-7B-Chat:512:8'
+#  - 'baichuan-inc/Baichuan2-7B-Chat:1024:8'
+#  - 'baichuan-inc/Baichuan2-7B-Chat:2048:8'
   - 'Qwen/Qwen-7B-Chat:2048:2'
   - 'Qwen/Qwen-7B-Chat:1024:4'
   - 'Qwen/Qwen-7B-Chat:2048:4'
diff --git a/python/llm/test/benchmark/stable-version-arc-stress-test-fp8-436.yaml b/python/llm/test/benchmark/stable-version-arc-stress-test-fp8-436.yaml
new file mode 100644
index 00000000000..e887328e435
--- /dev/null
+++ b/python/llm/test/benchmark/stable-version-arc-stress-test-fp8-436.yaml
@@ -0,0 +1,18 @@
+repo_id:
+  - 'THUDM/chatglm2-6b'
+  - 'THUDM/chatglm3-6b'
+  - 'baichuan-inc/Baichuan2-7B-Chat'
+local_model_hub: '/mnt/disk0/models'
+warm_up: 10
+num_trials: 100
+num_beams: 1 # default to greedy search
+low_bit: 'fp8' # default to use 'sym_int4' (i.e. symmetric int4)
+batch_size: 1 # default to 1
+in_out_pairs:
+  - '1024-512'
+  - '2048-512'
+test_api:
+  - "transformer_int4_gpu"  # on Intel GPU
+cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
+exclude:
+  - 'baichuan-inc/Baichuan2-7B-Chat:2048'
diff --git a/python/llm/test/benchmark/stable-version-arc-stress-test-fp8.yaml b/python/llm/test/benchmark/stable-version-arc-stress-test-fp8.yaml
index bc64ad92305..7e2f14042ef 100644
--- a/python/llm/test/benchmark/stable-version-arc-stress-test-fp8.yaml
+++ b/python/llm/test/benchmark/stable-version-arc-stress-test-fp8.yaml
@@ -1,8 +1,8 @@
 repo_id:
   - 'meta-llama/Llama-2-7b-chat-hf'
-  - 'THUDM/chatglm2-6b'
-  - 'THUDM/chatglm3-6b'
-  - 'baichuan-inc/Baichuan2-7B-Chat'
+#  - 'THUDM/chatglm2-6b'
+#  - 'THUDM/chatglm3-6b'
+#  - 'baichuan-inc/Baichuan2-7B-Chat'
   - 'Qwen/Qwen-7B-Chat'
 local_model_hub: '/mnt/disk1/models'
 warm_up: 10
@@ -17,5 +17,5 @@ test_api:
   - "transformer_int4_gpu"  # on Intel GPU
 cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
 exclude:
-  - 'baichuan-inc/Baichuan2-7B-Chat:2048'
+#  - 'baichuan-inc/Baichuan2-7B-Chat:2048'
   - 'Qwen/Qwen-7B-Chat:2048'
\ No newline at end of file
diff --git a/python/llm/test/benchmark/stable-version-arc-stress-test-sym_int4-436.yaml b/python/llm/test/benchmark/stable-version-arc-stress-test-sym_int4-436.yaml
new file mode 100644
index 00000000000..ef0b6324eb5
--- /dev/null
+++ b/python/llm/test/benchmark/stable-version-arc-stress-test-sym_int4-436.yaml
@@ -0,0 +1,16 @@
+repo_id:
+  - 'THUDM/chatglm2-6b'
+  - 'THUDM/chatglm3-6b'
+  - 'baichuan-inc/Baichuan2-7B-Chat'
+  local_model_hub: '/mnt/disk0/models'
+warm_up: 10
+num_trials: 100
+num_beams: 1 # default to greedy search
+low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
+batch_size: 1 # default to 1
+in_out_pairs:
+  - '1024-512'
+  - '2048-512'
+test_api:
+  - "transformer_int4_gpu"  # on Intel GPU
+cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
diff --git a/python/llm/test/benchmark/stable-version-arc-stress-test-sym_int4.yaml b/python/llm/test/benchmark/stable-version-arc-stress-test-sym_int4.yaml
index 119a67e32fa..405c0010a05 100644
--- a/python/llm/test/benchmark/stable-version-arc-stress-test-sym_int4.yaml
+++ b/python/llm/test/benchmark/stable-version-arc-stress-test-sym_int4.yaml
@@ -1,8 +1,8 @@
 repo_id:
   - 'meta-llama/Llama-2-7b-chat-hf'
-  - 'THUDM/chatglm2-6b'
-  - 'THUDM/chatglm3-6b'
-  - 'baichuan-inc/Baichuan2-7B-Chat'
+#  - 'THUDM/chatglm2-6b'
+#  - 'THUDM/chatglm3-6b'
+#  - 'baichuan-inc/Baichuan2-7B-Chat'
   - 'Qwen/Qwen-7B-Chat'
 local_model_hub: '/mnt/disk1/models'
 warm_up: 10
diff --git a/python/llm/test/benchmark/stable-version-cpu-perf-test-436.yaml b/python/llm/test/benchmark/stable-version-cpu-perf-test-436.yaml
new file mode 100644
index 00000000000..c7ba0c1ded8
--- /dev/null
+++ b/python/llm/test/benchmark/stable-version-cpu-perf-test-436.yaml
@@ -0,0 +1,26 @@
+repo_id:
+  - 'THUDM/chatglm2-6b'
+  - 'THUDM/chatglm3-6b'
+  - 'baichuan-inc/Baichuan2-7B-Chat'
+  - 'baichuan-inc/Baichuan2-13B-Chat'
+local_model_hub: '/mnt/disk1/models/updated_for_4.36'
+warm_up: 1
+num_trials: 3
+num_beams: 1 # default to greedy search
+low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
+batch_size: 1 # default to 1
+in_out_pairs:
+  - '32-32'
+  - '1024-128'
+  - '2048-256'
+test_api:
+  - "transformer_int4"
+  # - "native_int4"
+  # - "optimize_model"
+  # - "pytorch_autocast_bf16"
+  # - "ipex_fp16_gpu" # on Intel GPU
+  # - "transformer_int4_gpu"  # on Intel GPU
+  # - "optimize_model_gpu"  # on Intel GPU
+  # - "deepspeed_transformer_int4_cpu" # on Intel SPR Server
+  # - "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
+cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
diff --git a/python/llm/test/benchmark/stable-version-cpu-perf-test.yaml b/python/llm/test/benchmark/stable-version-cpu-perf-test.yaml
index aa9158bdd13..6095e2ed805 100644
--- a/python/llm/test/benchmark/stable-version-cpu-perf-test.yaml
+++ b/python/llm/test/benchmark/stable-version-cpu-perf-test.yaml
@@ -1,12 +1,12 @@
 repo_id:
   - 'meta-llama/Llama-2-7b-chat-hf'
   - 'meta-llama/Llama-2-13b-chat-hf'
-  - 'THUDM/chatglm2-6b'
-  - 'THUDM/chatglm3-6b'
-  - 'baichuan-inc/Baichuan2-7B-Chat'
-  - 'baichuan-inc/Baichuan2-13B-Chat'
+#  - 'THUDM/chatglm2-6b'
+#  - 'THUDM/chatglm3-6b'
+#  - 'baichuan-inc/Baichuan2-7B-Chat'
+#  - 'baichuan-inc/Baichuan2-13B-Chat'
   - 'Qwen/Qwen-14B-Chat'
-local_model_hub: '/models'
+local_model_hub: '/mnt/disk1/models'
 warm_up: 1
 num_trials: 3
 num_beams: 1 # default to greedy search
diff --git a/python/llm/test/benchmark/stable-version-cpu-stress-test.yaml b/python/llm/test/benchmark/stable-version-cpu-stress-test.yaml
index 38aeb375910..ee17bbaf53a 100644
--- a/python/llm/test/benchmark/stable-version-cpu-stress-test.yaml
+++ b/python/llm/test/benchmark/stable-version-cpu-stress-test.yaml
@@ -1,12 +1,12 @@
 repo_id:
   - 'meta-llama/Llama-2-7b-chat-hf'
   - 'meta-llama/Llama-2-13b-chat-hf'
-  - 'THUDM/chatglm2-6b'
-  - 'THUDM/chatglm3-6b'
-  - 'baichuan-inc/Baichuan2-7B-Chat'
-  - 'baichuan-inc/Baichuan2-13B-Chat'
+#  - 'THUDM/chatglm2-6b'
+#  - 'THUDM/chatglm3-6b'
+#  - 'baichuan-inc/Baichuan2-7B-Chat'
+#  - 'baichuan-inc/Baichuan2-13B-Chat'
   - 'Qwen/Qwen-14B-Chat'
-local_model_hub: '/models'
+local_model_hub: '/mnt/disk1/models'
 warm_up: 3
 num_trials: 50
 num_beams: 1 # default to greedy search

From 65ea875221e27c7f3816c00a24a6e6766c4cbc9f Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Tue, 7 May 2024 15:18:51 -0700
Subject: [PATCH 44/57] update

---
 .github/workflows/llm-ppl-evaluation.yml      |  5 ++--
 .github/workflows/llm_performance_tests.yml   | 23 ++-----------------
 python/llm/test/benchmark/arc-perf-test.yaml  | 12 +++++-----
 python/llm/test/benchmark/cpu-perf-test.yaml  |  8 +++----
 .../stable-version-arc-perf-test-fp8.yaml     |  6 ++---
 ...stable-version-arc-perf-test-sym_int4.yaml |  6 ++---
 .../stable-version-arc-stress-test-fp8.yaml   |  6 ++---
 ...able-version-arc-stress-test-sym_int4.yaml |  6 ++---
 .../stable-version-cpu-perf-test.yaml         |  8 +++----
 .../stable-version-cpu-stress-test.yaml       |  8 +++----
 10 files changed, 35 insertions(+), 53 deletions(-)

diff --git a/.github/workflows/llm-ppl-evaluation.yml b/.github/workflows/llm-ppl-evaluation.yml
index bfab5277f80..ceb53f8b378 100644
--- a/.github/workflows/llm-ppl-evaluation.yml
+++ b/.github/workflows/llm-ppl-evaluation.yml
@@ -71,7 +71,7 @@ jobs:
         if: ${{github.event_name == 'pull_request'}}
         env:
           PR_MATRIX_SEQ_LEN: '["512"]'
-          PR_MATRIX_MODEL_NAME: '["Llama-2-7b-chat-hf", "chatglm3-6b"]'
+          PR_MATRIX_MODEL_NAME: '["Llama-2-7b-chat-hf", "chatglm3-6b", "chatglm3-6b", "Baichuan2-7B-Chat","mpt-7b-chat"]'
           PR_MATRIX_PRECISION: '["sym_int4"]'
           PR_LABELS: '["self-hosted", "llm", "temp-arc01"]'
         run: |
@@ -143,7 +143,8 @@ jobs:
         run: |
           echo "MODEL_PATH=${ORIGIN_DIR}/${{ matrix.model_name }}/" >> "$GITHUB_ENV"
           MODEL_PATH=${ORIGIN_DIR}/${{ matrix.model_name }}/
-          wget -r -nH -nc --no-verbose --cut-dirs=1 ${LLM_FTP_URL}/llm/${{ matrix.model_name }} -P ${ORIGIN_DIR}
+          #wget -r -nH -nc --no-verbose --cut-dirs=1 ${LLM_FTP_URL}/llm/${{ matrix.model_name }} -P ${ORIGIN_DIR}
+          wget -r -nH --no-verbose --cut-dirs=1 ${LLM_FTP_URL}/llm/${{ matrix.model_name }} -P ${ORIGIN_DIR}
           
       - name: Upgrade packages
         shell: bash
diff --git a/.github/workflows/llm_performance_tests.yml b/.github/workflows/llm_performance_tests.yml
index 2b5706f20a9..55a6bcb67d6 100644
--- a/.github/workflows/llm_performance_tests.yml
+++ b/.github/workflows/llm_performance_tests.yml
@@ -102,13 +102,6 @@ jobs:
           # change csv name
           sed -i 's/{today}/{today}_test1/g' run.py
           python run.py
-          # run updated models for 4.36
-          cd -
-          cp python/llm/test/benchmark/arc-perf-test-436.yaml python/llm/dev/benchmark/all-in-one/config.yaml
-          cd python/llm/dev/benchmark/all-in-one
-          # change csv name
-          sed -i 's/test1/test2/g' run.py
-          python run.py
 
       #- name: Test on xpu(transformers==4.34.0)
       #  shell: bash
@@ -135,7 +128,7 @@ jobs:
           cp python/llm/test/benchmark/arc-perf-transformers-437.yaml python/llm/dev/benchmark/all-in-one/config.yaml
           cd python/llm/dev/benchmark/all-in-one
           # change csv name
-          sed -i 's/test2/test3/g' run.py
+          sed -i 's/test1/test2/g' run.py
           python run.py
 
       - name: Concat csv and generate html
@@ -157,8 +150,7 @@ jobs:
         run: |
           cd python/llm/dev/benchmark/all-in-one
           python ../../../test/benchmark/check_results.py -c test1 -y ../../../test/benchmark/arc-perf-test.yaml
-          python ../../../test/benchmark/check_results.py -c test2 -y ../../../test/benchmark/arc-perf-test-436.yaml
-          python ../../../test/benchmark/check_results.py -c test3 -y ../../../test/benchmark/arc-perf-transformers-437.yaml
+          python ../../../test/benchmark/check_results.py -c test2 -y ../../../test/benchmark/arc-perf-transformers-437.yaml
           # python ../../../test/benchmark/check_results.py -c test2 -y ../../../test/benchmark/arc-perf-transformers-434.yaml
           find . -name "*test*.csv" -delete
           if [ ${{ github.event_name }} == "schedule" ] || [ ${{ github.event_name }} == "workflow_dispatch" ]; then
@@ -227,18 +219,7 @@ jobs:
           export OMP_NUM_THREADS=48
           # hide time info
           sed -i 's/str(end - st)/"xxxxxx"/g' run.py
-          # change csv name
-          sed -i 's/{today}/{today}_test1/g' run.py
           python run.py
-          # run updated models for 4.36
-          cd -
-          cp python/llm/test/benchmark/cpu-perf-test-436.yaml python/llm/dev/benchmark/all-in-one/config.yaml
-          cd python/llm/dev/benchmark/all-in-one
-          # change csv name
-          sed -i 's/test1/test2/g' run.py
-          python run.py
-          python ../../../test/benchmark/concat_csv.py
-          find . -name "*test*.csv" -delete
           cp ./*.csv /mnt/disk1/models/nightly_perf_cpu
           cd ../../../test/benchmark
           python -m pip install pandas==1.5.3
diff --git a/python/llm/test/benchmark/arc-perf-test.yaml b/python/llm/test/benchmark/arc-perf-test.yaml
index 7552b7f84f1..4f1d6159dc9 100644
--- a/python/llm/test/benchmark/arc-perf-test.yaml
+++ b/python/llm/test/benchmark/arc-perf-test.yaml
@@ -1,22 +1,22 @@
 repo_id:
   - 'meta-llama/Llama-2-7b-chat-hf'
   - 'meta-llama/Llama-2-13b-chat-hf'
-#  - 'THUDM/chatglm2-6b'
+  - 'THUDM/chatglm2-6b'
   - 'THUDM/chatglm3-6b-4bit'
   - 'tiiuae/falcon-7b-instruct-with-patch'
-#  - 'mosaicml/mpt-7b-chat'
+  - 'mosaicml/mpt-7b-chat'
   - 'redpajama/gptneox-7b-redpajama-bf16'
   - 'bigcode/starcoder-15.5b-4bit'
   - 'databricks/dolly-v1-6b'
   - 'databricks/dolly-v2-7b'
   - 'databricks/dolly-v2-12b'
-#  - 'internlm/internlm-chat-7b'
+  - 'internlm/internlm-chat-7b'
   - 'Qwen/Qwen-7B-Chat'
   - 'BAAI/AquilaChat-7B'
-#  - 'baichuan-inc/Baichuan2-7B-Chat'
-#  - 'baichuan-inc/Baichuan2-13B-Chat-4bit'
+  - 'baichuan-inc/Baichuan2-7B-Chat'
+  - 'baichuan-inc/Baichuan2-13B-Chat-4bit'
   - 'bigscience/bloomz-7b1'
-#  - 'fnlp/moss-moon-003-sft-4bit'
+#  - 'fnlp/moss-moon-003-sft-4bit' # moss-moon-003-sft cannot work on transformers 4.34+
   - 'mistralai/Mistral-7B-v0.1'
 local_model_hub: '/mnt/disk1/models'
 warm_up: 1
diff --git a/python/llm/test/benchmark/cpu-perf-test.yaml b/python/llm/test/benchmark/cpu-perf-test.yaml
index 6095e2ed805..92b12750dbb 100644
--- a/python/llm/test/benchmark/cpu-perf-test.yaml
+++ b/python/llm/test/benchmark/cpu-perf-test.yaml
@@ -1,10 +1,10 @@
 repo_id:
   - 'meta-llama/Llama-2-7b-chat-hf'
   - 'meta-llama/Llama-2-13b-chat-hf'
-#  - 'THUDM/chatglm2-6b'
-#  - 'THUDM/chatglm3-6b'
-#  - 'baichuan-inc/Baichuan2-7B-Chat'
-#  - 'baichuan-inc/Baichuan2-13B-Chat'
+  - 'THUDM/chatglm2-6b'
+  - 'THUDM/chatglm3-6b'
+  - 'baichuan-inc/Baichuan2-7B-Chat'
+  - 'baichuan-inc/Baichuan2-13B-Chat'
   - 'Qwen/Qwen-14B-Chat'
 local_model_hub: '/mnt/disk1/models'
 warm_up: 1
diff --git a/python/llm/test/benchmark/stable-version-arc-perf-test-fp8.yaml b/python/llm/test/benchmark/stable-version-arc-perf-test-fp8.yaml
index 87153f83dcf..0cd4a9b2fe3 100644
--- a/python/llm/test/benchmark/stable-version-arc-perf-test-fp8.yaml
+++ b/python/llm/test/benchmark/stable-version-arc-perf-test-fp8.yaml
@@ -1,8 +1,8 @@
 repo_id:
   - 'meta-llama/Llama-2-7b-chat-hf'
-#  - 'THUDM/chatglm2-6b'
-#  - 'THUDM/chatglm3-6b'
-#  - 'baichuan-inc/Baichuan2-7B-Chat'
+  - 'THUDM/chatglm2-6b'
+  - 'THUDM/chatglm3-6b'
+  - 'baichuan-inc/Baichuan2-7B-Chat'
   - 'Qwen/Qwen-7B-Chat'
 local_model_hub: '/mnt/disk1/models'
 warm_up: 1
diff --git a/python/llm/test/benchmark/stable-version-arc-perf-test-sym_int4.yaml b/python/llm/test/benchmark/stable-version-arc-perf-test-sym_int4.yaml
index d31f892fe17..b1a2b2536c0 100644
--- a/python/llm/test/benchmark/stable-version-arc-perf-test-sym_int4.yaml
+++ b/python/llm/test/benchmark/stable-version-arc-perf-test-sym_int4.yaml
@@ -1,8 +1,8 @@
 repo_id:
   - 'meta-llama/Llama-2-7b-chat-hf'
-#  - 'THUDM/chatglm2-6b'
-#  - 'THUDM/chatglm3-6b'
-#  - 'baichuan-inc/Baichuan2-7B-Chat'
+  - 'THUDM/chatglm2-6b'
+  - 'THUDM/chatglm3-6b'
+  - 'baichuan-inc/Baichuan2-7B-Chat'
   - 'Qwen/Qwen-7B-Chat'
 local_model_hub: '/mnt/disk1/models'
 warm_up: 1
diff --git a/python/llm/test/benchmark/stable-version-arc-stress-test-fp8.yaml b/python/llm/test/benchmark/stable-version-arc-stress-test-fp8.yaml
index 7e2f14042ef..80a67d71e3f 100644
--- a/python/llm/test/benchmark/stable-version-arc-stress-test-fp8.yaml
+++ b/python/llm/test/benchmark/stable-version-arc-stress-test-fp8.yaml
@@ -1,8 +1,8 @@
 repo_id:
   - 'meta-llama/Llama-2-7b-chat-hf'
-#  - 'THUDM/chatglm2-6b'
-#  - 'THUDM/chatglm3-6b'
-#  - 'baichuan-inc/Baichuan2-7B-Chat'
+  - 'THUDM/chatglm2-6b'
+  - 'THUDM/chatglm3-6b'
+  - 'baichuan-inc/Baichuan2-7B-Chat'
   - 'Qwen/Qwen-7B-Chat'
 local_model_hub: '/mnt/disk1/models'
 warm_up: 10
diff --git a/python/llm/test/benchmark/stable-version-arc-stress-test-sym_int4.yaml b/python/llm/test/benchmark/stable-version-arc-stress-test-sym_int4.yaml
index 405c0010a05..119a67e32fa 100644
--- a/python/llm/test/benchmark/stable-version-arc-stress-test-sym_int4.yaml
+++ b/python/llm/test/benchmark/stable-version-arc-stress-test-sym_int4.yaml
@@ -1,8 +1,8 @@
 repo_id:
   - 'meta-llama/Llama-2-7b-chat-hf'
-#  - 'THUDM/chatglm2-6b'
-#  - 'THUDM/chatglm3-6b'
-#  - 'baichuan-inc/Baichuan2-7B-Chat'
+  - 'THUDM/chatglm2-6b'
+  - 'THUDM/chatglm3-6b'
+  - 'baichuan-inc/Baichuan2-7B-Chat'
   - 'Qwen/Qwen-7B-Chat'
 local_model_hub: '/mnt/disk1/models'
 warm_up: 10
diff --git a/python/llm/test/benchmark/stable-version-cpu-perf-test.yaml b/python/llm/test/benchmark/stable-version-cpu-perf-test.yaml
index 6095e2ed805..92b12750dbb 100644
--- a/python/llm/test/benchmark/stable-version-cpu-perf-test.yaml
+++ b/python/llm/test/benchmark/stable-version-cpu-perf-test.yaml
@@ -1,10 +1,10 @@
 repo_id:
   - 'meta-llama/Llama-2-7b-chat-hf'
   - 'meta-llama/Llama-2-13b-chat-hf'
-#  - 'THUDM/chatglm2-6b'
-#  - 'THUDM/chatglm3-6b'
-#  - 'baichuan-inc/Baichuan2-7B-Chat'
-#  - 'baichuan-inc/Baichuan2-13B-Chat'
+  - 'THUDM/chatglm2-6b'
+  - 'THUDM/chatglm3-6b'
+  - 'baichuan-inc/Baichuan2-7B-Chat'
+  - 'baichuan-inc/Baichuan2-13B-Chat'
   - 'Qwen/Qwen-14B-Chat'
 local_model_hub: '/mnt/disk1/models'
 warm_up: 1
diff --git a/python/llm/test/benchmark/stable-version-cpu-stress-test.yaml b/python/llm/test/benchmark/stable-version-cpu-stress-test.yaml
index ee17bbaf53a..f8c75489659 100644
--- a/python/llm/test/benchmark/stable-version-cpu-stress-test.yaml
+++ b/python/llm/test/benchmark/stable-version-cpu-stress-test.yaml
@@ -1,10 +1,10 @@
 repo_id:
   - 'meta-llama/Llama-2-7b-chat-hf'
   - 'meta-llama/Llama-2-13b-chat-hf'
-#  - 'THUDM/chatglm2-6b'
-#  - 'THUDM/chatglm3-6b'
-#  - 'baichuan-inc/Baichuan2-7B-Chat'
-#  - 'baichuan-inc/Baichuan2-13B-Chat'
+  - 'THUDM/chatglm2-6b'
+  - 'THUDM/chatglm3-6b'
+  - 'baichuan-inc/Baichuan2-7B-Chat'
+  - 'baichuan-inc/Baichuan2-13B-Chat'
   - 'Qwen/Qwen-14B-Chat'
 local_model_hub: '/mnt/disk1/models'
 warm_up: 3

From 4f98a3887b25831ddd3870c67c492ae91b52e252 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Tue, 7 May 2024 15:27:18 -0700
Subject: [PATCH 45/57] update

---
 .../stable-version-arc-perf-test-fp8.yaml        | 16 ++++++++--------
 .../stable-version-arc-perf-test-sym_int4.yaml   | 16 ++++++++--------
 .../stable-version-arc-stress-test-fp8.yaml      |  2 +-
 3 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/python/llm/test/benchmark/stable-version-arc-perf-test-fp8.yaml b/python/llm/test/benchmark/stable-version-arc-perf-test-fp8.yaml
index 0cd4a9b2fe3..00884dbe21d 100644
--- a/python/llm/test/benchmark/stable-version-arc-perf-test-fp8.yaml
+++ b/python/llm/test/benchmark/stable-version-arc-perf-test-fp8.yaml
@@ -23,14 +23,14 @@ exclude:
   - 'meta-llama/Llama-2-7b-chat-hf:512:8'
   - 'meta-llama/Llama-2-7b-chat-hf:1024:8'
   - 'meta-llama/Llama-2-7b-chat-hf:2048:8'
-#  - 'THUDM/chatglm2-6b:2048:8'
-#  - 'THUDM/chatglm3-6b:2048:8'
-#  - 'baichuan-inc/Baichuan2-7B-Chat:2048:2'
-#  - 'baichuan-inc/Baichuan2-7B-Chat:1024:4'
-#  - 'baichuan-inc/Baichuan2-7B-Chat:2048:4'
-#  - 'baichuan-inc/Baichuan2-7B-Chat:512:8'
-#  - 'baichuan-inc/Baichuan2-7B-Chat:1024:8'
-#  - 'baichuan-inc/Baichuan2-7B-Chat:2048:8'
+  - 'THUDM/chatglm2-6b:2048:8'
+  - 'THUDM/chatglm3-6b:2048:8'
+  - 'baichuan-inc/Baichuan2-7B-Chat:2048:2'
+  - 'baichuan-inc/Baichuan2-7B-Chat:1024:4'
+  - 'baichuan-inc/Baichuan2-7B-Chat:2048:4'
+  - 'baichuan-inc/Baichuan2-7B-Chat:512:8'
+  - 'baichuan-inc/Baichuan2-7B-Chat:1024:8'
+  - 'baichuan-inc/Baichuan2-7B-Chat:2048:8'
   - 'Qwen/Qwen-7B-Chat:2048:1'
   - 'Qwen/Qwen-7B-Chat:1024:2'
   - 'Qwen/Qwen-7B-Chat:2048:2'
diff --git a/python/llm/test/benchmark/stable-version-arc-perf-test-sym_int4.yaml b/python/llm/test/benchmark/stable-version-arc-perf-test-sym_int4.yaml
index b1a2b2536c0..cb9f7b30e9c 100644
--- a/python/llm/test/benchmark/stable-version-arc-perf-test-sym_int4.yaml
+++ b/python/llm/test/benchmark/stable-version-arc-perf-test-sym_int4.yaml
@@ -22,14 +22,14 @@ exclude:
   - 'meta-llama/Llama-2-7b-chat-hf:2048:4'
   - 'meta-llama/Llama-2-7b-chat-hf:1024:8'
   - 'meta-llama/Llama-2-7b-chat-hf:2048:8'
-#  - 'THUDM/chatglm2-6b:2048:8'
-#  - 'THUDM/chatglm3-6b:2048:8'
-#  - 'baichuan-inc/Baichuan2-7B-Chat:2048:2'
-#  - 'baichuan-inc/Baichuan2-7B-Chat:1024:4'
-#  - 'baichuan-inc/Baichuan2-7B-Chat:2048:4'
-#  - 'baichuan-inc/Baichuan2-7B-Chat:512:8'
-#  - 'baichuan-inc/Baichuan2-7B-Chat:1024:8'
-#  - 'baichuan-inc/Baichuan2-7B-Chat:2048:8'
+  - 'THUDM/chatglm2-6b:2048:8'
+  - 'THUDM/chatglm3-6b:2048:8'
+  - 'baichuan-inc/Baichuan2-7B-Chat:2048:2'
+  - 'baichuan-inc/Baichuan2-7B-Chat:1024:4'
+  - 'baichuan-inc/Baichuan2-7B-Chat:2048:4'
+  - 'baichuan-inc/Baichuan2-7B-Chat:512:8'
+  - 'baichuan-inc/Baichuan2-7B-Chat:1024:8'
+  - 'baichuan-inc/Baichuan2-7B-Chat:2048:8'
   - 'Qwen/Qwen-7B-Chat:2048:2'
   - 'Qwen/Qwen-7B-Chat:1024:4'
   - 'Qwen/Qwen-7B-Chat:2048:4'
diff --git a/python/llm/test/benchmark/stable-version-arc-stress-test-fp8.yaml b/python/llm/test/benchmark/stable-version-arc-stress-test-fp8.yaml
index 80a67d71e3f..bc64ad92305 100644
--- a/python/llm/test/benchmark/stable-version-arc-stress-test-fp8.yaml
+++ b/python/llm/test/benchmark/stable-version-arc-stress-test-fp8.yaml
@@ -17,5 +17,5 @@ test_api:
   - "transformer_int4_gpu"  # on Intel GPU
 cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
 exclude:
-#  - 'baichuan-inc/Baichuan2-7B-Chat:2048'
+  - 'baichuan-inc/Baichuan2-7B-Chat:2048'
   - 'Qwen/Qwen-7B-Chat:2048'
\ No newline at end of file

From c64ec3389dd2e8f86b06d7b544b7dfaad49dddc9 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Tue, 7 May 2024 16:45:38 -0700
Subject: [PATCH 46/57] update

---
 .github/workflows/llm-ppl-evaluation.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/llm-ppl-evaluation.yml b/.github/workflows/llm-ppl-evaluation.yml
index ceb53f8b378..faf24ea3607 100644
--- a/.github/workflows/llm-ppl-evaluation.yml
+++ b/.github/workflows/llm-ppl-evaluation.yml
@@ -71,7 +71,7 @@ jobs:
         if: ${{github.event_name == 'pull_request'}}
         env:
           PR_MATRIX_SEQ_LEN: '["512"]'
-          PR_MATRIX_MODEL_NAME: '["Llama-2-7b-chat-hf", "chatglm3-6b", "chatglm3-6b", "Baichuan2-7B-Chat","mpt-7b-chat"]'
+          PR_MATRIX_MODEL_NAME: '["Llama-2-7b-chat-hf", "chatglm3-6b", "chatglm2-6b", "Baichuan2-7B-Chat","mpt-7b-chat"]'
           PR_MATRIX_PRECISION: '["sym_int4"]'
           PR_LABELS: '["self-hosted", "llm", "temp-arc01"]'
         run: |

From 9c9e92d506053a106a5026c8af2a280bc73e0e4c Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Tue, 7 May 2024 22:48:09 -0700
Subject: [PATCH 47/57] update

---
 python/llm/test/benchmark/arc-perf-test.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/llm/test/benchmark/arc-perf-test.yaml b/python/llm/test/benchmark/arc-perf-test.yaml
index 4f1d6159dc9..df4b1465625 100644
--- a/python/llm/test/benchmark/arc-perf-test.yaml
+++ b/python/llm/test/benchmark/arc-perf-test.yaml
@@ -35,3 +35,4 @@ exclude:
 #  - 'fnlp/moss-moon-003-sft-4bit:1024'
 #  - 'fnlp/moss-moon-003-sft-4bit:2048'
   - 'bigscience/bloomz-7b1:2048'
+  - 'mosaicml/mpt-7b-chat:2048'

From 4b04c45b38733f3d98e3ce7e0084f650f13143b6 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Wed, 8 May 2024 11:34:29 -0700
Subject: [PATCH 48/57] update

---
 .github/workflows/llm-ppl-evaluation.yml      |  7 +++--
 .../llm/test/benchmark/arc-perf-test-436.yaml | 22 ---------------
 python/llm/test/benchmark/arc-perf-test.yaml  |  2 +-
 .../llm/test/benchmark/cpu-perf-test-436.yaml | 26 ------------------
 .../stable-version-arc-perf-test-fp8-436.yaml | 27 -------------------
 ...le-version-arc-perf-test-sym_int4-436.yaml | 27 -------------------
 ...table-version-arc-stress-test-fp8-436.yaml | 18 -------------
 ...-version-arc-stress-test-sym_int4-436.yaml | 16 -----------
 .../stable-version-cpu-perf-test-436.yaml     | 26 ------------------
 9 files changed, 4 insertions(+), 167 deletions(-)
 delete mode 100644 python/llm/test/benchmark/arc-perf-test-436.yaml
 delete mode 100644 python/llm/test/benchmark/cpu-perf-test-436.yaml
 delete mode 100644 python/llm/test/benchmark/stable-version-arc-perf-test-fp8-436.yaml
 delete mode 100644 python/llm/test/benchmark/stable-version-arc-perf-test-sym_int4-436.yaml
 delete mode 100644 python/llm/test/benchmark/stable-version-arc-stress-test-fp8-436.yaml
 delete mode 100644 python/llm/test/benchmark/stable-version-arc-stress-test-sym_int4-436.yaml
 delete mode 100644 python/llm/test/benchmark/stable-version-cpu-perf-test-436.yaml

diff --git a/.github/workflows/llm-ppl-evaluation.yml b/.github/workflows/llm-ppl-evaluation.yml
index faf24ea3607..4efc6f2f795 100644
--- a/.github/workflows/llm-ppl-evaluation.yml
+++ b/.github/workflows/llm-ppl-evaluation.yml
@@ -71,7 +71,7 @@ jobs:
         if: ${{github.event_name == 'pull_request'}}
         env:
           PR_MATRIX_SEQ_LEN: '["512"]'
-          PR_MATRIX_MODEL_NAME: '["Llama-2-7b-chat-hf", "chatglm3-6b", "chatglm2-6b", "Baichuan2-7B-Chat","mpt-7b-chat"]'
+          PR_MATRIX_MODEL_NAME: '["Llama-2-7b-chat-hf", "chatglm3-6b"]'
           PR_MATRIX_PRECISION: '["sym_int4"]'
           PR_LABELS: '["self-hosted", "llm", "temp-arc01"]'
         run: |
@@ -143,9 +143,8 @@ jobs:
         run: |
           echo "MODEL_PATH=${ORIGIN_DIR}/${{ matrix.model_name }}/" >> "$GITHUB_ENV"
           MODEL_PATH=${ORIGIN_DIR}/${{ matrix.model_name }}/
-          #wget -r -nH -nc --no-verbose --cut-dirs=1 ${LLM_FTP_URL}/llm/${{ matrix.model_name }} -P ${ORIGIN_DIR}
-          wget -r -nH --no-verbose --cut-dirs=1 ${LLM_FTP_URL}/llm/${{ matrix.model_name }} -P ${ORIGIN_DIR}
-          
+          wget -r -nH -nc --no-verbose --cut-dirs=1 ${LLM_FTP_URL}/llm/${{ matrix.model_name }} -P ${ORIGIN_DIR}
+
       - name: Upgrade packages
         shell: bash
         run: |
diff --git a/python/llm/test/benchmark/arc-perf-test-436.yaml b/python/llm/test/benchmark/arc-perf-test-436.yaml
deleted file mode 100644
index f619987bfc9..00000000000
--- a/python/llm/test/benchmark/arc-perf-test-436.yaml
+++ /dev/null
@@ -1,22 +0,0 @@
-repo_id:
-  - 'THUDM/chatglm2-6b'
-  - 'mosaicml/mpt-7b-chat'
-  - 'baichuan-inc/Baichuan2-7B-Chat'
-  - 'baichuan-inc/Baichuan2-13B-Chat-4bit'
-  - 'internlm/internlm-chat-7b'
-local_model_hub: '/mnt/disk0/models'
-warm_up: 1
-num_trials: 3
-num_beams: 1 # default to greedy search
-low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
-batch_size: 1 # default to 1
-in_out_pairs:
-  - '32-32'
-  - '1024-128'
-  - '2048-256'
-test_api:
-  - "transformer_int4_gpu"  # on Intel GPU
-cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
-exclude:
-  - 'baichuan-inc/Baichuan2-13B-Chat-4bit:2048'
-  - 'mosaicml/mpt-7b-chat:2048'
diff --git a/python/llm/test/benchmark/arc-perf-test.yaml b/python/llm/test/benchmark/arc-perf-test.yaml
index df4b1465625..895588ce4e4 100644
--- a/python/llm/test/benchmark/arc-perf-test.yaml
+++ b/python/llm/test/benchmark/arc-perf-test.yaml
@@ -34,5 +34,5 @@ cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu w
 exclude:
 #  - 'fnlp/moss-moon-003-sft-4bit:1024'
 #  - 'fnlp/moss-moon-003-sft-4bit:2048'
+  - 'baichuan-inc/Baichuan2-13B-Chat-4bit:2048'
   - 'bigscience/bloomz-7b1:2048'
-  - 'mosaicml/mpt-7b-chat:2048'
diff --git a/python/llm/test/benchmark/cpu-perf-test-436.yaml b/python/llm/test/benchmark/cpu-perf-test-436.yaml
deleted file mode 100644
index c7ba0c1ded8..00000000000
--- a/python/llm/test/benchmark/cpu-perf-test-436.yaml
+++ /dev/null
@@ -1,26 +0,0 @@
-repo_id:
-  - 'THUDM/chatglm2-6b'
-  - 'THUDM/chatglm3-6b'
-  - 'baichuan-inc/Baichuan2-7B-Chat'
-  - 'baichuan-inc/Baichuan2-13B-Chat'
-local_model_hub: '/mnt/disk1/models/updated_for_4.36'
-warm_up: 1
-num_trials: 3
-num_beams: 1 # default to greedy search
-low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
-batch_size: 1 # default to 1
-in_out_pairs:
-  - '32-32'
-  - '1024-128'
-  - '2048-256'
-test_api:
-  - "transformer_int4"
-  # - "native_int4"
-  # - "optimize_model"
-  # - "pytorch_autocast_bf16"
-  # - "ipex_fp16_gpu" # on Intel GPU
-  # - "transformer_int4_gpu"  # on Intel GPU
-  # - "optimize_model_gpu"  # on Intel GPU
-  # - "deepspeed_transformer_int4_cpu" # on Intel SPR Server
-  # - "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
-cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
diff --git a/python/llm/test/benchmark/stable-version-arc-perf-test-fp8-436.yaml b/python/llm/test/benchmark/stable-version-arc-perf-test-fp8-436.yaml
deleted file mode 100644
index 0a0795da086..00000000000
--- a/python/llm/test/benchmark/stable-version-arc-perf-test-fp8-436.yaml
+++ /dev/null
@@ -1,27 +0,0 @@
-repo_id:
-  - 'THUDM/chatglm2-6b'
-  - 'THUDM/chatglm3-6b'
-  - 'baichuan-inc/Baichuan2-7B-Chat'
-local_model_hub: '/mnt/disk0/models'
-warm_up: 1
-num_trials: 3
-num_beams: 1 # default to greedy search
-low_bit: 'fp8' # default to use 'sym_int4' (i.e. symmetric int4)
-batch_size: 1 # default to 1
-in_out_pairs:
-  - '32-32'
-  - '512-256'
-  - '1024-128'
-  - '2048-256'
-test_api:
-  - "transformer_int4_gpu"  # on Intel GPU
-cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
-exclude:
-  - 'THUDM/chatglm2-6b:2048:8'
-  - 'THUDM/chatglm3-6b:2048:8'
-  - 'baichuan-inc/Baichuan2-7B-Chat:2048:2'
-  - 'baichuan-inc/Baichuan2-7B-Chat:1024:4'
-  - 'baichuan-inc/Baichuan2-7B-Chat:2048:4'
-  - 'baichuan-inc/Baichuan2-7B-Chat:512:8'
-  - 'baichuan-inc/Baichuan2-7B-Chat:1024:8'
-  - 'baichuan-inc/Baichuan2-7B-Chat:2048:8'
diff --git a/python/llm/test/benchmark/stable-version-arc-perf-test-sym_int4-436.yaml b/python/llm/test/benchmark/stable-version-arc-perf-test-sym_int4-436.yaml
deleted file mode 100644
index 549b46625fc..00000000000
--- a/python/llm/test/benchmark/stable-version-arc-perf-test-sym_int4-436.yaml
+++ /dev/null
@@ -1,27 +0,0 @@
-repo_id:
-  - 'THUDM/chatglm2-6b'
-  - 'THUDM/chatglm3-6b'
-  - 'baichuan-inc/Baichuan2-7B-Chat'
-local_model_hub: '/mnt/disk0/models'
-warm_up: 1
-num_trials: 3
-num_beams: 1 # default to greedy search
-low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
-batch_size: 1 # default to 1
-in_out_pairs:
-  - '32-32'
-  - '512-256'
-  - '1024-128'
-  - '2048-256'
-test_api:
-  - "transformer_int4_gpu"  # on Intel GPU
-cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
-exclude:
-  - 'THUDM/chatglm2-6b:2048:8'
-  - 'THUDM/chatglm3-6b:2048:8'
-  - 'baichuan-inc/Baichuan2-7B-Chat:2048:2'
-  - 'baichuan-inc/Baichuan2-7B-Chat:1024:4'
-  - 'baichuan-inc/Baichuan2-7B-Chat:2048:4'
-  - 'baichuan-inc/Baichuan2-7B-Chat:512:8'
-  - 'baichuan-inc/Baichuan2-7B-Chat:1024:8'
-  - 'baichuan-inc/Baichuan2-7B-Chat:2048:8'
diff --git a/python/llm/test/benchmark/stable-version-arc-stress-test-fp8-436.yaml b/python/llm/test/benchmark/stable-version-arc-stress-test-fp8-436.yaml
deleted file mode 100644
index e887328e435..00000000000
--- a/python/llm/test/benchmark/stable-version-arc-stress-test-fp8-436.yaml
+++ /dev/null
@@ -1,18 +0,0 @@
-repo_id:
-  - 'THUDM/chatglm2-6b'
-  - 'THUDM/chatglm3-6b'
-  - 'baichuan-inc/Baichuan2-7B-Chat'
-local_model_hub: '/mnt/disk0/models'
-warm_up: 10
-num_trials: 100
-num_beams: 1 # default to greedy search
-low_bit: 'fp8' # default to use 'sym_int4' (i.e. symmetric int4)
-batch_size: 1 # default to 1
-in_out_pairs:
-  - '1024-512'
-  - '2048-512'
-test_api:
-  - "transformer_int4_gpu"  # on Intel GPU
-cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
-exclude:
-  - 'baichuan-inc/Baichuan2-7B-Chat:2048'
diff --git a/python/llm/test/benchmark/stable-version-arc-stress-test-sym_int4-436.yaml b/python/llm/test/benchmark/stable-version-arc-stress-test-sym_int4-436.yaml
deleted file mode 100644
index ef0b6324eb5..00000000000
--- a/python/llm/test/benchmark/stable-version-arc-stress-test-sym_int4-436.yaml
+++ /dev/null
@@ -1,16 +0,0 @@
-repo_id:
-  - 'THUDM/chatglm2-6b'
-  - 'THUDM/chatglm3-6b'
-  - 'baichuan-inc/Baichuan2-7B-Chat'
-  local_model_hub: '/mnt/disk0/models'
-warm_up: 10
-num_trials: 100
-num_beams: 1 # default to greedy search
-low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
-batch_size: 1 # default to 1
-in_out_pairs:
-  - '1024-512'
-  - '2048-512'
-test_api:
-  - "transformer_int4_gpu"  # on Intel GPU
-cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
diff --git a/python/llm/test/benchmark/stable-version-cpu-perf-test-436.yaml b/python/llm/test/benchmark/stable-version-cpu-perf-test-436.yaml
deleted file mode 100644
index c7ba0c1ded8..00000000000
--- a/python/llm/test/benchmark/stable-version-cpu-perf-test-436.yaml
+++ /dev/null
@@ -1,26 +0,0 @@
-repo_id:
-  - 'THUDM/chatglm2-6b'
-  - 'THUDM/chatglm3-6b'
-  - 'baichuan-inc/Baichuan2-7B-Chat'
-  - 'baichuan-inc/Baichuan2-13B-Chat'
-local_model_hub: '/mnt/disk1/models/updated_for_4.36'
-warm_up: 1
-num_trials: 3
-num_beams: 1 # default to greedy search
-low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
-batch_size: 1 # default to 1
-in_out_pairs:
-  - '32-32'
-  - '1024-128'
-  - '2048-256'
-test_api:
-  - "transformer_int4"
-  # - "native_int4"
-  # - "optimize_model"
-  # - "pytorch_autocast_bf16"
-  # - "ipex_fp16_gpu" # on Intel GPU
-  # - "transformer_int4_gpu"  # on Intel GPU
-  # - "optimize_model_gpu"  # on Intel GPU
-  # - "deepspeed_transformer_int4_cpu" # on Intel SPR Server
-  # - "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
-cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)

From a533ae8fcf937f04ee8d90cbf76c557cd8585857 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Wed, 8 May 2024 11:58:30 -0700
Subject: [PATCH 49/57] delete

---
 .github/workflows/llm-c-evaluation.yml        |   2 +-
 .github/workflows/llm-harness-evaluation.yml  |   6 -
 .github/workflows/llm-ppl-evaluation.yml      |   7 +-
 .github/workflows/llm_performance_tests.yml   | 137 +-----------------
 .github/workflows/llm_unit_tests.yml          |   4 +-
 .../benchmark/arc-perf-transformers-434.yaml  |  16 --
 .../test/run-llm-inference-tests-gpu-434.sh   |  30 ----
 7 files changed, 4 insertions(+), 198 deletions(-)
 delete mode 100644 python/llm/test/benchmark/arc-perf-transformers-434.yaml
 delete mode 100644 python/llm/test/run-llm-inference-tests-gpu-434.sh

diff --git a/.github/workflows/llm-c-evaluation.yml b/.github/workflows/llm-c-evaluation.yml
index 464579fcc26..9ca18276c75 100644
--- a/.github/workflows/llm-c-evaluation.yml
+++ b/.github/workflows/llm-c-evaluation.yml
@@ -16,7 +16,7 @@ on:
     branches: [main]
     paths:
       - ".github/workflows/llm-c-evaluation.yml"
-  ## Allows you to run this workflow manually from the Actions tab
+  # Allows you to run this workflow manually from the Actions tab
   workflow_dispatch:
     inputs:
       model_name:
diff --git a/.github/workflows/llm-harness-evaluation.yml b/.github/workflows/llm-harness-evaluation.yml
index 29146ca116e..e3e1993a9c0 100644
--- a/.github/workflows/llm-harness-evaluation.yml
+++ b/.github/workflows/llm-harness-evaluation.yml
@@ -164,12 +164,6 @@ jobs:
         shell: bash
         run: |
           pip install --upgrade datasets==2.14.6 
-          #if [ "${{ matrix.model_name }}" = "Mistral-7B-v0.1" ]; then
-          #  pip install --upgrade transformers==4.36
-          #else
-          #  pip install --upgrade transformers==4.31
-          #fi
-      
 
       - name: Run harness
         shell: bash
diff --git a/.github/workflows/llm-ppl-evaluation.yml b/.github/workflows/llm-ppl-evaluation.yml
index 4efc6f2f795..7c2037ff318 100644
--- a/.github/workflows/llm-ppl-evaluation.yml
+++ b/.github/workflows/llm-ppl-evaluation.yml
@@ -148,12 +148,7 @@ jobs:
       - name: Upgrade packages
         shell: bash
         run: |
-          pip install --upgrade datasets==2.14.6 
-          #if [ "${{ matrix.model_name }}" = "Mistral-7B-v0.1" ]; then
-          #  pip install --upgrade transformers==4.36
-          #else
-          #  pip install --upgrade transformers==4.31
-          #fi
+          pip install --upgrade datasets==2.14.6
 
       - name: Run perplexity
         shell: bash
diff --git a/.github/workflows/llm_performance_tests.yml b/.github/workflows/llm_performance_tests.yml
index 55a6bcb67d6..3bb6ab6950b 100644
--- a/.github/workflows/llm_performance_tests.yml
+++ b/.github/workflows/llm_performance_tests.yml
@@ -87,7 +87,7 @@ jobs:
           source /opt/intel/oneapi/setvars.sh
           bash python/llm/test/run-llm-install-tests.sh
 
-      - name: Test on xpu(transformers==4.36.0)
+      - name: Test on xpu(transformers==4.36.2)
         shell: bash
         run: |
           date_for_test_version=$(date -d yesterday +%Y-%m-%d)
@@ -103,20 +103,6 @@ jobs:
           sed -i 's/{today}/{today}_test1/g' run.py
           python run.py
 
-      #- name: Test on xpu(transformers==4.34.0)
-      #  shell: bash
-       # run: |
-      #   source /opt/intel/oneapi/setvars.sh
-      #    export USE_XETLA=OFF
-      #    export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
-          # upgrade transformers for model Mistral-7B-v0.1
-      #    python -m pip install transformers==4.34.0
-      #    cp python/llm/test/benchmark/arc-perf-transformers-434.yaml python/llm/dev/benchmark/all-in-one/config.yaml
-      #    cd python/llm/dev/benchmark/all-in-one
-          # change csv name
-      #    sed -i 's/test1/test2/g' run.py
-      #    python run.py
-
       - name: Test on xpu(transformers==4.37.0)
         shell: bash
         run: |
@@ -151,7 +137,6 @@ jobs:
           cd python/llm/dev/benchmark/all-in-one
           python ../../../test/benchmark/check_results.py -c test1 -y ../../../test/benchmark/arc-perf-test.yaml
           python ../../../test/benchmark/check_results.py -c test2 -y ../../../test/benchmark/arc-perf-transformers-437.yaml
-          # python ../../../test/benchmark/check_results.py -c test2 -y ../../../test/benchmark/arc-perf-transformers-434.yaml
           find . -name "*test*.csv" -delete
           if [ ${{ github.event_name }} == "schedule" ] || [ ${{ github.event_name }} == "workflow_dispatch" ]; then
             curl -T ./*.csv ${LLM_FTP_URL}/llm/nightly_perf/gpu/
@@ -429,30 +414,6 @@ jobs:
 
           call conda deactivate
 
-      #- name: Prepare igpu perf test for Mistral (32-32)
-      #  shell: bash
-      #  run: |
-      #    sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
-      #    sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/32-32_434.yaml
-
-      #- name: Test on igpu for Mistral (32-32)
-      #  shell: cmd
-      #  run: |
-      #    call conda activate igpu-perf
-      #    pip install transformers==4.34.0
-
-      #    call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat"
-      #    set SYCL_CACHE_PERSISTENT=1
-      #    set BIGDL_LLM_XMX_DISABLED=1
-
-      #    cd python\llm\dev\benchmark\all-in-one
-      #    move ..\..\..\test\benchmark\igpu-perf\32-32_434.yaml config.yaml
-      #    set PYTHONIOENCODING=utf-8
-      #    python run.py >> %CSV_SAVE_PATH%\32-32\log\%LOG_FILE% 2>&1
-      #    if %ERRORLEVEL% neq 0 (exit /b 1)
-
-      #    call conda deactivate
-
       - name: Prepare igpu perf test for Qwen1.5 (32-32)
         shell: bash
         run: |
@@ -523,30 +484,6 @@ jobs:
 
           call conda deactivate
 
-      #- name: Prepare igpu perf test for Mistral (1024-128)
-      #  shell: bash
-      #  run: |
-      #    sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
-      #    sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_434.yaml
-
-      #- name: Test on igpu for Mistral (1024-128)
-      #  shell: cmd
-      #  run: |
-      #    call conda activate igpu-perf
-      #    pip install transformers==4.34.0
-
-      #    call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat"
-      #    set SYCL_CACHE_PERSISTENT=1
-      #    set BIGDL_LLM_XMX_DISABLED=1
-
-      #    cd python\llm\dev\benchmark\all-in-one
-      #    move ..\..\..\test\benchmark\igpu-perf\1024-128_434.yaml config.yaml
-      #    set PYTHONIOENCODING=utf-8
-      #    python run.py >> %CSV_SAVE_PATH%\1024-128\log\%LOG_FILE% 2>&1
-      #    if %ERRORLEVEL% neq 0 (exit /b 1)
-
-      #    call conda deactivate
-
       - name: Prepare igpu perf test for Qwen 1.5 (1024-128)
         shell: bash
         run: |
@@ -616,30 +553,6 @@ jobs:
 
           call conda deactivate
 
-      #- name: Prepare igpu perf test for Mistral (2048-256)
-      #  shell: bash
-      #  run: |
-      #    sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
-      #    sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_434.yaml
-
-      #- name: Test on igpu for Mistral (2048-256)
-      #  shell: cmd
-      #  run: |
-      #    call conda activate igpu-perf
-      #    pip install transformers==4.34.0
-
-      #    call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat"
-      #    set SYCL_CACHE_PERSISTENT=1
-      #    set BIGDL_LLM_XMX_DISABLED=1
-
-      #    cd python\llm\dev\benchmark\all-in-one
-      #    move ..\..\..\test\benchmark\igpu-perf\2048-256_434.yaml config.yaml
-      #    set PYTHONIOENCODING=utf-8
-      #    python run.py >> %CSV_SAVE_PATH%\2048-256\log\%LOG_FILE% 2>&1
-      #    if %ERRORLEVEL% neq 0 (exit /b 1)
-
-      #    call conda deactivate
-
       - name: Prepare igpu perf test for Qwen 1.5 (2048-256)
         shell: bash
         run: |
@@ -709,30 +622,6 @@ jobs:
 
           call conda deactivate
 
-      #- name: Prepare igpu perf test for Mistral (load_low_bit 1024-128)
-      #  shell: bash
-      #  run: |
-      #    sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
-      #    sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit_434.yaml
-
-      #- name: Test on igpu for Mistral (load_low_bit 1024-128)
-      #  shell: cmd
-      #  run: |
-      #    call conda activate igpu-perf
-      #    pip install transformers==4.34.0
-
-      #    call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat"
-      #    set SYCL_CACHE_PERSISTENT=1
-      #    set BIGDL_LLM_XMX_DISABLED=1
-
-      #    cd python\llm\dev\benchmark\all-in-one
-      #    move ..\..\..\test\benchmark\igpu-perf\1024-128_loadlowbit_434.yaml config.yaml
-      #    set PYTHONIOENCODING=utf-8
-      #    python run.py >> %CSV_SAVE_PATH%\1024-128_loadlowbit\log\%LOG_FILE% 2>&1
-      #    if %ERRORLEVEL% neq 0 (exit /b 1)
-
-      #    call conda deactivate
-
       - name: Prepare igpu perf test for Qwen 1.5 (load_low_bit 1024-128)
         shell: bash
         run: |
@@ -800,30 +689,6 @@ jobs:
 
           call conda deactivate
 
-      #- name: Prepare igpu perf test for Mistral (int4+fp16 1024-128)
-      #  shell: bash
-      #  run: |
-      #    sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
-      #    sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_434.yaml
-
-      #- name: Test on igpu for Mistral (int4+fp16 1024-128)
-      #  shell: cmd
-      #  run: |
-      #    call conda activate igpu-perf
-      #    pip install transformers==4.34.0
-
-      #    call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat"
-      #    set SYCL_CACHE_PERSISTENT=1
-      #    set BIGDL_LLM_XMX_DISABLED=1
-
-      #    cd python\llm\dev\benchmark\all-in-one
-      #    move ..\..\..\test\benchmark\igpu-perf\1024-128_int4_fp16_434.yaml config.yaml
-      #    set PYTHONIOENCODING=utf-8
-      #    python run.py >> %CSV_SAVE_PATH%\1024-128_int4_fp16\log\%LOG_FILE% 2>&1
-      #    if %ERRORLEVEL% neq 0 (exit /b 1)
-
-      #    call conda deactivate
-
       - name: Prepare igpu perf test for Qwen 1.5 (int4+fp16 1024-128)
         shell: bash
         run: |
diff --git a/.github/workflows/llm_unit_tests.yml b/.github/workflows/llm_unit_tests.yml
index 583cbf2c31e..7a4d4bf6d6c 100644
--- a/.github/workflows/llm_unit_tests.yml
+++ b/.github/workflows/llm_unit_tests.yml
@@ -367,9 +367,7 @@ jobs:
             source /home/arda/intel/oneapi/setvars.sh
           fi
           python -m pip install datasets librosa soundfile einops tiktoken transformers_stream_generator
-          bash python/llm/test/run-llm-inference-tests-gpu.sh
-          # python -m pip install transformers==4.34.0
-          # bash python/llm/test/run-llm-inference-tests-gpu-434.sh
+          bash python/llm/test/run-llm-inference-tests-gpu.sh4.sh
 
       - name: Run LLM example tests
         shell: bash
diff --git a/python/llm/test/benchmark/arc-perf-transformers-434.yaml b/python/llm/test/benchmark/arc-perf-transformers-434.yaml
deleted file mode 100644
index 1389e44ab5a..00000000000
--- a/python/llm/test/benchmark/arc-perf-transformers-434.yaml
+++ /dev/null
@@ -1,16 +0,0 @@
-# For the models that require transformers 4.34.0
-repo_id:
-  - 'mistralai/Mistral-7B-v0.1'
-local_model_hub: '/mnt/disk1/models'
-warm_up: 1
-num_trials: 3
-num_beams: 1 # default to greedy search
-low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
-batch_size: 1 # default to 1
-in_out_pairs:
-  - '32-32'
-  - '1024-128'
-  - '2048-256'
-test_api:
-  - "transformer_int4_gpu"  # on Intel GPU
-cpu_embedding: False # whether put embedding to CPU (only avaiable now for gpu win related test_api)
diff --git a/python/llm/test/run-llm-inference-tests-gpu-434.sh b/python/llm/test/run-llm-inference-tests-gpu-434.sh
deleted file mode 100644
index 91a1676ddf8..00000000000
--- a/python/llm/test/run-llm-inference-tests-gpu-434.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-
-export ANALYTICS_ZOO_ROOT=${ANALYTICS_ZOO_ROOT}
-export LLM_HOME=${ANALYTICS_ZOO_ROOT}/python/llm/src
-export LLM_INFERENCE_TEST_DIR=${ANALYTICS_ZOO_ROOT}/python/llm/test/inference_gpu
-
-export USE_XETLA=OFF
-export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
-export DEVICE='xpu'
-
-set -e
-
-echo "# Start testing inference"
-start=$(date "+%s")
-
-# if [ -z "$THREAD_NUM" ]; then
-#   THREAD_NUM=2
-# fi
-# export OMP_NUM_THREADS=$THREAD_NUM
-export BIGDL_LLM_XMX_DISABLED=1
-pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_attention.py -v -s -k "Mistral"
-pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_mlp.py -v -s -k "Mistral"
-pytest ${LLM_INFERENCE_TEST_DIR}/test_transformers_api_RMSNorm.py -v -s -k "Mistral"
-unset BIGDL_LLM_XMX_DISABLED
-
-now=$(date "+%s")
-time=$((now-start))
-
-echo "Bigdl-llm gpu inference tests for transformers 4.34.0 finished"
-echo "Time used:$time seconds"

From 4af144535ed413e8ec50785fabc6883f38aaf0b3 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Wed, 8 May 2024 12:23:37 -0700
Subject: [PATCH 50/57] update

---
 .github/workflows/llm_performance_tests.yml | 3 +--
 .github/workflows/llm_unit_tests.yml        | 8 ++------
 python/llm/setup.py                         | 5 ++---
 3 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/llm_performance_tests.yml b/.github/workflows/llm_performance_tests.yml
index 3bb6ab6950b..17e589e5d4b 100644
--- a/.github/workflows/llm_performance_tests.yml
+++ b/.github/workflows/llm_performance_tests.yml
@@ -331,7 +331,7 @@ jobs:
 
           pip install --pre --upgrade %whl_name%[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
           if %ERRORLEVEL% neq 0 (exit /b 1)
-          pip install transformers==4.36.2
+          # pip install transformers==4.36.2
           pip list
 
           call conda deactivate
@@ -359,7 +359,6 @@ jobs:
       #      echo "Did not install ipex-llm with excepted version %TEST_VERSION_DATE%"
       #      exit /b 1
       #    )
-      #    pip install transformers==4.36.2
       #    pip list
 
       #    call conda deactivate
diff --git a/.github/workflows/llm_unit_tests.yml b/.github/workflows/llm_unit_tests.yml
index 7a4d4bf6d6c..8b6c1426af1 100644
--- a/.github/workflows/llm_unit_tests.yml
+++ b/.github/workflows/llm_unit_tests.yml
@@ -210,12 +210,7 @@ jobs:
       - name: Run LLM cli test (Windows)
         if: runner.os == 'Windows' 
         uses: ./.github/actions/llm/cli-test-windows
-      # - name: Run LLM inference test
-        # shell: bash
-        # run: |
-          # python -m pip install einops datasets librosa openai-whisper
-          # bash python/llm/test/run-llm-inference-tests.sh
-      - name: Run LLM inference test for 4.36
+      - name: Run LLM inference test
         shell: bash
         run: |
           python -m pip install einops datasets librosa openai-whisper
@@ -239,6 +234,7 @@ jobs:
         shell: bash
         run: |
           pip uninstall sentence-transformers -y || true
+
   llm-unit-test-on-arc:
     needs: [setup-python-version, llm-cpp-build]
     strategy:
diff --git a/python/llm/setup.py b/python/llm/setup.py
index ceadccdffa2..cc94a0e22e5 100644
--- a/python/llm/setup.py
+++ b/python/llm/setup.py
@@ -52,7 +52,7 @@
 libs_dir = os.path.join(llm_home, "ipex_llm", "libs")
 CONVERT_DEP = ['numpy == 1.26.4', # lastet 2.0.0b1 will cause error
                'torch',
-               'transformers == 4.31.0', 'sentencepiece', 'tokenizers == 0.13.3',
+               'transformers == 4.36.2', 'sentencepiece', 'tokenizers == 0.13.3',
                # TODO: Support accelerate 0.22.0
                'accelerate == 0.21.0', 'tabulate']
 SERVING_DEP = ['fschat[model_worker, webui] == 0.2.36', 'protobuf']
@@ -277,10 +277,9 @@ def setup_package():
 
     # Add internal requires for llama-index
     llama_index_requires = copy.deepcopy(all_requires)
-    for exclude_require in ['torch', 'transformers == 4.31.0', 'tokenizers == 0.13.3']:
+    for exclude_require in ['torch', 'tokenizers == 0.13.3']:
         llama_index_requires.remove(exclude_require)
     llama_index_requires += ["torch<2.2.0",
-                             "transformers>=4.34.0,<4.39.0",
                              "sentence-transformers~=2.6.1"]
 
 

From 1f9135368ac069870892bf2a4771555ce204f2e8 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Wed, 8 May 2024 12:25:12 -0700
Subject: [PATCH 51/57] update

---
 .github/actions/llm/setup-llm-env/action.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/actions/llm/setup-llm-env/action.yml b/.github/actions/llm/setup-llm-env/action.yml
index 4d0b7550f74..4b25ea0c401 100644
--- a/.github/actions/llm/setup-llm-env/action.yml
+++ b/.github/actions/llm/setup-llm-env/action.yml
@@ -42,4 +42,3 @@ runs:
           pip install pytest
           bash python/llm/test/run-llm-install-tests.sh
         fi
-        pip install transformers==4.36.2

From 069649186a90024fadfa15a2f3ea467b2ab5049b Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Wed, 8 May 2024 12:30:34 -0700
Subject: [PATCH 52/57] update

---
 .github/workflows/llm_unit_tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/llm_unit_tests.yml b/.github/workflows/llm_unit_tests.yml
index 8b6c1426af1..55bfcaad685 100644
--- a/.github/workflows/llm_unit_tests.yml
+++ b/.github/workflows/llm_unit_tests.yml
@@ -363,7 +363,7 @@ jobs:
             source /home/arda/intel/oneapi/setvars.sh
           fi
           python -m pip install datasets librosa soundfile einops tiktoken transformers_stream_generator
-          bash python/llm/test/run-llm-inference-tests-gpu.sh4.sh
+          bash python/llm/test/run-llm-inference-tests-gpu.sh
 
       - name: Run LLM example tests
         shell: bash

From 6922dc772fc9ef6d2480d11dcacf0b75130b1d5a Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Wed, 8 May 2024 15:30:12 -0700
Subject: [PATCH 53/57] update

---
 python/llm/setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/llm/setup.py b/python/llm/setup.py
index cc94a0e22e5..7dd88fd1635 100644
--- a/python/llm/setup.py
+++ b/python/llm/setup.py
@@ -52,7 +52,7 @@
 libs_dir = os.path.join(llm_home, "ipex_llm", "libs")
 CONVERT_DEP = ['numpy == 1.26.4', # lastet 2.0.0b1 will cause error
                'torch',
-               'transformers == 4.36.2', 'sentencepiece', 'tokenizers == 0.13.3',
+               'transformers == 4.36.2', 'sentencepiece', 'tokenizers == 0.15.2',
                # TODO: Support accelerate 0.22.0
                'accelerate == 0.21.0', 'tabulate']
 SERVING_DEP = ['fschat[model_worker, webui] == 0.2.36', 'protobuf']
@@ -277,7 +277,7 @@ def setup_package():
 
     # Add internal requires for llama-index
     llama_index_requires = copy.deepcopy(all_requires)
-    for exclude_require in ['torch', 'tokenizers == 0.13.3']:
+    for exclude_require in ['torch']:
         llama_index_requires.remove(exclude_require)
     llama_index_requires += ["torch<2.2.0",
                              "sentence-transformers~=2.6.1"]

From 641772688792e47205ec691fc7bfa6121984f6e2 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Tue, 14 May 2024 14:33:56 -0700
Subject: [PATCH 54/57] update

---
 .github/workflows/llm-whisper-evaluation.yml         |  2 +-
 .../llm_tests_for_stable_version_on_arc.yml          | 12 ++++++------
 .../llm_tests_for_stable_version_on_spr.yml          | 12 ++++++------
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/llm-whisper-evaluation.yml b/.github/workflows/llm-whisper-evaluation.yml
index c26d66e726f..e60eadbf1df 100644
--- a/.github/workflows/llm-whisper-evaluation.yml
+++ b/.github/workflows/llm-whisper-evaluation.yml
@@ -75,7 +75,7 @@ jobs:
             echo "runner=$runner" >> $GITHUB_OUTPUT
 
   llm-whisper-evaluation:
-    #if: ${{ github.event.schedule || github.event.inputs.artifact == 'llm-whisper-evaluation' || github.event.inputs.artifact == 'all' }} # please comment it for PR tests
+    # if: ${{ github.event.schedule || github.event.inputs.artifact == 'llm-whisper-evaluation' || github.event.inputs.artifact == 'all' }} # please comment it for PR tests
     needs: [llm-cpp-build, set-matrix] # please uncomment it for PR tests
     # needs: [set-matrix] # please comment it for PR tests
     strategy:
diff --git a/.github/workflows/llm_tests_for_stable_version_on_arc.yml b/.github/workflows/llm_tests_for_stable_version_on_arc.yml
index 297236af77d..1b8c48d972d 100644
--- a/.github/workflows/llm_tests_for_stable_version_on_arc.yml
+++ b/.github/workflows/llm_tests_for_stable_version_on_arc.yml
@@ -10,12 +10,12 @@ permissions:
 
 # Controls when the action will run.
 on:
-  pull_request:
-    branches: [main]
-    paths:
-      - ".github/workflows/llm_performance_tests.yml"
-      - "python/llm/test/benchmark/**"
-      - "python/llm/dev/benchmark/all-in-one/**"
+  # pull_request:
+  #   branches: [main]
+  #   paths:
+  #     - ".github/workflows/llm_performance_tests.yml"
+  #     - "python/llm/test/benchmark/**"
+  #     - "python/llm/dev/benchmark/all-in-one/**"
   workflow_dispatch:
   workflow_call:
 
diff --git a/.github/workflows/llm_tests_for_stable_version_on_spr.yml b/.github/workflows/llm_tests_for_stable_version_on_spr.yml
index 8a18984cf5a..ef02ce07524 100644
--- a/.github/workflows/llm_tests_for_stable_version_on_spr.yml
+++ b/.github/workflows/llm_tests_for_stable_version_on_spr.yml
@@ -10,12 +10,12 @@ permissions:
 
 # Controls when the action will run.
 on:
-  pull_request:
-    branches: [main]
-    paths:
-      - ".github/workflows/llm_performance_tests.yml"
-      - "python/llm/test/benchmark/**"
-      - "python/llm/dev/benchmark/all-in-one/**"
+  # pull_request:
+  #   branches: [main]
+  #   paths:
+  #    - ".github/workflows/llm_performance_tests.yml"
+  #    - "python/llm/test/benchmark/**"
+  #    - "python/llm/dev/benchmark/all-in-one/**"
   workflow_dispatch:
   workflow_call:
 

From ec2cd5e5a1241d078379d0b3195dd3a063d7fa78 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Wed, 15 May 2024 14:24:37 -0700
Subject: [PATCH 55/57] update

---
 .../llm/test/benchmark/igpu-perf/1024-128_434.yaml  | 13 -------------
 .../benchmark/igpu-perf/1024-128_int4_fp16_434.yaml | 13 -------------
 .../igpu-perf/1024-128_loadlowbit_434.yaml          | 13 -------------
 .../llm/test/benchmark/igpu-perf/2048-256_434.yaml  | 13 -------------
 python/llm/test/benchmark/igpu-perf/32-32_434.yaml  | 13 -------------
 5 files changed, 65 deletions(-)
 delete mode 100644 python/llm/test/benchmark/igpu-perf/1024-128_434.yaml
 delete mode 100644 python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_434.yaml
 delete mode 100644 python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit_434.yaml
 delete mode 100644 python/llm/test/benchmark/igpu-perf/2048-256_434.yaml
 delete mode 100644 python/llm/test/benchmark/igpu-perf/32-32_434.yaml

diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_434.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_434.yaml
deleted file mode 100644
index b4b1e9b7a4f..00000000000
--- a/python/llm/test/benchmark/igpu-perf/1024-128_434.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-repo_id:
-  - 'mistralai/Mistral-7B-Instruct-v0.1'
-local_model_hub: 'path to your local model hub'
-warm_up: 1
-num_trials: 3
-num_beams: 1 # default to greedy search
-low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
-batch_size: 1 # default to 1
-in_out_pairs:
-  - '1024-128'
-test_api:
-  - "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
-cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_434.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_434.yaml
deleted file mode 100644
index 57f0a3d3c8e..00000000000
--- a/python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_434.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-repo_id:
-  - 'mistralai/Mistral-7B-Instruct-v0.1'
-local_model_hub: 'path to your local model hub'
-warm_up: 1
-num_trials: 3
-num_beams: 1 # default to greedy search
-low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
-batch_size: 1 # default to 1
-in_out_pairs:
-  - '1024-128'
-test_api:
-  - "transformer_int4_fp16_gpu_win" # on Intel GPU for Windows, use fp16 for non-linear layer
-cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
diff --git a/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit_434.yaml b/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit_434.yaml
deleted file mode 100644
index 51453bd1b6a..00000000000
--- a/python/llm/test/benchmark/igpu-perf/1024-128_loadlowbit_434.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-repo_id:
-  - 'mistralai/Mistral-7B-Instruct-v0.1'
-local_model_hub: 'path to your local model hub'
-warm_up: 1
-num_trials: 3
-num_beams: 1 # default to greedy search
-low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
-batch_size: 1 # default to 1
-in_out_pairs:
-  - '1024-128'
-test_api:
-  - "transformer_int4_loadlowbit_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
-cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
diff --git a/python/llm/test/benchmark/igpu-perf/2048-256_434.yaml b/python/llm/test/benchmark/igpu-perf/2048-256_434.yaml
deleted file mode 100644
index b16e5493017..00000000000
--- a/python/llm/test/benchmark/igpu-perf/2048-256_434.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-repo_id:
-  - 'mistralai/Mistral-7B-Instruct-v0.1'
-local_model_hub: 'path to your local model hub'
-warm_up: 1
-num_trials: 3
-num_beams: 1 # default to greedy search
-low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
-batch_size: 1 # default to 1
-in_out_pairs:
-  - '2048-256'
-test_api:
-  - "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
-cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)
diff --git a/python/llm/test/benchmark/igpu-perf/32-32_434.yaml b/python/llm/test/benchmark/igpu-perf/32-32_434.yaml
deleted file mode 100644
index 6b5c4229b54..00000000000
--- a/python/llm/test/benchmark/igpu-perf/32-32_434.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-repo_id:
-  - 'mistralai/Mistral-7B-Instruct-v0.1'
-local_model_hub: 'path to your local model hub'
-warm_up: 3
-num_trials: 5
-num_beams: 1 # default to greedy search
-low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
-batch_size: 1 # default to 1
-in_out_pairs:
-  - '32-32'
-test_api:
-  - "transformer_int4_gpu_win" # on Intel GPU for Windows (catch GPU peak memory)
-cpu_embedding: True # whether put embedding to CPU (only avaiable now for gpu win related test_api)

From dcd8115c33adb6beb498648d3fa5fc5f9476f266 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Thu, 23 May 2024 16:14:47 -0700
Subject: [PATCH 56/57] revert

---
 .github/workflows/llm_performance_tests.yml   | 200 +++++++++---------
 .../llm_tests_for_stable_version_on_spr.yml   |   6 +-
 .../inference/test_transformesr_api_434.py    |  80 -------
 3 files changed, 103 insertions(+), 183 deletions(-)
 delete mode 100644 python/llm/test/inference/test_transformesr_api_434.py

diff --git a/.github/workflows/llm_performance_tests.yml b/.github/workflows/llm_performance_tests.yml
index ef600ca4ece..f7984b800ec 100644
--- a/.github/workflows/llm_performance_tests.yml
+++ b/.github/workflows/llm_performance_tests.yml
@@ -13,23 +13,23 @@ on:
   schedule:
     - cron: "30 16 * * *" # GMT time, 16:30 GMT == 00:30 China
   # please uncomment it for PR tests
-  pull_request:
-    branches: [main]
-    paths:
-      - ".github/workflows/llm_performance_tests.yml"
-      - "python/llm/test/benchmark/**"
-      - "python/llm/dev/benchmark/all-in-one/**"
+  # pull_request:
+  #  branches: [main]
+  #  paths:
+  #    - ".github/workflows/llm_performance_tests.yml"
+  #    - "python/llm/test/benchmark/**"
+  #    - "python/llm/dev/benchmark/all-in-one/**"
   workflow_dispatch:
   workflow_call:
 
 # A workflow run is made up of one or more jobs that can run sequentially or in parallel
 jobs:
-  llm-cpp-build: # please uncomment it for PR tests
-    uses: ./.github/workflows/llm-binary-build.yml
+  # llm-cpp-build: # please uncomment it for PR tests
+  #   uses: ./.github/workflows/llm-binary-build.yml
 
   llm-performance-test-on-arc:
-    #if: ${{ github.event.schedule || github.event_name == 'workflow_dispatch' || github.event.inputs.artifact == 'llm-performance-test-on-arc' || github.event.inputs.artifact == 'all' }} # please comment it for PR tests
-    needs: llm-cpp-build # please uncomment it for PR tests
+    if: ${{ github.event.schedule || github.event_name == 'workflow_dispatch' || github.event.inputs.artifact == 'llm-performance-test-on-arc' || github.event.inputs.artifact == 'all' }} # please comment it for PR tests
+    # needs: llm-cpp-build # please uncomment it for PR tests
     strategy:
       fail-fast: false
       matrix:
@@ -63,23 +63,23 @@ jobs:
           python -m pip install --upgrade tiktoken
 
       # please uncomment it and comment the "Install IPEX-LLM from Pypi" part for PR tests
-      - name: Download llm binary
-        uses: ./.github/actions/llm/download-llm-binary
+      # - name: Download llm binary
+      #   uses: ./.github/actions/llm/download-llm-binary
 
-      - name: Run LLM install (all) test
-        uses: ./.github/actions/llm/setup-llm-env
-        with:
-          extra-dependency: "xpu_2.1"
-
-      #- name: Install IPEX-LLM from Pypi
-      #  shell: bash
-      #  run: |
-      #    pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
-      #    test_version_date=`date -d 'yesterday' '+%Y%m%d'`
-      #    if ! pip show ipex-llm | grep $test_version_date; then
-      #      echo "Did not install ipex-llm with excepted version $test_version_date"
-      #      exit 1
-      #    fi
+      # - name: Run LLM install (all) test
+      #   uses: ./.github/actions/llm/setup-llm-env
+      #   with:
+      #    extra-dependency: "xpu_2.1"
+
+      - name: Install IPEX-LLM from Pypi
+        shell: bash
+        run: |
+          pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
+          test_version_date=`date -d 'yesterday' '+%Y%m%d'`
+          if ! pip show ipex-llm | grep $test_version_date; then
+            echo "Did not install ipex-llm with excepted version $test_version_date"
+            exit 1
+          fi
 
       - name: Test installed xpu version
         shell: bash
@@ -143,8 +143,8 @@ jobs:
           fi
           
   llm-performance-test-on-spr:
-    #if: ${{ github.event.schedule || github.event_name == 'workflow_dispatch' || github.event.inputs.artifact == 'llm-performance-test-on-spr' || github.event.inputs.artifact == 'all' }} # please comment it for PR tests
-    needs: llm-cpp-build # please uncomment it for PR tests
+    if: ${{ github.event.schedule || github.event_name == 'workflow_dispatch' || github.event.inputs.artifact == 'llm-performance-test-on-spr' || github.event.inputs.artifact == 'all' }} # please comment it for PR tests
+    # needs: llm-cpp-build # please uncomment it for PR tests
     strategy:
       fail-fast: false
       matrix:
@@ -174,21 +174,21 @@ jobs:
           python -m pip install --upgrade transformers_stream_generator
 
       # please uncomment it and comment the "Install IPEX-LLM from Pypi" part for PR tests
-      - name: Download llm binary
-        uses: ./.github/actions/llm/download-llm-binary
-
-      - name: Run LLM install (all) test
-        uses: ./.github/actions/llm/setup-llm-env
-
-      # - name: Install IPEX-LLM from Pypi
-      #  shell: bash
-      #  run: |
-      #    pip install --pre --upgrade ipex-llm[all] --extra-index-url https://download.pytorch.org/whl/cpu
-      #    test_version_date=`date -d 'yesterday' '+%Y%m%d'`
-      #    if ! pip show ipex-llm | grep $test_version_date; then
-      #      echo "Did not install ipex-llm with excepted version $test_version_date"
-      #      exit 1
-      #    fi
+      #- name: Download llm binary
+      #  uses: ./.github/actions/llm/download-llm-binary
+
+      #- name: Run LLM install (all) test
+      #  uses: ./.github/actions/llm/setup-llm-env
+
+      - name: Install IPEX-LLM from Pypi
+        shell: bash
+        run: |
+          pip install --pre --upgrade ipex-llm[all] --extra-index-url https://download.pytorch.org/whl/cpu
+          test_version_date=`date -d 'yesterday' '+%Y%m%d'`
+          if ! pip show ipex-llm | grep $test_version_date; then
+            echo "Did not install ipex-llm with excepted version $test_version_date"
+            exit 1
+          fi
 
       - name: Test on cpu
         shell: bash
@@ -215,8 +215,8 @@ jobs:
           done
 
   llm-performance-test-on-core:
-    #if: ${{ github.event.schedule || github.event_name == 'workflow_dispatch' || github.event.inputs.artifact == 'llm-performance-test-on-core' || github.event.inputs.artifact == 'all' }} # please comment it for PR tests
-    needs: llm-cpp-build # please uncomment it for PR tests
+    if: ${{ github.event.schedule || github.event_name == 'workflow_dispatch' || github.event.inputs.artifact == 'llm-performance-test-on-core' || github.event.inputs.artifact == 'all' }} # please comment it for PR tests
+    # needs: llm-cpp-build # please uncomment it for PR tests
     strategy:
       fail-fast: false
       matrix:
@@ -248,21 +248,21 @@ jobs:
           python -m pip install --upgrade tiktoken einops transformers_stream_generator
     
       # please uncomment it and comment the "Install IPEX-LLM from Pypi" part for PR tests
-      - name: Download llm binary
-        uses: ./.github/actions/llm/download-llm-binary
+      # - name: Download llm binary
+      #  uses: ./.github/actions/llm/download-llm-binary
 
-      - name: Run LLM install (all) test
-        uses: ./.github/actions/llm/setup-llm-env
+      #- name: Run LLM install (all) test
+      #  uses: ./.github/actions/llm/setup-llm-env
 
-      # - name: Install IPEX-LLM from Pypi
-      #  shell: bash
-      #  run: |
-      #    pip install --pre --upgrade ipex-llm[all]
-      #    test_version_date=`date -d 'yesterday' '+%Y%m%d'`
-      #    if ! pip show ipex-llm | grep $test_version_date; then
-      #      echo "Did not install ipex-llm with excepted version $test_version_date"
-      #      exit 1
-      #    fi
+      - name: Install IPEX-LLM from Pypi
+        shell: bash
+        run: |
+          pip install --pre --upgrade ipex-llm[all]
+          test_version_date=`date -d 'yesterday' '+%Y%m%d'`
+          if ! pip show ipex-llm | grep $test_version_date; then
+            echo "Did not install ipex-llm with excepted version $test_version_date"
+            exit 1
+          fi
 
 
       - name: Test on core ${{ matrix.platform }}
@@ -288,8 +288,8 @@ jobs:
           fi
 
   llm-performance-test-on-igpu:
-    #if: ${{ github.event.schedule || github.event_name == 'workflow_dispatch' || github.event.inputs.artifact == 'llm-performance-test-on-igpu' || github.event.inputs.artifact == 'all' }} # please comment it for PR tests
-    needs: llm-cpp-build # please uncomment it for PR tests
+    if: ${{ github.event.schedule || github.event_name == 'workflow_dispatch' || github.event.inputs.artifact == 'llm-performance-test-on-igpu' || github.event.inputs.artifact == 'all' }} # please comment it for PR tests
+    # needs: llm-cpp-build # please uncomment it for PR tests
     strategy:
       fail-fast: false
       matrix:
@@ -305,16 +305,44 @@ jobs:
       # TODO: Put the ipex-llm related install process for win gpu into a action function
 
       # Please uncomment it and commment the install from pypi for PR tests
-      - name: Download llm binary
-        uses: ./.github/actions/llm/download-llm-binary
+      # - name: Download llm binary
+      #   uses: ./.github/actions/llm/download-llm-binary
+
+      # - name: Prepare for install ipex-llm from source
+      #   shell: bash
+      #   run: |
+      #     sed -i 's/"bigdl-core-xe-21==" + CORE_XE_VERSION/"bigdl-core-xe-21"/g' python/llm/setup.py
+      #     sed -i 's/"bigdl-core-xe-esimd-21==" + CORE_XE_VERSION/"bigdl-core-xe-esimd-21"/g' python/llm/setup.py
 
-      - name: Prepare for install ipex-llm from source
+      # - name: Install ipex-llm and other related packages (install from source)
+      #   shell: cmd
+      #   run: |
+      #     call conda create -n igpu-perf python=${{ matrix.python-version }} libuv -y
+      #     call conda activate igpu-perf
+
+      #     pip install --upgrade pip
+      #     pip install --upgrade wheel
+      #     pip install --upgrade omegaconf pandas
+      #     pip install --upgrade tiktoken einops transformers_stream_generator
+
+      #     cd python\llm
+      #     python setup.py clean --all bdist_wheel --win
+      #     if not exist dist\ipex_llm*.whl (exit /b 1)
+      #     for %%i in (dist\ipex_llm*.whl) do set whl_name=%%i
+
+      #     pip install --pre --upgrade %whl_name%[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
+      #     if %ERRORLEVEL% neq 0 (exit /b 1)
+      #    pip list
+
+      #    call conda deactivate
+
+      - name: Determine desired ipex-llm version
         shell: bash
         run: |
-          sed -i 's/"bigdl-core-xe-21==" + CORE_XE_VERSION/"bigdl-core-xe-21"/g' python/llm/setup.py
-          sed -i 's/"bigdl-core-xe-esimd-21==" + CORE_XE_VERSION/"bigdl-core-xe-esimd-21"/g' python/llm/setup.py
+          test_version_date=`date -d 'yesterday' '+%Y%m%d'`
+          echo "TEST_VERSION_DATE=${test_version_date}" >> "$GITHUB_ENV"
 
-      - name: Install ipex-llm and other related packages (install from source)
+      - name: Install ipex-llm and other related packages (install from pypi)
         shell: cmd
         run: |
           call conda create -n igpu-perf python=${{ matrix.python-version }} libuv -y
@@ -325,44 +353,16 @@ jobs:
           pip install --upgrade omegaconf pandas
           pip install --upgrade tiktoken einops transformers_stream_generator
 
-          cd python\llm
-          python setup.py clean --all bdist_wheel --win
-          if not exist dist\ipex_llm*.whl (exit /b 1)
-          for %%i in (dist\ipex_llm*.whl) do set whl_name=%%i
-
-          pip install --pre --upgrade %whl_name%[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
-          if %ERRORLEVEL% neq 0 (exit /b 1)
+          pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
+          pip show ipex-llm | findstr %TEST_VERSION_DATE%
+          if %ERRORLEVEL% neq 0 (
+            echo "Did not install ipex-llm with excepted version %TEST_VERSION_DATE%"
+            exit /b 1
+          )
           pip list
 
           call conda deactivate
 
-      #- name: Determine desired ipex-llm version
-      #  shell: bash
-      #  run: |
-      #    test_version_date=`date -d 'yesterday' '+%Y%m%d'`
-      #    echo "TEST_VERSION_DATE=${test_version_date}" >> "$GITHUB_ENV"
-
-      #- name: Install ipex-llm and other related packages (install from pypi)
-      #  shell: cmd
-      #  run: |
-      #    call conda create -n igpu-perf python=${{ matrix.python-version }} libuv -y
-      #    call conda activate igpu-perf
-
-      #    pip install --upgrade pip
-      #    pip install --upgrade wheel
-      #    pip install --upgrade omegaconf pandas
-      #    pip install --upgrade tiktoken einops transformers_stream_generator
-
-      #    pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
-      #    pip show ipex-llm | findstr %TEST_VERSION_DATE%
-      #    if %ERRORLEVEL% neq 0 (
-      #      echo "Did not install ipex-llm with excepted version %TEST_VERSION_DATE%"
-      #      exit /b 1
-      #    )
-      #    pip list
-
-      #    call conda deactivate
-
       - name: Create env for html generation
         shell: cmd
         run: |
diff --git a/.github/workflows/llm_tests_for_stable_version_on_spr.yml b/.github/workflows/llm_tests_for_stable_version_on_spr.yml
index ef02ce07524..d852499c57b 100644
--- a/.github/workflows/llm_tests_for_stable_version_on_spr.yml
+++ b/.github/workflows/llm_tests_for_stable_version_on_spr.yml
@@ -13,9 +13,9 @@ on:
   # pull_request:
   #   branches: [main]
   #   paths:
-  #    - ".github/workflows/llm_performance_tests.yml"
-  #    - "python/llm/test/benchmark/**"
-  #    - "python/llm/dev/benchmark/all-in-one/**"
+  #     - ".github/workflows/llm_performance_tests.yml"
+  #     - "python/llm/test/benchmark/**"
+  #     - "python/llm/dev/benchmark/all-in-one/**"
   workflow_dispatch:
   workflow_call:
 
diff --git a/python/llm/test/inference/test_transformesr_api_434.py b/python/llm/test/inference/test_transformesr_api_434.py
deleted file mode 100644
index 4de49e660ae..00000000000
--- a/python/llm/test/inference/test_transformesr_api_434.py
+++ /dev/null
@@ -1,80 +0,0 @@
-#
-# Copyright 2016 The BigDL Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import os
-import pytest
-import tempfile
-import torch
-
-from ipex_llm.transformers import AutoModelForCausalLM
-from transformers import AutoTokenizer
-
-
-mistral_model_path = os.environ.get('MISTRAL_ORIGIN_PATH')
-
-prompt = "Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun"
-
-@pytest.mark.parametrize("Model, Tokenizer, model_path, prompt", [
-    (AutoModelForCausalLM, AutoTokenizer, mistral_model_path, prompt)
-])
-    
-def test_optimize_model(Model, Tokenizer, model_path, prompt):
-    tokenizer = Tokenizer.from_pretrained(model_path, trust_remote_code=True)
-    input_ids = tokenizer.encode(prompt, return_tensors="pt")
-
-    model = Model.from_pretrained(model_path,
-                                load_in_4bit=True,
-                                optimize_model=False,
-                                trust_remote_code=True)
-    logits_base_model = (model(input_ids)).logits
-
-    model = Model.from_pretrained(model_path,
-                                load_in_4bit=True,
-                                optimize_model=True,
-                                trust_remote_code=True)
-    logits_optimized_model = (model(input_ids)).logits
-    diff = abs(logits_base_model - logits_optimized_model).flatten()
-
-    assert any(diff) is False
-
-@pytest.mark.parametrize('prompt, answer', [
-    ('What is the capital of France?\n\n', 'Paris')
-    ])
-@pytest.mark.parametrize('Model, Tokenizer, model_path',[
-    (AutoModelForCausalLM, AutoTokenizer, mistral_model_path),
-    ])
-def test_load_low_bit_completion(Model, Tokenizer, model_path, prompt, answer):
-    tokenizer = Tokenizer.from_pretrained(model_path, trust_remote_code=True)
-    model = Model.from_pretrained(model_path,
-                                  load_in_4bit=True,
-                                  optimize_model=True,
-                                  trust_remote_code=True)
-    
-    with tempfile.TemporaryDirectory() as tempdir:
-        model.save_low_bit(tempdir)
-        loaded_model = Model.load_low_bit(tempdir,
-                                          optimize_model=True,
-                                          trust_remote_code=True)
-
-        with torch.inference_mode():
-            input_ids = tokenizer.encode(prompt, return_tensors="pt")
-            output = loaded_model.generate(input_ids, max_new_tokens=32)
-            output_str = tokenizer.decode(output[0], skip_special_tokens=True)
-
-            assert answer in output_str
-
-if __name__ == '__main__':
-    pytest.main([__file__])

From 936fafe1aae315b3594a49f8053b34b7c93fcfd5 Mon Sep 17 00:00:00 2001
From: jenniew <jenniewang123@gmail.com>
Date: Thu, 23 May 2024 17:10:06 -0700
Subject: [PATCH 57/57] update

---
 .github/workflows/llm_performance_tests.yml | 30 ++++++++++-----------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/.github/workflows/llm_performance_tests.yml b/.github/workflows/llm_performance_tests.yml
index f7984b800ec..73098d4dffa 100644
--- a/.github/workflows/llm_performance_tests.yml
+++ b/.github/workflows/llm_performance_tests.yml
@@ -14,11 +14,11 @@ on:
     - cron: "30 16 * * *" # GMT time, 16:30 GMT == 00:30 China
   # please uncomment it for PR tests
   # pull_request:
-  #  branches: [main]
-  #  paths:
-  #    - ".github/workflows/llm_performance_tests.yml"
-  #    - "python/llm/test/benchmark/**"
-  #    - "python/llm/dev/benchmark/all-in-one/**"
+  #   branches: [main]
+  #   paths:
+  #     - ".github/workflows/llm_performance_tests.yml"
+  #     - "python/llm/test/benchmark/**"
+  #     - "python/llm/dev/benchmark/all-in-one/**"
   workflow_dispatch:
   workflow_call:
 
@@ -69,7 +69,7 @@ jobs:
       # - name: Run LLM install (all) test
       #   uses: ./.github/actions/llm/setup-llm-env
       #   with:
-      #    extra-dependency: "xpu_2.1"
+      #     extra-dependency: "xpu_2.1"
 
       - name: Install IPEX-LLM from Pypi
         shell: bash
@@ -174,11 +174,11 @@ jobs:
           python -m pip install --upgrade transformers_stream_generator
 
       # please uncomment it and comment the "Install IPEX-LLM from Pypi" part for PR tests
-      #- name: Download llm binary
-      #  uses: ./.github/actions/llm/download-llm-binary
+      # - name: Download llm binary
+      #   uses: ./.github/actions/llm/download-llm-binary
 
-      #- name: Run LLM install (all) test
-      #  uses: ./.github/actions/llm/setup-llm-env
+      # - name: Run LLM install (all) test
+      #   uses: ./.github/actions/llm/setup-llm-env
 
       - name: Install IPEX-LLM from Pypi
         shell: bash
@@ -249,10 +249,10 @@ jobs:
     
       # please uncomment it and comment the "Install IPEX-LLM from Pypi" part for PR tests
       # - name: Download llm binary
-      #  uses: ./.github/actions/llm/download-llm-binary
+      #   uses: ./.github/actions/llm/download-llm-binary
 
-      #- name: Run LLM install (all) test
-      #  uses: ./.github/actions/llm/setup-llm-env
+      # - name: Run LLM install (all) test
+      #   uses: ./.github/actions/llm/setup-llm-env
 
       - name: Install IPEX-LLM from Pypi
         shell: bash
@@ -332,9 +332,9 @@ jobs:
 
       #     pip install --pre --upgrade %whl_name%[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
       #     if %ERRORLEVEL% neq 0 (exit /b 1)
-      #    pip list
+      #     pip list
 
-      #    call conda deactivate
+      #     call conda deactivate
 
       - name: Determine desired ipex-llm version
         shell: bash