From 8a1552cedd31b981555218cbe3cecc2463051cd8 Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Fri, 8 Nov 2024 20:19:09 +0000 Subject: [PATCH 01/19] Rename _vmfb to _iree --- .github/workflows/ci_eval.yaml | 12 +++---- ...{perplexity_vmfb.py => perplexity_iree.py} | 0 .../evaluate/baseline_perplexity_scores.json | 2 +- ...y_vmfb_test.py => perplexity_iree_test.py} | 34 +++++++++---------- 4 files changed, 24 insertions(+), 24 deletions(-) rename sharktank/sharktank/evaluate/{perplexity_vmfb.py => perplexity_iree.py} (100%) rename sharktank/tests/evaluate/{perplexity_vmfb_test.py => perplexity_iree_test.py} (92%) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index 4c98bf79b..484f1bed1 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -21,9 +21,9 @@ concurrency: cancel-in-progress: true jobs: - test_perplexity_vmfb: + test_perplexity_iree: timeout-minutes: 1000 - name: "IREE/vmfb" + name: "Perplexity-IREE" strategy: matrix: version: [3.11] @@ -74,12 +74,12 @@ jobs: iree-base-runtime \ "numpy<2.0" - - name: Run perplexity test with vmfb - run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_vmfb_test.py --longrun --iree-device='hip://7' --iree-hip-target=gfx942 --iree-hal-target-backends=rocm --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json + - name: Run perplexity test with IREE + run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_iree_test.py --longrun --iree-device='hip://7' --iree-hip-target=gfx942 --iree-hal-target-backends=rocm --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json test_perplexity_torch: timeout-minutes: 1000 - name: "Torch/eager mode" + name: "Perplexity-Torch" strategy: matrix: version: [3.11] @@ -122,5 +122,5 @@ jobs: pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \ -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine" - - name: Run perplexity test in eager mode + - name: Run perplexity test with Torch run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_torch_test.py --longrun --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json diff --git a/sharktank/sharktank/evaluate/perplexity_vmfb.py b/sharktank/sharktank/evaluate/perplexity_iree.py similarity index 100% rename from sharktank/sharktank/evaluate/perplexity_vmfb.py rename to sharktank/sharktank/evaluate/perplexity_iree.py diff --git a/sharktank/tests/evaluate/baseline_perplexity_scores.json b/sharktank/tests/evaluate/baseline_perplexity_scores.json index ac2cd7b83..24511b05f 100644 --- a/sharktank/tests/evaluate/baseline_perplexity_scores.json +++ b/sharktank/tests/evaluate/baseline_perplexity_scores.json @@ -210,7 +210,7 @@ ], "mean_perplexity": 6.060831 }, - "llama3_8B_f16_decomposed_vmfb": { + "llama3_8B_f16_decomposed_iree": { "perplexities": [ 6.651368, 22.059452, diff --git a/sharktank/tests/evaluate/perplexity_vmfb_test.py b/sharktank/tests/evaluate/perplexity_iree_test.py similarity index 92% rename from sharktank/tests/evaluate/perplexity_vmfb_test.py rename to sharktank/tests/evaluate/perplexity_iree_test.py index 93ffbe61c..8cf2055c9 100644 --- a/sharktank/tests/evaluate/perplexity_vmfb_test.py +++ b/sharktank/tests/evaluate/perplexity_iree_test.py @@ -8,7 +8,7 @@ import pytest import json -from sharktank.evaluate import perplexity_vmfb 
+from sharktank.evaluate import perplexity_iree longrun = pytest.mark.skipif("not config.getoption('longrun')") @@ -32,10 +32,10 @@ def test_llama3_8B_f16_decomposed(self): # Llama 3.1 8B decomposed - model_name = "llama3_8B_f16_decomposed_vmfb" + model_name = "llama3_8B_f16_decomposed_iree" baseline_perplexity = self.baseline_perplexity[model_name] - current_perplexity = perplexity_vmfb.main( + current_perplexity = perplexity_iree.main( [ f"--irpa-file={self.llama3_8b_f16_model}", f"--tokenizer-config-json={self.llama3_8b_tokenizer}", @@ -67,10 +67,10 @@ def test_llama3_8B_f16(self): # Llama 3.1 8B non-decomposed - model_name = "llama3_8B_f16_vmfb" + model_name = "llama3_8B_f16_iree" baseline_perplexity = self.baseline_perplexity[model_name] - current_perplexity = perplexity_vmfb.main( + current_perplexity = perplexity_iree.main( [ f"--irpa-file={self.llama3_8b_f16_model}", f"--tokenizer-config-json={self.llama3_8b_tokenizer}", @@ -102,10 +102,10 @@ def test_llama3_8B_fp8_decomposed(self): # Llama 3.1 8B decomposed - model_name = "llama3_8B_fp8_decomposed_vmfb" + model_name = "llama3_8B_fp8_decomposed_iree" baseline_perplexity = self.baseline_perplexity[model_name] - current_perplexity = perplexity_vmfb.main( + current_perplexity = perplexity_iree.main( [ f"--irpa-file={self.llama3_8b_fp8_model}", f"--tokenizer-config-json={self.llama3_8b_tokenizer}", @@ -137,10 +137,10 @@ def test_llama3_8B_fp8(self): # Llama 3.1 8B non-decomposed - model_name = "llama3_8B_fp8_vmfb" + model_name = "llama3_8B_fp8_iree" baseline_perplexity = self.baseline_perplexity[model_name] - current_perplexity = perplexity_vmfb.main( + current_perplexity = perplexity_iree.main( [ f"--irpa-file={self.llama3_8b_fp8_model}", f"--tokenizer-config-json={self.llama3_8b_tokenizer}", @@ -172,10 +172,10 @@ def test_llama3_405B_f16_decomposed(self): # Llama 3.1 405B decomposed - model_name = "llama3_405B_f16_decomposed_vmfb" + model_name = "llama3_405B_f16_decomposed_iree" baseline_perplexity = self.baseline_perplexity[model_name] - current_perplexity = perplexity_vmfb.main( + current_perplexity = perplexity_iree.main( [ f"--irpa-file={self.llama3_405b_f16_model}", f"--tokenizer-config-json={self.llama3_405b_tokenizer}", @@ -207,10 +207,10 @@ def test_llama3_405B_f16(self): # Llama 3.1 405B non-decomposed - model_name = "llama3_405B_f16_vmfb" + model_name = "llama3_405B_f16_iree" baseline_perplexity = self.baseline_perplexity[model_name] - current_perplexity = perplexity_vmfb.main( + current_perplexity = perplexity_iree.main( [ f"--irpa-file={self.llama3_405b_f16_model}", f"--tokenizer-config-json={self.llama3_405b_tokenizer}", @@ -242,10 +242,10 @@ def test_llama3_405B_fp8_decomposed(self): # Llama 3.1 405B decomposed - model_name = "llama3_405B_fp8_decomposed_vmfb" + model_name = "llama3_405B_fp8_decomposed_iree" baseline_perplexity = self.baseline_perplexity[model_name] - current_perplexity = perplexity_vmfb.main( + current_perplexity = perplexity_iree.main( [ f"--irpa-file={self.llama3_405b_fp8_model}", f"--tokenizer-config-json={self.llama3_405b_tokenizer}", @@ -277,10 +277,10 @@ def test_llama3_405B_fp8(self): # Llama 3.1 405B non-decomposed - model_name = "llama3_405B_fp8_vmfb" + model_name = "llama3_405B_fp8_iree" baseline_perplexity = self.baseline_perplexity[model_name] - current_perplexity = perplexity_vmfb.main( + current_perplexity = perplexity_iree.main( [ f"--irpa-file={self.llama3_405b_fp8_model}", f"--tokenizer-config-json={self.llama3_405b_tokenizer}", From c69d09b4f5006efdc56b1b45b4454e845ac419db Mon Sep 17 
00:00:00 2001
From: archana-ramalingam
Date: Fri, 8 Nov 2024 20:21:50 +0000
Subject: [PATCH 02/19] Add perplexity scoreboard and description

---
 sharktank/sharktank/evaluate/README.md | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/sharktank/sharktank/evaluate/README.md b/sharktank/sharktank/evaluate/README.md
index 784bb24fd..640d5e630 100644
--- a/sharktank/sharktank/evaluate/README.md
+++ b/sharktank/sharktank/evaluate/README.md
@@ -9,16 +9,31 @@ pip install -r sharktank/requirements-tests.txt

 ### Perplexity

+Perplexity score measures the ability of a language model to predict the next token in a sequence. A lower score indicates that a model has higher certainty in its predictions. Perplexity acts as an intrinsic evaluation metric that measures the model quality, independent of any downstream task.
+
+In SHARK-Platform, we use perplexity to track code regressions and quality loss across quantized models (with FP16 as baseline). We use 100 prompts from the Wikitext-2 test set and calculate the mean perplexities shown below. These numbers are neither comparable between models with different tokenizers nor with other projects due to varying implementations.
+
 Test perplexity for Llama3.1 8B & 405B (FP16 & FP8) models:

 ```bash
 pytest sharktank/tests/evaluate/perplexity_test.py --longrun
 ```

-Get perplexity for a new model:
+Calculate the perplexity for a new model:

 ```bash
 python -m sharktank.evaluate.perplexity \
   --gguf-file=llama3_70b_f16.gguf \
   --tokenizer-config-json=tokenizer_config.json
 ```
+
+### LLaMA 3.1 Scoreboard
+
+| CPU | GPU |
+|:---------------|:-----------|
+| AMD EPYC 9554 | MI300X |
+
+
+|Models |Model size (GB) |Torch |IREE |
+|:--------|:---------------|:----------|:----------|
+|8B f16 |16.07 |14.930181 |14.991893 |

From 2e286c8b33bd8b916b3753b29157c44b20e08a06 Mon Sep 17 00:00:00 2001
From: archana-ramalingam
Date: Fri, 8 Nov 2024 20:38:55 +0000
Subject: [PATCH 03/19] README updates

---
 sharktank/sharktank/evaluate/README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sharktank/sharktank/evaluate/README.md b/sharktank/sharktank/evaluate/README.md
index 640d5e630..b294e233c 100644
--- a/sharktank/sharktank/evaluate/README.md
+++ b/sharktank/sharktank/evaluate/README.md
@@ -13,13 +13,13 @@ Perplexity score measures the ability of a language model to predict the next to

 In SHARK-Platform, we use perplexity to track code regressions and quality loss across quantized models (with FP16 as baseline). We use 100 prompts from the Wikitext-2 test set and calculate the mean perplexities shown below. These numbers are neither comparable between models with different tokenizers nor with other projects due to varying implementations. 
-Test perplexity for Llama3.1 8B & 405B (FP16 & FP8) models:
+* Test perplexity for Llama3.1 8B (FP16) model:

 ```bash
 pytest sharktank/tests/evaluate/perplexity_test.py --longrun
 ```

-Calculate the perplexity for a new model:
+* Calculate the perplexity for a new model:

 ```bash
 python -m sharktank.evaluate.perplexity \
@@ -30,7 +30,7 @@ python -m sharktank.evaluate.perplexity \
 ### LLaMA 3.1 Scoreboard

 | CPU | GPU |
-|:---------------|:-----------|
+|:-------------: |:----------:|
 | AMD EPYC 9554 | MI300X |


From f1b167f26c63a32994af1ca6e3fece85fe753284 Mon Sep 17 00:00:00 2001
From: archana-ramalingam
Date: Fri, 8 Nov 2024 20:39:44 +0000
Subject: [PATCH 04/19] Add perplexity to github.io dashboard

---
 .github/workflows/ci_eval.yaml | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml
index 484f1bed1..66f782407 100644
--- a/.github/workflows/ci_eval.yaml
+++ b/.github/workflows/ci_eval.yaml
@@ -77,6 +77,12 @@ jobs:
     - name: Run perplexity test with IREE
       run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_iree_test.py --longrun --iree-device='hip://7' --iree-hip-target=gfx942 --iree-hal-target-backends=rocm --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json

+    - name: Deploy to GitHub Pages
+      uses: peaceiris/actions-gh-pages@v3
+      with:
+        github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
+        publish_dir: ./out
+
   test_perplexity_torch:
     timeout-minutes: 1000
     name: "Perplexity-Torch"
@@ -124,3 +130,9 @@ jobs:

     - name: Run perplexity test with Torch
       run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_torch_test.py --longrun --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json
+
+    - name: Deploy to GitHub Pages
+      uses: peaceiris/actions-gh-pages@v3
+      with:
+        github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
+        publish_dir: ./out

From d79236a249e7a6324f8b6ca5772c7a25fae1c120 Mon Sep 17 00:00:00 2001
From: archana-ramalingam
Date: Fri, 8 Nov 2024 20:42:44 +0000
Subject: [PATCH 05/19] README updates

---
 sharktank/sharktank/evaluate/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sharktank/sharktank/evaluate/README.md b/sharktank/sharktank/evaluate/README.md
index b294e233c..448aa60ca 100644
--- a/sharktank/sharktank/evaluate/README.md
+++ b/sharktank/sharktank/evaluate/README.md
@@ -11,7 +11,7 @@ pip install -r sharktank/requirements-tests.txt

 Perplexity score measures the ability of a language model to predict the next token in a sequence. A lower score indicates that a model has higher certainty in its predictions. Perplexity acts as an intrinsic evaluation metric that measures the model quality, independent of any downstream task.

-In SHARK-Platform, we use perplexity to track code regressions and quality loss across quantized models (with FP16 as baseline). We use 100 prompts from the Wikitext-2 test set and calculate the mean perplexities shown below. These numbers are neither comparable between models with different tokenizers nor with other projects due to varying implementations.
+In SHARK-Platform, we use perplexity to track code regressions and quality loss across quantized models (with FP16 as baseline). We use 100 prompts randomly selected from the Wikitext-2 test set and calculate the mean perplexities shown below. 
These numbers are neither comparable between models with different tokenizers nor with other projects due to varying implementations. * Test perplexity for Llama3.1 8B (FP16) model: @@ -19,7 +19,7 @@ In SHARK-Platform, we use perplexity to track code regressions and quality loss pytest sharktank/tests/evaluate/perplexity_test.py --longrun ``` -* Calculate the perplexity for a new model: +* Calculate perplexity for a new model: ```bash python -m sharktank.evaluate.perplexity \ From 16c100be8347acc300efcdbdb05512babf14c5ca Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Fri, 8 Nov 2024 20:44:41 +0000 Subject: [PATCH 06/19] README updates --- sharktank/sharktank/evaluate/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sharktank/sharktank/evaluate/README.md b/sharktank/sharktank/evaluate/README.md index 448aa60ca..d5895c221 100644 --- a/sharktank/sharktank/evaluate/README.md +++ b/sharktank/sharktank/evaluate/README.md @@ -27,7 +27,7 @@ python -m sharktank.evaluate.perplexity \ --tokenizer-config-json=tokenizer_config.json ``` -### LLaMA 3.1 Scoreboard +### LLaMA 3.1 Perplexity Scoreboard | CPU | GPU | |:-------------: |:----------:| From b5d7b0e64c3ca58b0a9a23a4e6834fed9f622819 Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Sat, 9 Nov 2024 03:02:22 +0000 Subject: [PATCH 07/19] Test github pages deployment --- .github/workflows/ci_eval.yaml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index 66f782407..8f08a68aa 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -7,6 +7,7 @@ name: CI - Perplexity on: + pull_request: workflow_dispatch: schedule: # Weekdays nightly at 07:00 UTC = 23:00 PST / 00:00 PDT. 
@@ -75,13 +76,13 @@ jobs: "numpy<2.0" - name: Run perplexity test with IREE - run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_iree_test.py --longrun --iree-device='hip://7' --iree-hip-target=gfx942 --iree-hal-target-backends=rocm --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json + run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_iree_test.py --longrun --iree-device='hip://7' --iree-hip-target=gfx942 --iree-hal-target-backends=rocm --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json --html=perplexity/perplexity_iree.html - name: Deploy to GitHub Pages uses: peaceiris/actions-gh-pages@v3 with: github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }} - publish_dir: ./out + publish_dir: ./perplexity test_perplexity_torch: timeout-minutes: 1000 @@ -129,10 +130,10 @@ jobs: -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine" - name: Run perplexity test with Torch - run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_torch_test.py --longrun --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json + run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_torch_test.py --longrun --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json --html=perplexity/perplexity_torch.html - name: Deploy to GitHub Pages uses: peaceiris/actions-gh-pages@v3 with: github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }} - publish_dir: ./out + publish_dir: ./perplexity From 2de02a047b791c3546531bc22f9f27c2d0548e63 Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Sat, 9 Nov 2024 03:12:54 +0000 Subject: [PATCH 08/19] README.md updates --- sharktank/sharktank/evaluate/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sharktank/sharktank/evaluate/README.md b/sharktank/sharktank/evaluate/README.md index d5895c221..d655184ab 100644 --- a/sharktank/sharktank/evaluate/README.md +++ b/sharktank/sharktank/evaluate/README.md @@ -27,12 +27,13 @@ python -m sharktank.evaluate.perplexity \ --tokenizer-config-json=tokenizer_config.json ``` -### LLaMA 3.1 Perplexity Scoreboard +### Perplexity Scoreboard | CPU | GPU | |:-------------: |:----------:| | AMD EPYC 9554 | MI300X | +#### LLaMA 3.1 |Models |Model size (GB) |Torch |IREE | |:--------|:---------------|:----------|:----------| From 074bc6674f42e73aa364a46c8847e8dd338de092 Mon Sep 17 00:00:00 2001 From: Archana Ramalingam <98564406+archana-ramalingam@users.noreply.github.com> Date: Mon, 11 Nov 2024 15:04:03 -0800 Subject: [PATCH 09/19] Pin actions-gh-pages to latest hash Co-authored-by: Marius Brehler --- .github/workflows/ci_eval.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index 8f08a68aa..5a0e7537d 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -133,7 +133,7 @@ jobs: run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_torch_test.py --longrun --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json --html=perplexity/perplexity_torch.html - name: Deploy to GitHub Pages - uses: peaceiris/actions-gh-pages@v3 + uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0 with: github_token: 
${{ secrets.SHARK_PLATFORM_GH_TOKEN }} publish_dir: ./perplexity From f95023fc97d67740ac7706899347de8e98f3fbfa Mon Sep 17 00:00:00 2001 From: Archana Ramalingam <98564406+archana-ramalingam@users.noreply.github.com> Date: Mon, 11 Nov 2024 15:04:25 -0800 Subject: [PATCH 10/19] Pin actions-gh-pages to latest hash Co-authored-by: Marius Brehler --- .github/workflows/ci_eval.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index 5a0e7537d..6a1ca40ef 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -79,7 +79,7 @@ jobs: run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_iree_test.py --longrun --iree-device='hip://7' --iree-hip-target=gfx942 --iree-hal-target-backends=rocm --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json --html=perplexity/perplexity_iree.html - name: Deploy to GitHub Pages - uses: peaceiris/actions-gh-pages@v3 + uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0 with: github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }} publish_dir: ./perplexity From a5a1d49a154f173035a923489298e2a598134230 Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Fri, 15 Nov 2024 22:30:20 +0000 Subject: [PATCH 11/19] Remove pre-submit debug --- .github/workflows/ci_eval.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index 8f08a68aa..cf18212df 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -7,7 +7,6 @@ name: CI - Perplexity on: - pull_request: workflow_dispatch: schedule: # Weekdays nightly at 07:00 UTC = 23:00 PST / 00:00 PDT. 
From 8f70b33e49ba5a05feb674f8f1d97c2a0587ae64 Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Sat, 16 Nov 2024 05:00:34 +0000 Subject: [PATCH 12/19] Update GH pages dir --- .github/workflows/ci_eval.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index 999d921c7..96ec5b078 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -74,13 +74,13 @@ jobs: iree-base-runtime - name: Run perplexity test with IREE - run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_iree_test.py --longrun --iree-device='hip://7' --iree-hip-target=gfx942 --iree-hal-target-backends=rocm --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json --html=perplexity/perplexity_iree.html + run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_iree_test.py --longrun --iree-device='hip://7' --iree-hip-target=gfx942 --iree-hal-target-backends=rocm --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json --html=index.html - name: Deploy to GitHub Pages uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0 with: github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }} - publish_dir: ./perplexity + publish_dir: ./iree_numerics test_perplexity_torch: timeout-minutes: 1000 @@ -128,10 +128,10 @@ jobs: -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine" - name: Run perplexity test with Torch - run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_torch_test.py --longrun --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json --html=perplexity/perplexity_torch.html + run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_torch_test.py --longrun --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json --html=index.html - name: Deploy to GitHub Pages uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0 with: github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }} - publish_dir: ./perplexity + publish_dir: ./torch_numerics From 975e5a06d76501fc8d77898aeaaa19eaa92a1b42 Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Sat, 16 Nov 2024 05:02:40 +0000 Subject: [PATCH 13/19] Test github pages deployment --- .github/workflows/ci_eval.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index 96ec5b078..1e670f511 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -7,6 +7,7 @@ name: CI - Perplexity on: + pull_request: workflow_dispatch: schedule: # Weekdays nightly at 07:00 UTC = 23:00 PST / 00:00 PDT. 
From f39d91e898427f1a6edc18c2474e55dc105cc09d Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Mon, 18 Nov 2024 20:44:32 +0000 Subject: [PATCH 14/19] Add keep_files and destination_dir --- .github/workflows/ci_eval.yaml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index 1e670f511..1eecd6be3 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -75,13 +75,15 @@ jobs: iree-base-runtime - name: Run perplexity test with IREE - run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_iree_test.py --longrun --iree-device='hip://7' --iree-hip-target=gfx942 --iree-hal-target-backends=rocm --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json --html=index.html + run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_iree_test.py --longrun --iree-device='hip://7' --iree-hip-target=gfx942 --iree-hal-target-backends=rocm --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json --html=out/llm/llama/numerics/iree_perplexity/index.html - name: Deploy to GitHub Pages uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0 with: github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }} - publish_dir: ./iree_numerics + publish_dir: ./out/llm/llama/numerics/iree_perplexity + destination_dir: ./llm/llama/numerics/iree_perplexity + keep_files: true test_perplexity_torch: timeout-minutes: 1000 @@ -129,10 +131,12 @@ jobs: -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine" - name: Run perplexity test with Torch - run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_torch_test.py --longrun --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json --html=index.html + run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_torch_test.py --longrun --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json --html=out/llm/llama/numerics/torch_perplexity/index.html - name: Deploy to GitHub Pages uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0 with: github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }} - publish_dir: ./torch_numerics + publish_dir: ./out/llm/llama/numerics/torch_perplexity + destination_dir: ./llm/llama/numerics/torch_perplexity + keep_files: true From 72b56dcfcd1fcd042af0a56792489f79e1c6e3e6 Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Mon, 18 Nov 2024 21:02:44 +0000 Subject: [PATCH 15/19] Test gh-pages deployment --- sharktank/sharktank/evaluate/perplexity_iree.py | 2 +- sharktank/sharktank/evaluate/perplexity_torch.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sharktank/sharktank/evaluate/perplexity_iree.py b/sharktank/sharktank/evaluate/perplexity_iree.py index 4f95ae1bd..5ad5cbbb2 100644 --- a/sharktank/sharktank/evaluate/perplexity_iree.py +++ b/sharktank/sharktank/evaluate/perplexity_iree.py @@ -183,7 +183,7 @@ def get_prompts(self): s.replace("\n", "").rstrip() for s in test_prompts if s != "" and len(s.split()) >= 20 and s.count("=") < 2 - ] + ][0:4] self.bs = len(test_prompts) diff --git a/sharktank/sharktank/evaluate/perplexity_torch.py b/sharktank/sharktank/evaluate/perplexity_torch.py index fc3aa5fca..4079c4d54 100644 --- 
a/sharktank/sharktank/evaluate/perplexity_torch.py +++ b/sharktank/sharktank/evaluate/perplexity_torch.py @@ -144,7 +144,7 @@ def get_prompts(self): s.replace("\n", "").rstrip() for s in test_prompts if s != "" and len(s.split()) >= 20 and s.count("=") < 2 - ] + ][0:4] logger.info(f" num_test_prompts: {len(test_prompts)}") From 507bc5dff0df10fbad587d4ddbea8da215e45dcb Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Mon, 18 Nov 2024 22:19:05 +0000 Subject: [PATCH 16/19] Rename numerics to perplexity --- .github/workflows/ci_eval.yaml | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index 1eecd6be3..e874d0247 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -7,7 +7,6 @@ name: CI - Perplexity on: - pull_request: workflow_dispatch: schedule: # Weekdays nightly at 07:00 UTC = 23:00 PST / 00:00 PDT. @@ -75,14 +74,14 @@ jobs: iree-base-runtime - name: Run perplexity test with IREE - run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_iree_test.py --longrun --iree-device='hip://7' --iree-hip-target=gfx942 --iree-hal-target-backends=rocm --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json --html=out/llm/llama/numerics/iree_perplexity/index.html + run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_iree_test.py --longrun --iree-device='hip://7' --iree-hip-target=gfx942 --iree-hal-target-backends=rocm --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json --html=out/llm/llama/perplexity/iree_perplexity/index.html - name: Deploy to GitHub Pages uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0 with: github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }} - publish_dir: ./out/llm/llama/numerics/iree_perplexity - destination_dir: ./llm/llama/numerics/iree_perplexity + publish_dir: ./out/llm/llama/perplexity/iree_perplexity + destination_dir: ./llm/llama/perplexity/iree_perplexity keep_files: true test_perplexity_torch: @@ -131,12 +130,12 @@ jobs: -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine" - name: Run perplexity test with Torch - run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_torch_test.py --longrun --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json --html=out/llm/llama/numerics/torch_perplexity/index.html + run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_torch_test.py --longrun --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json --html=out/llm/llama/perplexity/torch_perplexity/index.html - name: Deploy to GitHub Pages uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0 with: github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }} - publish_dir: ./out/llm/llama/numerics/torch_perplexity - destination_dir: ./llm/llama/numerics/torch_perplexity + publish_dir: ./out/llm/llama/perplexity/torch_perplexity + destination_dir: ./llm/llama/perplexity/torch_perplexity keep_files: true From 750c4b1efcf9edf6eb965b875a035bd5af6a5a2a Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Tue, 19 Nov 2024 03:20:08 +0000 Subject: [PATCH 17/19] Update README.md --- sharktank/sharktank/evaluate/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) 
diff --git a/sharktank/sharktank/evaluate/README.md b/sharktank/sharktank/evaluate/README.md index d655184ab..beb0281cd 100644 --- a/sharktank/sharktank/evaluate/README.md +++ b/sharktank/sharktank/evaluate/README.md @@ -35,6 +35,6 @@ python -m sharktank.evaluate.perplexity \ #### LLaMA 3.1 -|Models |Model size (GB) |Torch |IREE | -|:--------|:---------------|:----------|:----------| -|8B f16 |16.07 |14.930181 |14.991893 | +|Models |Model size (GB) |Torch score |IREE score | +|:----------------------|:---------------|:-------------|:-------------| +|8B FP16 TP1 decomposed |16.07 |14.930181 |14.991893 | From 06665144926084b7d878f1886289d8a7ff7bcd81 Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Tue, 19 Nov 2024 04:27:37 +0000 Subject: [PATCH 18/19] Test gh pages deployment --- .github/workflows/ci_eval.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index e874d0247..8880f0256 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -7,6 +7,7 @@ name: CI - Perplexity on: + pull_request: workflow_dispatch: schedule: # Weekdays nightly at 07:00 UTC = 23:00 PST / 00:00 PDT. From 6e83d0e57c88f92b98af7ccce85d3ff727165e0d Mon Sep 17 00:00:00 2001 From: archana-ramalingam Date: Wed, 20 Nov 2024 08:54:04 +0000 Subject: [PATCH 19/19] Revert debug changes --- .github/workflows/ci_eval.yaml | 1 - .../sharktank/evaluate/perplexity_iree.py | 20 +++++++++++++------ .../sharktank/evaluate/perplexity_torch.py | 20 +++++++++++++------ sharktank/sharktank/utils/export_artifacts.py | 18 ++++++++++++----- 4 files changed, 41 insertions(+), 18 deletions(-) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index 8880f0256..e874d0247 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -7,7 +7,6 @@ name: CI - Perplexity on: - pull_request: workflow_dispatch: schedule: # Weekdays nightly at 07:00 UTC = 23:00 PST / 00:00 PDT. 
diff --git a/sharktank/sharktank/evaluate/perplexity_iree.py b/sharktank/sharktank/evaluate/perplexity_iree.py index 5ad5cbbb2..9701bed34 100644 --- a/sharktank/sharktank/evaluate/perplexity_iree.py +++ b/sharktank/sharktank/evaluate/perplexity_iree.py @@ -9,6 +9,7 @@ import json import time import random +import re from datetime import timedelta from tqdm import tqdm @@ -83,11 +84,18 @@ def wrapper(*args, **kwargs): start = time.time() result = func(*args, **kwargs) end = time.time() - seconds = end - start - time_taken = abs(timedelta(seconds=round(seconds))) - - if seconds < 1: - time_taken = f" {seconds * 1000} ms" + total_seconds = end - start + time_taken = abs(timedelta(seconds=total_seconds)) + hours, minutes, seconds = re.split(":", str(time_taken)) + + if total_seconds < 1: + time_taken = f" {round(total_seconds * 1000, 3)} ms" + elif total_seconds < 60: + time_taken = "{:.2f} secs".format(round(float(total_seconds), 2)) + else: + time_taken = "{:02d} hrs : {:02d} mins : {:.2f} secs".format( + int(hours), int(minutes), round(float(seconds), 2) + ) func_name = func.__name__ if func_name == "get_perplexity": @@ -183,7 +191,7 @@ def get_prompts(self): s.replace("\n", "").rstrip() for s in test_prompts if s != "" and len(s.split()) >= 20 and s.count("=") < 2 - ][0:4] + ] self.bs = len(test_prompts) diff --git a/sharktank/sharktank/evaluate/perplexity_torch.py b/sharktank/sharktank/evaluate/perplexity_torch.py index 4079c4d54..da5fc104a 100644 --- a/sharktank/sharktank/evaluate/perplexity_torch.py +++ b/sharktank/sharktank/evaluate/perplexity_torch.py @@ -8,6 +8,7 @@ import logging import time import random +import re from datetime import timedelta import json import numpy as np @@ -69,11 +70,18 @@ def wrapper(*args, **kwargs): start = time.time() result = func(*args, **kwargs) end = time.time() - seconds = end - start - time_taken = abs(timedelta(seconds=round(seconds))) - - if seconds < 1: - time_taken = f" {seconds * 1000} ms" + total_seconds = end - start + time_taken = abs(timedelta(seconds=total_seconds)) + hours, minutes, seconds = re.split(":", str(time_taken)) + + if total_seconds < 1: + time_taken = f" {round(total_seconds * 1000, 3)} ms" + elif total_seconds < 60: + time_taken = "{:.2f} secs".format(round(float(total_seconds), 2)) + else: + time_taken = "{:02d} hrs : {:02d} mins : {:.2f} secs".format( + int(hours), int(minutes), round(float(seconds), 2) + ) func_name = func.__name__ if func_name == "get_perplexity": @@ -144,7 +152,7 @@ def get_prompts(self): s.replace("\n", "").rstrip() for s in test_prompts if s != "" and len(s.split()) >= 20 and s.count("=") < 2 - ][0:4] + ] logger.info(f" num_test_prompts: {len(test_prompts)}") diff --git a/sharktank/sharktank/utils/export_artifacts.py b/sharktank/sharktank/utils/export_artifacts.py index 9deade56c..b057730c7 100644 --- a/sharktank/sharktank/utils/export_artifacts.py +++ b/sharktank/sharktank/utils/export_artifacts.py @@ -9,6 +9,7 @@ import subprocess import logging import time +import re from pathlib import Path from datetime import timedelta from typing import List, Optional @@ -107,11 +108,18 @@ def wrapper(*args, **kwargs): start = time.time() result = func(*args, **kwargs) end = time.time() - seconds = end - start - time_taken = abs(timedelta(seconds=round(seconds))) - - if seconds < 1: - time_taken = f" {seconds * 1000} ms" + total_seconds = end - start + time_taken = abs(timedelta(seconds=total_seconds)) + hours, minutes, seconds = re.split(":", str(time_taken)) + + if total_seconds < 1: + time_taken = f" 
{round(total_seconds * 1000, 3)} ms" + elif total_seconds < 60: + time_taken = "{:.2f} secs".format(round(float(total_seconds), 2)) + else: + time_taken = "{:02d} hrs : {:02d} mins : {:.2f} secs".format( + int(hours), int(minutes), round(float(seconds), 2) + ) func_name = func.__name__ logger.info(f" {func_name}: {time_taken}")
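
---

PATCH 19 installs the same elapsed-time formatting in the timing decorators of `perplexity_iree.py`, `perplexity_torch.py`, and `export_artifacts.py`. The snippet below is a minimal standalone sketch of that logic, lifted into a `format_duration` helper so it can be exercised outside the decorators; the helper name and the sample durations are illustrative and not part of the patches.

```python
import re
import time
from datetime import timedelta


def format_duration(total_seconds: float) -> str:
    """Mirror the elapsed-time formatting added by PATCH 19's decorators."""
    time_taken = abs(timedelta(seconds=total_seconds))
    # str(timedelta) renders as "H:MM:SS[.ffffff]"; split it into fields.
    hours, minutes, seconds = re.split(":", str(time_taken))

    if total_seconds < 1:
        # Sub-second durations are reported in milliseconds.
        return f" {round(total_seconds * 1000, 3)} ms"
    elif total_seconds < 60:
        # Sub-minute durations are reported in seconds.
        return "{:.2f} secs".format(round(float(total_seconds), 2))
    else:
        # Longer durations reuse the hours/minutes/seconds split above.
        return "{:02d} hrs : {:02d} mins : {:.2f} secs".format(
            int(hours), int(minutes), round(float(seconds), 2)
        )


if __name__ == "__main__":
    start = time.time()
    time.sleep(0.05)
    print(format_duration(time.time() - start))  # e.g. " 50.123 ms"
    print(format_duration(42.5))                 # "42.50 secs"
    print(format_duration(3725.0))               # "01 hrs : 02 mins : 05.00 secs"
```

One caveat worth noting: past 24 hours, `str(timedelta)` gains a "N days," prefix that the three-way `re.split(":")` does not handle, so this formatting assumes each timed function completes within a day, which comfortably covers the CI jobs' 1000-minute timeout.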