[sharktank] Add perplexity CI to sharktank dashboard #466

Merged: 34 commits from update-perplexity-ci into main on Nov 22, 2024

Changes shown below are from 8 of the 34 commits.

Commits
All commits by archana-ramalingam:

8a1552c  Rename _vmfb to _iree (Nov 8, 2024)
c69d09b  Add perplexity scoreboard and description (Nov 8, 2024)
2e286c8  README updates (Nov 8, 2024)
f1b167f  Add perplexity to github.io dashboard (Nov 8, 2024)
d79236a  README updates (Nov 8, 2024)
16c100b  README updates (Nov 8, 2024)
b5d7b0e  Test github pages deployment (Nov 9, 2024)
f9171c1  Merge branch 'main' into update-perplexity-ci (Nov 9, 2024)
2de02a0  README.md updates (Nov 9, 2024)
074bc66  Pin actions-gh-pages to latest hash (Nov 11, 2024)
f95023f  Pin actions-gh-pages to latest hash (Nov 11, 2024)
6ff6535  Merge branch 'main' into update-perplexity-ci (Nov 15, 2024)
a5a1d49  Remove pre-submit debug (Nov 15, 2024)
5cb6bb4  Merge branch 'update-perplexity-ci' of https://github.com/nod-ai/SHAR… (Nov 15, 2024)
8f70b33  Update GH pages dir (Nov 16, 2024)
975e5a0  Test github pages deployment (Nov 16, 2024)
6665a2b  Merge branch 'main' into update-perplexity-ci (Nov 16, 2024)
f39d91e  Add keep_files and destination_dir (Nov 18, 2024)
4c83fcc  Merge branch 'update-perplexity-ci' of https://github.com/nod-ai/shar… (Nov 18, 2024)
5c209bd  Merge branch 'main' into update-perplexity-ci (Nov 18, 2024)
72b56dc  Test gh-pages deployment (Nov 18, 2024)
c569310  Merge branch 'update-perplexity-ci' of https://github.com/nod-ai/shar… (Nov 18, 2024)
507bc5d  Rename numerics to perplexity (Nov 18, 2024)
750c4b1  Update README.md (Nov 19, 2024)
0666514  Test gh pages deployment (Nov 19, 2024)
5ace2ce  Merge branch 'main' into update-perplexity-ci (Nov 19, 2024)
6e83d0e  Revert debug changes (Nov 20, 2024)
ac3c0ef  Merge branch 'main' into update-perplexity-ci (Nov 20, 2024)
382da34  Merge branch 'update-perplexity-ci' of https://github.com/nod-ai/shar… (Nov 20, 2024)
6ae5e9e  Merge branch 'main' into update-perplexity-ci (Nov 21, 2024)
eaeb097  Merge branch 'main' into update-perplexity-ci (Nov 21, 2024)
bec632b  Merge branch 'main' into update-perplexity-ci (Nov 22, 2024)
e27818a  Merge branch 'main' into update-perplexity-ci (Nov 22, 2024)
325b9af  Merge branch 'main' into update-perplexity-ci (Nov 22, 2024)
27 changes: 20 additions & 7 deletions .github/workflows/ci_eval.yaml
@@ -7,6 +7,7 @@
name: CI - Perplexity

on:
+  pull_request:
  workflow_dispatch:
  schedule:
    # Weekdays nightly at 07:00 UTC = 23:00 PST / 00:00 PDT.
@@ -21,9 +22,9 @@ concurrency:
  cancel-in-progress: true

jobs:
-  test_perplexity_vmfb:
+  test_perplexity_iree:
    timeout-minutes: 1000
-    name: "IREE/vmfb"
+    name: "Perplexity-IREE"
    strategy:
      matrix:
        version: [3.11]
@@ -74,12 +75,18 @@ jobs:
            iree-base-runtime \
            "numpy<2.0"

-      - name: Run perplexity test with vmfb
-        run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_vmfb_test.py --longrun --iree-device='hip://7' --iree-hip-target=gfx942 --iree-hal-target-backends=rocm --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json
+      - name: Run perplexity test with IREE
+        run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_iree_test.py --longrun --iree-device='hip://7' --iree-hip-target=gfx942 --iree-hal-target-backends=rocm --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json --html=perplexity/perplexity_iree.html

+      - name: Deploy to GitHub Pages
+        uses: peaceiris/actions-gh-pages@v3
+        with:
+          github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
+          publish_dir: ./perplexity

  test_perplexity_torch:
    timeout-minutes: 1000
-    name: "Torch/eager mode"
+    name: "Perplexity-Torch"
    strategy:
      matrix:
        version: [3.11]
@@ -122,5 +129,11 @@ jobs:
        pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \
          -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"

-      - name: Run perplexity test in eager mode
-        run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_torch_test.py --longrun --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json
+      - name: Run perplexity test with Torch
+        run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_torch_test.py --longrun --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json --html=perplexity/perplexity_torch.html

+      - name: Deploy to GitHub Pages
+        uses: peaceiris/actions-gh-pages@v3
+        with:
+          github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
+          publish_dir: ./perplexity
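
Later commits in this PR extend this deploy step: f39d91e adds keep_files and destination_dir, and 074bc66/f95023f pin actions-gh-pages to a specific hash (not reproduced here). A sketch of what the extended step could look like, using the action's documented options; the destination_dir value is an assumption, not copied from the final diff:

```yaml
- name: Deploy to GitHub Pages
  uses: peaceiris/actions-gh-pages@v3  # later pinned to a full commit hash
  with:
    github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
    publish_dir: ./perplexity
    destination_dir: ./perplexity  # assumed target subdirectory on the gh-pages branch
    keep_files: true               # keep previously published reports alongside new ones
```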
19 changes: 17 additions & 2 deletions sharktank/sharktank/evaluate/README.md
@@ -9,16 +9,31 @@ pip install -r sharktank/requirements-tests.txt

### Perplexity

-Test perplexity for Llama3.1 8B & 405B (FP16 & FP8) models:
+The perplexity score measures a language model's ability to predict the next token in a sequence. A lower score indicates higher certainty in the model's predictions. Perplexity is an intrinsic evaluation metric of model quality, independent of any downstream task.
+
+In SHARK-Platform, we use perplexity to track code regressions and quality loss across quantized models (with FP16 as the baseline). We use 100 prompts randomly selected from the Wikitext-2 test set and report the mean perplexities shown below. Because implementations vary, these numbers are not comparable across models with different tokenizers, nor with other projects.
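
For reference, the standard definition (not spelled out in the PR itself): for a tokenized sequence of N scored tokens, perplexity is the exponentiated mean negative log-likelihood of each token given its prefix,

$$\mathrm{PPL}(x_{1:N}) = \exp\left(-\frac{1}{N}\sum_{i=1}^{N}\log p_\theta\left(x_i \mid x_{<i}\right)\right)$$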

+* Test perplexity for Llama3.1 8B (FP16) model:

```bash
pytest sharktank/tests/evaluate/perplexity_test.py --longrun
```

-Get perplexity for a new model:
+* Calculate perplexity for a new model:

```bash
python -m sharktank.evaluate.perplexity \
--gguf-file=llama3_70b_f16.gguf \
--tokenizer-config-json=tokenizer_config.json
```
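
Numerically, the score the CLI prints reduces to exponentiating a mean token-level cross-entropy. A minimal PyTorch sketch of that computation (illustrative only, not the sharktank implementation; the function name is hypothetical):

```python
import torch
import torch.nn.functional as F

def sequence_perplexity(logits: torch.Tensor, targets: torch.Tensor) -> float:
    """Perplexity of one sequence.

    logits: [seq_len, vocab_size] next-token logits; targets: [seq_len] token ids.
    """
    # cross_entropy with mean reduction is the average negative
    # log-likelihood per token; exponentiating it yields perplexity.
    nll = F.cross_entropy(logits, targets, reduction="mean")
    return torch.exp(nll).item()
```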

+### LLaMA 3.1 Perplexity Scoreboard
+
+| CPU           | GPU    |
+|:-------------:|:------:|
+| AMD EPYC 9554 | MI300X |
+
+| Models | Model size (GB) | Torch     | IREE      |
+|:-------|:----------------|:----------|:----------|
+| 8B f16 | 16.07           | 14.930181 | 14.991893 |
2 changes: 1 addition & 1 deletion sharktank/tests/evaluate/baseline_perplexity_scores.json
@@ -210,7 +210,7 @@
    ],
    "mean_perplexity": 6.060831
  },
-  "llama3_8B_f16_decomposed_vmfb": {
+  "llama3_8B_f16_decomposed_iree": {
    "perplexities": [
      6.651368,
      22.059452,
Renamed: sharktank/tests/evaluate/perplexity_vmfb_test.py → sharktank/tests/evaluate/perplexity_iree_test.py
@@ -8,7 +8,7 @@
import pytest
import json

-from sharktank.evaluate import perplexity_vmfb
+from sharktank.evaluate import perplexity_iree

longrun = pytest.mark.skipif("not config.getoption('longrun')")

@@ -32,10 +32,10 @@ def test_llama3_8B_f16_decomposed(self):

        # Llama 3.1 8B decomposed

-        model_name = "llama3_8B_f16_decomposed_vmfb"
+        model_name = "llama3_8B_f16_decomposed_iree"
        baseline_perplexity = self.baseline_perplexity[model_name]

-        current_perplexity = perplexity_vmfb.main(
+        current_perplexity = perplexity_iree.main(
            [
                f"--irpa-file={self.llama3_8b_f16_model}",
                f"--tokenizer-config-json={self.llama3_8b_tokenizer}",
@@ -67,10 +67,10 @@ def test_llama3_8B_f16(self):

        # Llama 3.1 8B non-decomposed

-        model_name = "llama3_8B_f16_vmfb"
+        model_name = "llama3_8B_f16_iree"
        baseline_perplexity = self.baseline_perplexity[model_name]

-        current_perplexity = perplexity_vmfb.main(
+        current_perplexity = perplexity_iree.main(
            [
                f"--irpa-file={self.llama3_8b_f16_model}",
                f"--tokenizer-config-json={self.llama3_8b_tokenizer}",
@@ -102,10 +102,10 @@ def test_llama3_8B_fp8_decomposed(self):

        # Llama 3.1 8B decomposed

-        model_name = "llama3_8B_fp8_decomposed_vmfb"
+        model_name = "llama3_8B_fp8_decomposed_iree"
        baseline_perplexity = self.baseline_perplexity[model_name]

-        current_perplexity = perplexity_vmfb.main(
+        current_perplexity = perplexity_iree.main(
            [
                f"--irpa-file={self.llama3_8b_fp8_model}",
                f"--tokenizer-config-json={self.llama3_8b_tokenizer}",
@@ -137,10 +137,10 @@ def test_llama3_8B_fp8(self):

        # Llama 3.1 8B non-decomposed

-        model_name = "llama3_8B_fp8_vmfb"
+        model_name = "llama3_8B_fp8_iree"
        baseline_perplexity = self.baseline_perplexity[model_name]

-        current_perplexity = perplexity_vmfb.main(
+        current_perplexity = perplexity_iree.main(
            [
                f"--irpa-file={self.llama3_8b_fp8_model}",
                f"--tokenizer-config-json={self.llama3_8b_tokenizer}",
@@ -172,10 +172,10 @@ def test_llama3_405B_f16_decomposed(self):

        # Llama 3.1 405B decomposed

-        model_name = "llama3_405B_f16_decomposed_vmfb"
+        model_name = "llama3_405B_f16_decomposed_iree"
        baseline_perplexity = self.baseline_perplexity[model_name]

-        current_perplexity = perplexity_vmfb.main(
+        current_perplexity = perplexity_iree.main(
            [
                f"--irpa-file={self.llama3_405b_f16_model}",
                f"--tokenizer-config-json={self.llama3_405b_tokenizer}",
@@ -207,10 +207,10 @@ def test_llama3_405B_f16(self):

        # Llama 3.1 405B non-decomposed

-        model_name = "llama3_405B_f16_vmfb"
+        model_name = "llama3_405B_f16_iree"
        baseline_perplexity = self.baseline_perplexity[model_name]

-        current_perplexity = perplexity_vmfb.main(
+        current_perplexity = perplexity_iree.main(
            [
                f"--irpa-file={self.llama3_405b_f16_model}",
                f"--tokenizer-config-json={self.llama3_405b_tokenizer}",
@@ -242,10 +242,10 @@ def test_llama3_405B_fp8_decomposed(self):

        # Llama 3.1 405B decomposed

-        model_name = "llama3_405B_fp8_decomposed_vmfb"
+        model_name = "llama3_405B_fp8_decomposed_iree"
        baseline_perplexity = self.baseline_perplexity[model_name]

-        current_perplexity = perplexity_vmfb.main(
+        current_perplexity = perplexity_iree.main(
            [
                f"--irpa-file={self.llama3_405b_fp8_model}",
                f"--tokenizer-config-json={self.llama3_405b_tokenizer}",
@@ -277,10 +277,10 @@ def test_llama3_405B_fp8(self):

        # Llama 3.1 405B non-decomposed

-        model_name = "llama3_405B_fp8_vmfb"
+        model_name = "llama3_405B_fp8_iree"
        baseline_perplexity = self.baseline_perplexity[model_name]

-        current_perplexity = perplexity_vmfb.main(
+        current_perplexity = perplexity_iree.main(
            [
                f"--irpa-file={self.llama3_405b_fp8_model}",
                f"--tokenizer-config-json={self.llama3_405b_tokenizer}",