From 37c4552081d9d5820502431703d6af71b98c93a5 Mon Sep 17 00:00:00 2001
From: Xin He
Date: Wed, 25 Dec 2024 16:55:03 +0800
Subject: [PATCH] fix woq example and update document for v1.19.0 (#2097)

Signed-off-by: xin3he
---
 .azure-pipelines/template/docker-template.yml               | 4 ++--
 README.md                                                   | 2 +-
 .../weight_only/requirements-autoround-hpu.txt              | 5 ++---
 .../quantization/weight_only/run_benchmark.sh               | 4 +++-
 .../quantization/weight_only/run_clm_no_trainer.py          | 7 ++++---
 neural_compressor/evaluation/lm_eval/models/huggingface.py  | 2 ++
 test/3x/torch/requirements.txt                              | 2 +-
 7 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/.azure-pipelines/template/docker-template.yml b/.azure-pipelines/template/docker-template.yml
index 3911b2da487..5944aa01c4c 100644
--- a/.azure-pipelines/template/docker-template.yml
+++ b/.azure-pipelines/template/docker-template.yml
@@ -74,7 +74,7 @@ steps:

   - ${{ if eq(parameters.imageSource, 'pull') }}:
       - script: |
-          docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+          docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu24.04/habanalabs/pytorch-installer-2.5.1:latest
        displayName: "Pull habana docker image"

   - script: |
@@ -95,7 +95,7 @@ steps:
       else
         docker run -dit --disable-content-trust --privileged --name=${{ parameters.containerName }} --shm-size="2g" \
           --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host \
-          -v ${BUILD_SOURCESDIRECTORY}:/neural-compressor vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+          -v ${BUILD_SOURCESDIRECTORY}:/neural-compressor vault.habana.ai/gaudi-docker/1.19.0/ubuntu24.04/habanalabs/pytorch-installer-2.5.1:latest
       fi
       echo "Show the container list after docker run ... "
       docker ps -a
diff --git a/README.md b/README.md
index 1778cd75c11..24ce80e85fb 100644
--- a/README.md
+++ b/README.md
@@ -78,7 +78,7 @@ Following example code demonstrates FP8 Quantization, it is supported by Intel G
 To try on Intel Gaudi2, docker image with Gaudi Software Stack is recommended, please refer to following script for environment setup. More details can be found in [Gaudi Guide](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#launch-docker-image-that-was-built).
 ```bash
 # Run a container with an interactive shell
-docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.19.0/ubuntu24.04/habanalabs/pytorch-installer-2.5.1:latest
 ```
 Run the example:
 ```python
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/requirements-autoround-hpu.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/requirements-autoround-hpu.txt
index 746014bae8e..ccba01422a5 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/requirements-autoround-hpu.txt
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/requirements-autoround-hpu.txt
@@ -11,6 +11,5 @@ lm_eval==0.4.3
 peft
 numba
 tbb
-# TODO: (Yi) SW-208079 replace auto-round with the released version
-auto-round-hpu @ git+https://github.com/intel/auto-round.git@hpu_only_pkg
-optimum-habana==1.14.1
\ No newline at end of file
+auto-round @ git+https://github.com/intel/auto-round.git@v0.4.2
+optimum-habana==1.14.1
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh
index bafa3ba5062..10b22200914 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh
@@ -14,6 +14,7 @@ function init_params {
   batch_size=16
   tuned_checkpoint=saved_results
   task=lambada_openai
+  incbench_cmd="incbench --num_cores_per_instance 4"
   echo ${max_eval_samples}
   for var in "$@"
   do
@@ -104,6 +105,7 @@ function run_benchmark {
     elif [ "${topology}" = "opt_125m_woq_autoround_int4_hpu" ]; then
         model_name_or_path="facebook/opt-125m"
         extra_cmd=$extra_cmd" --woq_algo AutoRound"
+        incbench_cmd="incbench --num_instances 1"
     elif [ "${topology}" = "opt_125m_woq_autotune_int4" ]; then
         model_name_or_path="facebook/opt-125m"
     fi
@@ -116,7 +118,7 @@ function run_benchmark {
             --batch_size ${batch_size} \
             ${extra_cmd} ${mode_cmd}
     elif [[ ${mode} == "performance" ]]; then
-        incbench --num_cores_per_instance 4 run_clm_no_trainer.py \
+        ${incbench_cmd} run_clm_no_trainer.py \
             --model ${model_name_or_path} \
             --batch_size ${batch_size} \
             --output_dir ${tuned_checkpoint} \
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py
index aa23c649a7b..2d29c97b586 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py
@@ -270,8 +270,9 @@ def get_user_model():
         torchscript = True
     if args.woq_algo == "AutoRound" and is_habana_framework_installed():
         print("Quantizing model with AutoRound on HPU")
-        check_torch_compile_with_hpu_backend()
-        set_envs_for_torch_compile_with_hpu_backend()
+        if args.quantize:
+            check_torch_compile_with_hpu_backend()
+            set_envs_for_torch_compile_with_hpu_backend()
     user_model = AutoModelForCausalLM.from_pretrained(
         args.model,
         trust_remote_code=args.trust_remote_code,
@@ -570,7 +571,7 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args):

     if is_hpex_available():
-        from habana_frameworks.torch.hpu import wrap_in_hpu_graph
+        from habana_frameworks.torch.hpu.graphs import wrap_in_hpu_graph
         user_model = user_model.to(torch.bfloat16)
         wrap_in_hpu_graph(user_model, max_graphs=10)
diff --git a/neural_compressor/evaluation/lm_eval/models/huggingface.py b/neural_compressor/evaluation/lm_eval/models/huggingface.py
index dd7e8466ef5..1f79f577416 100644
--- a/neural_compressor/evaluation/lm_eval/models/huggingface.py
+++ b/neural_compressor/evaluation/lm_eval/models/huggingface.py
@@ -969,6 +969,8 @@ def _model_call(self, inps, attn_mask=None, labels=None):
                 output = output.logits
             if self.pad_to_buckets and padding_length != 0:  # use buckets to pad inputs
                 output = output[:, :-padding_length, :]
+            if "hpu" in output.device.type:  # make sure return fp32 tensor for HPU, TODO: root cause
+                output = output.to(torch.float32)
         return output

     def _model_generate(self, context, max_length, stop, **generation_kwargs):
diff --git a/test/3x/torch/requirements.txt b/test/3x/torch/requirements.txt
index 5b97060f9f8..344d67ed603 100644
--- a/test/3x/torch/requirements.txt
+++ b/test/3x/torch/requirements.txt
@@ -1,5 +1,5 @@
 auto_round
-deepspeed @ git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0
+deepspeed @ git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0
 expecttest
 intel_extension_for_pytorch
 numpy