From 37c4552081d9d5820502431703d6af71b98c93a5 Mon Sep 17 00:00:00 2001
From: Xin He
Date: Wed, 25 Dec 2024 16:55:03 +0800
Subject: [PATCH] fix woq example and update document for v1.19.0 (#2097)

Signed-off-by: xin3he
---
 .azure-pipelines/template/docker-template.yml               | 4 ++--
 README.md                                                   | 2 +-
 .../weight_only/requirements-autoround-hpu.txt              | 5 ++---
 .../quantization/weight_only/run_benchmark.sh               | 4 +++-
 .../quantization/weight_only/run_clm_no_trainer.py          | 7 ++++---
 neural_compressor/evaluation/lm_eval/models/huggingface.py  | 2 ++
 test/3x/torch/requirements.txt                              | 2 +-
 7 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/.azure-pipelines/template/docker-template.yml b/.azure-pipelines/template/docker-template.yml
index 3911b2da487..5944aa01c4c 100644
--- a/.azure-pipelines/template/docker-template.yml
+++ b/.azure-pipelines/template/docker-template.yml
@@ -74,7 +74,7 @@ steps:

   - ${{ if eq(parameters.imageSource, 'pull') }}:
       - script: |
-          docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+          docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu24.04/habanalabs/pytorch-installer-2.5.1:latest
        displayName: "Pull habana docker image"

   - script: |
@@ -95,7 +95,7 @@ steps:
       else
         docker run -dit --disable-content-trust --privileged --name=${{ parameters.containerName }} --shm-size="2g" \
           --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host \
-          -v ${BUILD_SOURCESDIRECTORY}:/neural-compressor vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+          -v ${BUILD_SOURCESDIRECTORY}:/neural-compressor vault.habana.ai/gaudi-docker/1.19.0/ubuntu24.04/habanalabs/pytorch-installer-2.5.1:latest
       fi
       echo "Show the container list after docker run ... "
       docker ps -a
diff --git a/README.md b/README.md
index 1778cd75c11..24ce80e85fb 100644
--- a/README.md
+++ b/README.md
@@ -78,7 +78,7 @@ Following example code demonstrates FP8 Quantization, it is supported by Intel G
 To try on Intel Gaudi2, docker image with Gaudi Software Stack is recommended, please refer to following script for environment setup. More details can be found in [Gaudi Guide](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#launch-docker-image-that-was-built).
 ```bash
 # Run a container with an interactive shell
-docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.19.0/ubuntu24.04/habanalabs/pytorch-installer-2.5.1:latest
 ```
 Run the example:
 ```python
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/requirements-autoround-hpu.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/requirements-autoround-hpu.txt
index 746014bae8e..ccba01422a5 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/requirements-autoround-hpu.txt
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/requirements-autoround-hpu.txt
@@ -11,6 +11,5 @@ lm_eval==0.4.3
 peft
 numba
 tbb
-# TODO: (Yi) SW-208079 replace auto-round with the released version
-auto-round-hpu @ git+https://github.com/intel/auto-round.git@hpu_only_pkg
-optimum-habana==1.14.1
\ No newline at end of file
+auto-round @ git+https://github.com/intel/auto-round.git@v0.4.2
+optimum-habana==1.14.1
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh
index bafa3ba5062..10b22200914 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh
@@ -14,6 +14,7 @@ function init_params {
   batch_size=16
   tuned_checkpoint=saved_results
   task=lambada_openai
+  incbench_cmd="incbench --num_cores_per_instance 4"
   echo ${max_eval_samples}
   for var in "$@"
   do
@@ -104,6 +105,7 @@ function run_benchmark {
     elif [ "${topology}" = "opt_125m_woq_autoround_int4_hpu" ]; then
         model_name_or_path="facebook/opt-125m"
         extra_cmd=$extra_cmd" --woq_algo AutoRound"
+        incbench_cmd="incbench --num_instances 1"
     elif [ "${topology}" = "opt_125m_woq_autotune_int4" ]; then
         model_name_or_path="facebook/opt-125m"
     fi
@@ -116,7 +118,7 @@ function run_benchmark {
             --batch_size ${batch_size} \
             ${extra_cmd} ${mode_cmd}
     elif [[ ${mode} == "performance" ]]; then
-        incbench --num_cores_per_instance 4 run_clm_no_trainer.py \
+        ${incbench_cmd} run_clm_no_trainer.py \
             --model ${model_name_or_path} \
             --batch_size ${batch_size} \
             --output_dir ${tuned_checkpoint} \
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py
index aa23c649a7b..2d29c97b586 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py
@@ -270,8 +270,9 @@ def get_user_model():
         torchscript = True
     if args.woq_algo == "AutoRound" and is_habana_framework_installed():
         print("Quantizing model with AutoRound on HPU")
-        check_torch_compile_with_hpu_backend()
-        set_envs_for_torch_compile_with_hpu_backend()
+        if args.quantize:
+            check_torch_compile_with_hpu_backend()
+            set_envs_for_torch_compile_with_hpu_backend()
     user_model = AutoModelForCausalLM.from_pretrained(
         args.model,
         trust_remote_code=args.trust_remote_code,
@@ -570,7 +571,7 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args):

     if is_hpex_available():
-        from habana_frameworks.torch.hpu import wrap_in_hpu_graph
+        from habana_frameworks.torch.hpu.graphs import wrap_in_hpu_graph
         user_model = user_model.to(torch.bfloat16)
         wrap_in_hpu_graph(user_model, max_graphs=10)
diff --git a/neural_compressor/evaluation/lm_eval/models/huggingface.py b/neural_compressor/evaluation/lm_eval/models/huggingface.py
index dd7e8466ef5..1f79f577416 100644
--- a/neural_compressor/evaluation/lm_eval/models/huggingface.py
+++ b/neural_compressor/evaluation/lm_eval/models/huggingface.py
@@ -969,6 +969,8 @@ def _model_call(self, inps, attn_mask=None, labels=None):
                 output = output.logits
             if self.pad_to_buckets and padding_length != 0:  # use buckets to pad inputs
                 output = output[:, :-padding_length, :]
+            if "hpu" in output.device.type:  # make sure return fp32 tensor for HPU, TODO: root cause
+                output = output.to(torch.float32)
         return output

     def _model_generate(self, context, max_length, stop, **generation_kwargs):
diff --git a/test/3x/torch/requirements.txt b/test/3x/torch/requirements.txt
index 5b97060f9f8..344d67ed603 100644
--- a/test/3x/torch/requirements.txt
+++ b/test/3x/torch/requirements.txt
@@ -1,5 +1,5 @@
 auto_round
-deepspeed @ git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0
+deepspeed @ git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0
 expecttest
 intel_extension_for_pytorch
 numpy