From abc587c8093e9771fbbf777de775cf70045328d9 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Fri, 31 Jan 2025 09:17:46 +0100
Subject: [PATCH 1/6] remove device_map restriction

---
 optimum_benchmark/backends/pytorch/config.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/optimum_benchmark/backends/pytorch/config.py b/optimum_benchmark/backends/pytorch/config.py
index ec48f639..61f9dfc0 100644
--- a/optimum_benchmark/backends/pytorch/config.py
+++ b/optimum_benchmark/backends/pytorch/config.py
@@ -5,7 +5,6 @@
 from ...system_utils import is_rocm_system
 from ..config import BackendConfig
 
-DEVICE_MAPS = ["auto", "sequential"]
 AMP_DTYPES = ["bfloat16", "float16"]
 TORCH_DTYPES = ["bfloat16", "float16", "float32", "auto"]
 
@@ -60,9 +59,6 @@ def __post_init__(self):
                 "Please remove it from the `model_kwargs` and set it in the backend config directly."
             )
 
-        if self.device_map is not None and self.device_map not in DEVICE_MAPS:
-            raise ValueError(f"`device_map` must be one of {DEVICE_MAPS}. Got {self.device_map} instead.")
-
         if self.torch_dtype is not None and self.torch_dtype not in TORCH_DTYPES:
             raise ValueError(f"`torch_dtype` must be one of {TORCH_DTYPES}. Got {self.torch_dtype} instead.")
 

From 1c5e33e0711e4b0035629cef9dc4ab6396917fd2 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Fri, 31 Jan 2025 09:18:43 +0100
Subject: [PATCH 2/6] use auto quantization dispatcher

---
 optimum_benchmark/backends/pytorch/backend.py | 56 ++++---------
 1 file changed, 12 insertions(+), 44 deletions(-)

diff --git a/optimum_benchmark/backends/pytorch/backend.py b/optimum_benchmark/backends/pytorch/backend.py
index ea8aa0a1..5b245e81 100644
--- a/optimum_benchmark/backends/pytorch/backend.py
+++ b/optimum_benchmark/backends/pytorch/backend.py
@@ -8,15 +8,12 @@
 from datasets import Dataset
 from safetensors.torch import save_file
 from transformers import (
-    AwqConfig,
-    BitsAndBytesConfig,
-    GPTQConfig,
-    TorchAoConfig,
     Trainer,
     TrainerCallback,
     TrainerState,
     TrainingArguments,
 )
+from transformers.quantizers import AutoQuantizationConfig
 
 from ...import_utils import is_deepspeed_available, is_torch_distributed_available, is_zentorch_available
 from ..base import Backend
@@ -286,8 +283,6 @@ def create_no_weights_model(self) -> None:
 
     def process_quantization_config(self) -> None:
         if self.is_gptq_quantized:
-            self.logger.info("\t+ Processing GPTQ config")
-
             try:
                 import exllamav2_kernels  # noqa: F401
             except ImportError:
@@ -299,12 +294,7 @@ def process_quantization_config(self) -> None:
                     "`optimum-benchmark` repository at `https://github.com/huggingface/optimum-benchmark`."
                 )
 
-            self.quantization_config = GPTQConfig(
-                **dict(getattr(self.pretrained_config, "quantization_config", {}), **self.config.quantization_config)
-            )
         elif self.is_awq_quantized:
-            self.logger.info("\t+ Processing AWQ config")
-
             try:
                 import exlv2_ext  # noqa: F401
             except ImportError:
@@ -316,21 +306,10 @@ def process_quantization_config(self) -> None:
                     "`optimum-benchmark` repository at `https://github.com/huggingface/optimum-benchmark`."
                 )
 
-            self.quantization_config = AwqConfig(
-                **dict(getattr(self.pretrained_config, "quantization_config", {}), **self.config.quantization_config)
-            )
-        elif self.is_bnb_quantized:
-            self.logger.info("\t+ Processing BitsAndBytes config")
-            self.quantization_config = BitsAndBytesConfig(
-                **dict(getattr(self.pretrained_config, "quantization_config", {}), **self.config.quantization_config)
-            )
-        elif self.is_torchao_quantized:
-            self.logger.info("\t+ Processing TorchAO config")
-            self.quantization_config = TorchAoConfig(
-                **dict(getattr(self.pretrained_config, "quantization_config", {}), **self.config.quantization_config)
-            )
-        else:
-            raise ValueError(f"Quantization scheme {self.config.quantization_scheme} not recognized")
+        self.logger.info("\t+ Processing AutoQuantization config")
+        self.quantization_config = AutoQuantizationConfig.from_dict(
+            getattr(self.pretrained_config, "quantization_config", {}).update(self.config.quantization_config)
+        )
 
     @property
     def is_quantized(self) -> bool:
@@ -339,13 +318,6 @@ def is_quantized(self) -> bool:
             and self.pretrained_config.quantization_config.get("quant_method", None) is not None
         )
 
-    @property
-    def is_bnb_quantized(self) -> bool:
-        return self.config.quantization_scheme == "bnb" or (
-            hasattr(self.pretrained_config, "quantization_config")
-            and self.pretrained_config.quantization_config.get("quant_method", None) == "bnb"
-        )
-
     @property
     def is_gptq_quantized(self) -> bool:
         return self.config.quantization_scheme == "gptq" or (
@@ -360,13 +332,6 @@ def is_awq_quantized(self) -> bool:
             and self.pretrained_config.quantization_config.get("quant_method", None) == "awq"
         )
 
-    @property
-    def is_torchao_quantized(self) -> bool:
-        return self.config.quantization_scheme == "torchao" or (
-            hasattr(self.pretrained_config, "quantization_config")
-            and self.pretrained_config.quantization_config.get("quant_method", None) == "torchao"
-        )
-
     @property
     def is_exllamav2(self) -> bool:
         return (
@@ -390,7 +355,10 @@ def automodel_kwargs(self) -> Dict[str, Any]:
         kwargs = {}
 
         if self.config.torch_dtype is not None:
-            kwargs["torch_dtype"] = getattr(torch, self.config.torch_dtype)
+            if hasattr(torch, self.config.torch_dtype):
+                kwargs["torch_dtype"] = getattr(torch, self.config.torch_dtype)
+            else:
+                kwargs["torch_dtype"] = self.config.torch_dtype
 
         if self.is_quantized:
             kwargs["quantization_config"] = self.quantization_config
@@ -436,9 +404,9 @@ def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict
 
     @torch.inference_mode()
     def prefill(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict:
-        assert kwargs.get("max_new_tokens") == kwargs.get("min_new_tokens") == 1, (
-            "For prefilling, max_new_tokens and min_new_tokens must be equal to 1"
-        )
+        assert (
+            kwargs.get("max_new_tokens") == kwargs.get("min_new_tokens") == 1
+        ), "For prefilling, max_new_tokens and min_new_tokens must be equal to 1"
         return self.pretrained_model.generate(**inputs, **kwargs)
 
     @torch.inference_mode()

From 66a7bd170bd8d8b590d48d4f8eca473e2a896f90 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Fri, 31 Jan 2025 09:22:14 +0100
Subject: [PATCH 3/6] style

---
 optimum_benchmark/backends/pytorch/backend.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/optimum_benchmark/backends/pytorch/backend.py b/optimum_benchmark/backends/pytorch/backend.py
index 5b245e81..51070d8b 100644
--- a/optimum_benchmark/backends/pytorch/backend.py
+++ b/optimum_benchmark/backends/pytorch/backend.py
@@ -404,9 +404,9 @@ def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict
 
     @torch.inference_mode()
     def prefill(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict:
-        assert (
-            kwargs.get("max_new_tokens") == kwargs.get("min_new_tokens") == 1
-        ), "For prefilling, max_new_tokens and min_new_tokens must be equal to 1"
+        assert kwargs.get("max_new_tokens") == kwargs.get("min_new_tokens") == 1, (
+            "For prefilling, max_new_tokens and min_new_tokens must be equal to 1"
+        )
         return self.pretrained_model.generate(**inputs, **kwargs)
 
     @torch.inference_mode()

From 660e70c8c376fcc8b2d2a0ea6e4357002a60a19e Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Fri, 31 Jan 2025 11:42:18 +0100
Subject: [PATCH 4/6] fix

---
 optimum_benchmark/backends/pytorch/backend.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/optimum_benchmark/backends/pytorch/backend.py b/optimum_benchmark/backends/pytorch/backend.py
index 51070d8b..07e45980 100644
--- a/optimum_benchmark/backends/pytorch/backend.py
+++ b/optimum_benchmark/backends/pytorch/backend.py
@@ -308,7 +308,7 @@ def process_quantization_config(self) -> None:
 
         self.logger.info("\t+ Processing AutoQuantization config")
         self.quantization_config = AutoQuantizationConfig.from_dict(
-            getattr(self.pretrained_config, "quantization_config", {}).update(self.config.quantization_config)
+            (getattr(self.pretrained_config, "quantization_config") or {}).update(self.config.quantization_config)
         )
 
     @property
@@ -404,9 +404,9 @@ def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict
 
     @torch.inference_mode()
     def prefill(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict:
-        assert kwargs.get("max_new_tokens") == kwargs.get("min_new_tokens") == 1, (
-            "For prefilling, max_new_tokens and min_new_tokens must be equal to 1"
-        )
+        assert (
+            kwargs.get("max_new_tokens") == kwargs.get("min_new_tokens") == 1
+        ), "For prefilling, max_new_tokens and min_new_tokens must be equal to 1"
         return self.pretrained_model.generate(**inputs, **kwargs)
 
     @torch.inference_mode()

From 27d4efb59618d715a1e443f129ce75e2ea3d6a49 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Fri, 31 Jan 2025 11:55:10 +0100
Subject: [PATCH 5/6] fix

---
 optimum_benchmark/backends/pytorch/backend.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/optimum_benchmark/backends/pytorch/backend.py b/optimum_benchmark/backends/pytorch/backend.py
index 07e45980..bb7212f7 100644
--- a/optimum_benchmark/backends/pytorch/backend.py
+++ b/optimum_benchmark/backends/pytorch/backend.py
@@ -308,28 +308,28 @@ def process_quantization_config(self) -> None:
 
         self.logger.info("\t+ Processing AutoQuantization config")
         self.quantization_config = AutoQuantizationConfig.from_dict(
-            (getattr(self.pretrained_config, "quantization_config") or {}).update(self.config.quantization_config)
+            dict(**getattr(self.pretrained_config, "quantization_config", {}), **self.config.quantization_config)
        )
 
     @property
     def is_quantized(self) -> bool:
         return self.config.quantization_scheme is not None or (
             hasattr(self.pretrained_config, "quantization_config")
-            and self.pretrained_config.quantization_config.get("quant_method", None) is not None
+            and self.pretrained_config.quantization_config.get("quant_method") is not None
         )
 
     @property
     def is_gptq_quantized(self) -> bool:
         return self.config.quantization_scheme == "gptq" or (
             hasattr(self.pretrained_config, "quantization_config")
-            and self.pretrained_config.quantization_config.get("quant_method", None) == "gptq"
+            and self.pretrained_config.quantization_config.get("quant_method") == "gptq"
         )
 
     @property
     def is_awq_quantized(self) -> bool:
         return self.config.quantization_scheme == "awq" or (
             hasattr(self.pretrained_config, "quantization_config")
-            and self.pretrained_config.quantization_config.get("quant_method", None) == "awq"
+            and self.pretrained_config.quantization_config.get("quant_method") == "awq"
         )
 
     @property
@@ -341,11 +341,11 @@ def is_exllamav2(self) -> bool:
                 (
                     hasattr(self.pretrained_config, "quantization_config")
                     and hasattr(self.pretrained_config.quantization_config, "exllama_config")
-                    and self.pretrained_config.quantization_config.exllama_config.get("version", None) == 2
+                    and self.pretrained_config.quantization_config.exllama_config.get("version") == 2
                 )
                 or (
                     "exllama_config" in self.config.quantization_config
-                    and self.config.quantization_config["exllama_config"].get("version", None) == 2
+                    and self.config.quantization_config["exllama_config"].get("version") == 2
                 )
             )
         )

From eaf25fdfa07a826502b3cc5f8e4302f81287f580 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Fri, 31 Jan 2025 12:20:05 +0100
Subject: [PATCH 6/6] fix

---
 optimum_benchmark/backends/pytorch/backend.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/optimum_benchmark/backends/pytorch/backend.py b/optimum_benchmark/backends/pytorch/backend.py
index bb7212f7..dd11ddfd 100644
--- a/optimum_benchmark/backends/pytorch/backend.py
+++ b/optimum_benchmark/backends/pytorch/backend.py
@@ -308,7 +308,7 @@ def process_quantization_config(self) -> None:
 
         self.logger.info("\t+ Processing AutoQuantization config")
         self.quantization_config = AutoQuantizationConfig.from_dict(
-            dict(**getattr(self.pretrained_config, "quantization_config", {}), **self.config.quantization_config)
+            dict(getattr(self.pretrained_config, "quantization_config", {}), **self.config.quantization_config)
         )
 
     @property
@@ -404,9 +404,9 @@ def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict
 
     @torch.inference_mode()
     def prefill(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict:
-        assert (
-            kwargs.get("max_new_tokens") == kwargs.get("min_new_tokens") == 1
-        ), "For prefilling, max_new_tokens and min_new_tokens must be equal to 1"
+        assert kwargs.get("max_new_tokens") == kwargs.get("min_new_tokens") == 1, (
+            "For prefilling, max_new_tokens and min_new_tokens must be equal to 1"
+        )
        return self.pretrained_model.generate(**inputs, **kwargs)
 
     @torch.inference_mode()
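
A note on the quantization-config merge that patches 4-6 iterate on. The
sketch below is plain Python for illustration only; the dict keys and values
are made-up examples (only "quant_method" appears in the patches). It shows
why each intermediate form misbehaved and why the final dict(a, **b) form is
the correct merge:

    # Hypothetical stand-ins for the checkpoint's quantization_config and the
    # backend-level quantization_config overrides.
    base = {"quant_method": "gptq", "bits": 4}
    overrides = {"bits": 8}

    # Patch 2/4 shape: dict.update() mutates in place and returns None, so
    # AutoQuantizationConfig.from_dict(...) would have been called with None.
    assert (base or {}).update(overrides) is None

    # Patch 5 shape: dict(**a, **b) raises TypeError whenever the two mappings
    # share a key, because the duplicate key is passed twice as a keyword.
    try:
        dict(**{"quant_method": "gptq", "bits": 4}, **overrides)
    except TypeError:
        pass  # dict() got multiple values for keyword argument 'bits'

    # Patch 6 shape: dict(a, **b) builds a fresh dict in which the keyword
    # arguments (the backend overrides) win on key collisions.
    merged = dict({"quant_method": "gptq", "bits": 4}, **overrides)
    assert merged == {"quant_method": "gptq", "bits": 8}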